In [34]:
import pandas as pd
import statistics
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances

In [35]:
df = pd.read_csv("../data/mle_screening_dataset.csv")
df.head(5)

Unnamed: 0,question,answer
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...
3,Who is at risk for Glaucoma? ?,Anyone can develop glaucoma. Some people are a...
4,How to prevent Glaucoma ?,"At this time, we do not know how to prevent gl..."


In [36]:
print("question: ", df.iloc[0]['question'], "\nanswer: ", df.iloc[0]['answer'])

question:  What is (are) Glaucoma ? 
answer:  Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. The most common form of the disease is open-angle glaucoma. With early treatment, you can often protect your eyes against serious vision loss. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.)  See this graphic for a quick overview of glaucoma, including how many people it affects, whos at risk, what to do if you have it, and how to learn more.  See a glossary of glaucoma terms.


In [37]:
df.shape

(16406, 2)

In [38]:
df.isnull().sum().to_dict()

{'question': 0, 'answer': 5}

In [39]:
df.duplicated().sum()

np.int64(48)

In [40]:
df['question'].duplicated().sum()

np.int64(1425)

In [41]:
df['answer'].duplicated().sum()

np.int64(594)

In [9]:
dup_counts = df.value_counts().reset_index(name="count")
dup_counts[dup_counts["count"] > 1].head(10)

Unnamed: 0,question,answer,count
0,What causes Causes of Diabetes ?,Other types of diabetes have a variety of poss...,8
1,What to do for Causes of Diabetes ?,- Diabetes is a complex group of diseases with...,4
2,What is (are) Causes of Diabetes ?,Diabetes is a complex group of diseases with a...,4
3,What causes Causes of Diabetes ?,Type 1 diabetes is caused by a lack of insulin...,4
4,What causes Causes of Diabetes ?,Insulin Resistance and Beta Cell Dysfunction\n...,4
5,What causes Causes of Diabetes ?,Type 2 diabetesthe most common form of diabete...,4
6,How to diagnose Glomerular Diseases ?,Patients with glomerular disease have signific...,2
7,What is (are) Kidney Failure: Eat Right to Fee...,"Potassium is a mineral found in many foods, es...",2
8,What is (are) Renal Tubular Acidosis ?,Type 1: Classical Distal RTA\n ...,2
9,What is (are) Renal Tubular Acidosis ?,Renal tubular acidosis (RTA) is a disease that...,2


In [10]:
def normalize_question(q):
    """
    Produce a normalized key so near-duplicates map to the same string.
    """
    if not isinstance(q, str): return ''
    q = q.strip().lower()
    q = re.sub(r'\s+', ' ', q)
    q = re.sub(r'\s+([?.!,;:])', r'\1', q)
    q = re.sub(r'\?{2,}$', '?', q)
    return q

In [11]:
def length_stats(series):
    """
    Compute token-length distribution stats for a Pandas Series.
    """
    lengths = series.astype(str).str.split().apply(len)
    return {
        "count": int(lengths.shape[0]),
        "min": int(lengths.min() if len(lengths) else 0),
        "p10": float(lengths.quantile(0.10)) if len(lengths) else 0.0,
        "median": float(lengths.median() if len(lengths) else 0),
        "p90": float(lengths.quantile(0.90)) if len(lengths) else 0.0,
        "max": int(lengths.max() if len(lengths) else 0),
        "mean": float(lengths.mean() if len(lengths) else 0.0),
        "std": float(lengths.std() if len(lengths) else 0.0,
        ),
    }

In [12]:
def plot_hist(data, title, xlabel, out_path):
    """
    Plotting helper; Save the plots for visibility / generating reports
    """
    fig = plt.figure()
    plt.hist(data, bins=50)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

In [13]:
def top_ngrams(series, n=2, topk=25):
    """
    Word level top n-grams.
    Splits into word tokens, counts contiguous n-grams (default bigrams)
    """
    tokens = series.astype(str).str.lower().str.replace(r'[^a-z0-9\s]', ' ', regex=True).str.split()
    grams = Counter()
    for toks in tokens:
        for i in range(len(toks)-n+1):
            grams[tuple(toks[i:i+n])] += 1
    items = [{'ngram':' '.join(k), 'count':v} for k,v in grams.most_common(topk)]
    return pd.DataFrame(items)

In [14]:
def detect_near_duplicates(texts, sample_size=2000, threshold=0.10):
    """
    Flags pairs with near duplicates using cosine distance 
    in TF-IDF space (0 = identical, 2 = opposite direction, but practically 0–1).
    Using hand-wavy threshold as 0.1, but can be tuned.
    """
    idxs = np.arange(len(texts))
    if len(idxs) > sample_size:
        rng = np.random.default_rng(42)
        idxs = rng.choice(idxs, size=sample_size, replace=False)
    subset = texts.iloc[idxs].tolist()
    vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=2)
    X = vec.fit_transform(subset)
    D = cosine_distances(X)
    pairs = []
    for i in range(len(subset)):
        for j in range(i+1, len(subset)):
            d = float(D[i, j])
            if d <= threshold:
                pairs.append((subset[i], subset[j], d))
    return pd.DataFrame(pairs, columns=['q1','q2','cosine_distance']).sort_values('cosine_distance')

In [16]:
import re
df['q_norm'] = df['question'].apply(normalize_question)

In [17]:
dup_counts = df.groupby('q_norm').size().sort_values(ascending=False)
n_unique = int(dup_counts.shape[0])
n_dupe_groups = int((dup_counts>1).sum())
top_dupes = dup_counts.head(20).reset_index().rename(columns={0:'count'})

In [18]:
print(f"Total unique normalized questions: {n_unique:,}")
print(f"Groups with duplicates: {n_dupe_groups:,} "
      f"({n_dupe_groups / n_unique:.2%} of unique)")
print("\nTop duplicate questions:")
print(top_dupes.to_string(index=False))

Total unique normalized questions: 14,346
Groups with duplicates: 1,411 (9.84% of unique)

Top duplicate questions:
                                                                q_norm  count
                                       what causes causes of diabetes?     20
                                 what is (are) high blood cholesterol?     19
                           what is (are) medicare and continuing care?     14
                                                 what is (are) stroke?     13
                            what are the treatments for breast cancer?     13
                                            what is (are) skin cancer?     13
                                          what is (are) breast cancer?     13
what is (are) kidney failure: eat right to feel right on hemodialysis?     12
                                      what is (are) colorectal cancer?     12
                          what are the treatments for prostate cancer?     11
                          

In [19]:
# Get length of questions and answers for potential insights
def stats(series): return series.astype(str).str.split().apply(len).tolist()
q_lengths = stats(df['question'])
a_lengths = stats(df['answer'])

In [20]:
# Histogram plots plots
plot_hist(q_lengths, 'Question Length Distribution (tokens)', 'Question length', '../plots/question_len_hist.png')
plot_hist(a_lengths, 'Answer Length Distribution (tokens)', 'Answer length', '../plots/answer_len_hist.png')


In [21]:
median_q_lengths = statistics.median(q_lengths)
print(f"Median of q_lengths: {median_q_lengths}")

median_a_lengths = statistics.median(a_lengths)
print(f"Median of a_lengths: {median_a_lengths}")

Median of q_lengths: 8.0
Median of a_lengths: 138.0


### Exploration of question and answer lengths:
- Questions are short and concise (median 8 tokens), which simplifies preprocessing and ensures queries remain focused.
- Answers, however, vary dramatically: while most are concise, some are very long, indicating the need for preprocessing strategies such as truncation, summarization, or chunking.

This asymmetry suggests that retrieval is a suitable approach: queries are short, but the system must handle variable-length answers robustly.

In [22]:
# openings
first_two = df['q_norm'].str.split().apply(lambda x: ' '.join(x[:2]) if isinstance(x, list) and len(x)>=2 else '')
top_openings = first_two.value_counts().head(25).reset_index()
top_openings.columns = ['opening','count']

In [23]:
print("\nTop 10 question openings:")
print(top_openings.head(10))


Top 10 question openings:
         opening  count
0       what are   6394
1        what is   4605
2       how many   1117
3         how to    849
4    what causes    726
5  what research    395
6         who is    322
7         do you    295
8        what to    234
9    is familial     35


In [26]:
from collections import Counter
bigrams = top_ngrams(df['question'], 2, 30)
trigrams = top_ngrams(df['question'], 3, 30)
print("\nMost common bigrams in questions:")
print(bigrams.head())
print("\nMost common trigrams in questions:")
print(trigrams.head())


Most common bigrams in questions:
          ngram  count
0      what are   6394
1       are the   6391
2       what is   4605
3        is are   4235
4  the symptoms   2746

Most common trigrams in questions:
                ngram  count
0        what are the   6391
1         what is are   4235
2    are the symptoms   2746
3     the symptoms of   2745
4  are the treatments   2435


In [29]:
# near dupes
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
near_dupes = detect_near_duplicates(df['q_norm'], 2000, 0.1)
near_dupes.to_csv('../data/near_duplicate_questions_sampled.csv', index=False)
near_dupes.head()

Unnamed: 0,q1,q2,cosine_distance
121,what is (are) high blood pressure?,what is (are) high blood pressure?,0.0
58,what is (are) medicare and continuing care?,what is (are) medicare and continuing care?,0.0
57,what is (are) medicare and continuing care?,what is (are) medicare and continuing care?,0.0
135,is leber hereditary optic neuropathy inherited?,is leber hereditary optic neuropathy inherited?,0.0
142,what are the treatments for osteoarthritis?,what are the treatments for osteoarthritis?,0.0


In [30]:
# outlier answers
a_len_series = df['answer'].astype(str).str.split().apply(len)
short_ans = df[a_len_series <= 5]
long_ans = df[a_len_series >= max(200, int(a_len_series.quantile(0.95)))]

In [31]:
short_ans.head(3)

Unnamed: 0,question,answer,q_norm
2556,What causes Bell's palsy ?,What causes Bell's palsy?,what causes bell's palsy?
3080,"Is 48,XXYY syndrome inherited ?","Can 48,XXYY syndrome be inherited?","is 48,xxyy syndrome inherited?"
3124,Is Wolff-Parkinson-White syndrome inherited ?,Is Wolff-Parkinson-White syndrome inherited?,is wolff-parkinson-white syndrome inherited?


In [32]:
long_ans.head(2)

Unnamed: 0,question,answer,q_norm
10,What is (are) High Blood Pressure ?,High blood pressure is a common disease in whi...,what is (are) high blood pressure?
13,How to prevent High Blood Pressure ?,Steps You Can Take You can take steps to preve...,how to prevent high blood pressure?


From short answers, we observe rows where the answer is exactly or semantically similar to the question. We will have to remove such rows during pre-processing. This encourages us to compare answers for sematically similar questions

In [33]:

total_rows = len(df)
unique_answers = df['answer'].astype(str).nunique()

# number of rows that are duplicates of another answer
duplicate_answer_rows = total_rows - unique_answers

# number of unique answers that are duplicated at least once
duplicated_answer_groups = df['answer'].astype(str).duplicated(keep=False).sum()

print(f"Total rows: {total_rows:,}")
print(f"Unique answers: {unique_answers:,}")
print(f"Rows with duplicate answers: {duplicate_answer_rows:,}")
print(f"Groups of answers duplicated at least once: {duplicated_answer_groups:,}")


Total rows: 16,406
Unique answers: 15,812
Rows with duplicate answers: 594
Groups of answers duplicated at least once: 679


In [34]:
len(near_dupes)

243

Our dataset shows 594 rows with duplicate answers and at least 243 pairs of near-duplicate questions. These redundancies highlight the need for a deduplication step. We should normalize questions and merge answers for semantically similar entries, ensuring each unique medical query is represented once with a consolidated answer. This will improve training stability and retrieval quality.