In [1]:
pip install wordcloud matplotlib numpy pandas nltk plotly seaborn   

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install wordcloud

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk, re
from nltk.corpus import stopwords
from collections import Counter
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Load the MedQuad medical question-answer dataset by its Hugging Face repository id.
ds = load_dataset("keivalya/MedQuad-MedicalQnADataset")

In [5]:
# Display the DatasetDict summary: available splits, feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 16407
    })
})

In [6]:
# Pull out the "train" split and materialise it as a pandas DataFrame for EDA
train_data = ds["train"]
df = train_data.to_pandas()

# Preview the first rows, then report the column index and the row count
print(df.head())
print(f"\nColumns: {df.columns}")
print(f"\nNumber of samples: {df.shape[0]}")


             qtype                                           Question  \
0   susceptibility  Who is at risk for Lymphocytic Choriomeningiti...   
1         symptoms  What are the symptoms of Lymphocytic Choriomen...   
2   susceptibility  Who is at risk for Lymphocytic Choriomeningiti...   
3  exams and tests  How to diagnose Lymphocytic Choriomeningitis (...   
4        treatment  What are the treatments for Lymphocytic Chorio...   

                                              Answer  
0  LCMV infections can occur after exposure to fr...  
1  LCMV is most commonly recognized as causing ne...  
2  Individuals of all ages who come into contact ...  
3  During the first phase of the disease, the mos...  
4  Aseptic meningitis, encephalitis, or meningoen...  

Columns: Index(['qtype', 'Question', 'Answer'], dtype='object')

Number of samples: 16407


In [7]:
print("Number of samples:", len(df))
print("Columns:", list(df.columns))

# Show the rows labelled 0, 1 and 2; each answer is cut to its first 200 characters
for i in range(3):
    print(f"\nQTYPE: {df.loc[i, 'qtype']}")
    print(f"QUESTION: {df.loc[i, 'Question']}")
    print(f"ANSWER: {df.loc[i, 'Answer'][:200]}...")


Number of samples: 16407
Columns: ['qtype', 'Question', 'Answer']

QTYPE: susceptibility
QUESTION: Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?
ANSWER: LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into b...

QTYPE: symptoms
QUESTION: What are the symptoms of Lymphocytic Choriomeningitis (LCM) ?
ANSWER: LCMV is most commonly recognized as causing neurological disease, as its name implies, though infection without symptoms or mild febrile illnesses are more common clinical manifestations. 
           ...

QTYPE: susceptibility
QUESTION: Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?
ANSWER: Individuals of all ages who come into contact with urine, feces, saliva, or blood of wild mice are potentially at risk for infection. Owners of pet mice or hamsters may be at risk for infection if the...


In [8]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Word counts per row for both text columns
df["q_len"] = df["Question"].apply(_word_count)
df["a_len"] = df["Answer"].apply(_word_count)

avg_q_len = df["q_len"].mean()
avg_a_len = df["a_len"].mean()
print("Avg question length:", avg_q_len)
print("Avg answer length:", avg_a_len)


Avg question length: 8.212165539099164
Avg answer length: 201.35436094349973


# Using Plotly for Interactive Visualizations

In [9]:
import plotly.express as px
import plotly.graph_objects as go


In [10]:
pip install --upgrade nbformat

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Horizontal bar chart of how many rows each question type has.
fig = px.bar(
    # Name the index and the count column explicitly so the frame always has
    # the columns ("qtype", "count"). A bare value_counts().reset_index() only
    # produces a "count" column on pandas >= 2.0; older versions emit
    # ("index", "qtype") and the x="count" lookup below would fail.
    df["qtype"].value_counts().rename_axis("qtype").reset_index(name="count"),
    x="count",
    y="qtype",
    orientation="h",
    color="qtype",
    title="Distribution of Question Types",
    labels={"qtype": "Question Type", "count": "Count"},
)
# The y axis already names every bar, so the colour legend is redundant.
fig.update_layout(showlegend=False)
fig.show()


In [12]:
# Histogram of question word counts with a box plot drawn in the margin
fig = px.histogram(
    data_frame=df,
    x="q_len",
    marginal="box",
    nbins=50,
    color_discrete_sequence=["teal"],
    title="Question Length Distribution (words)",
)
fig.show()


In [13]:
# Histogram of answer word counts with a box plot drawn in the margin
fig = px.histogram(
    data_frame=df,
    x="a_len",
    marginal="box",
    nbins=50,
    color_discrete_sequence=["orange"],
    title="Answer Length Distribution (words)",
)
fig.show()


In [14]:
# Reshape to long form: the two length columns become a label column
# ("Text Type") and a value column ("Word Count")
melted = pd.melt(
    df,
    value_vars=["q_len", "a_len"],
    var_name="Text Type",
    value_name="Word Count",
)

# Side-by-side box plots, one per text type
fig = px.box(
    data_frame=melted,
    y="Word Count",
    x="Text Type",
    color="Text Type",
    title="Distribution of Question vs Answer Lengths",
)
fig.show()


In [15]:
# Download the NLTK stopword corpus and keep the English list as a set for fast membership tests
nltk.download('stopwords')
stop_words = {*stopwords.words("english")}

def clean_text(text):
    """Lowercase ``text`` and delete every character that is not an ASCII letter, digit or whitespace.

    Punctuation is removed outright rather than replaced by a space, so tokens
    joined by punctuation are merged (``"x-ray"`` becomes ``"xray"``).
    Whitespace is left exactly as it was.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (pandas' missing-value marker) yield
        an empty string; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The lowercased, punctuation-free text.
    """
    # NOTE(review): a later cell redefines clean_text with different rules
    # (hyphens kept, punctuation replaced by spaces); re-running cells out of
    # order changes which version the word-frequency cells use.
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"[^a-zA-Z0-9\s]", "", str(text).lower())

# Tokenise the cleaned questions, discard stopwords and keep the 20 most frequent words
all_q_words = " ".join(map(clean_text, df["Question"])).split()
filtered_q_words = [word for word in all_q_words if word not in stop_words]
top_q = Counter(filtered_q_words).most_common(20)

q_df = pd.DataFrame.from_records(top_q, columns=["Word", "Frequency"])

# Bars are coloured by their own height on the viridis scale
fig = px.bar(
    data_frame=q_df,
    x="Word",
    y="Frequency",
    color="Frequency",
    color_continuous_scale="viridis",
    title="Top 20 Words in Questions",
)
fig.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
# Rebuild the stopword-free token list for the questions
all_q_words = " ".join(map(clean_text, df["Question"])).split()
filtered_q_words = [word for word in all_q_words if word not in stop_words]

# Frequency table over every distinct word, most frequent first
q_counter = Counter(filtered_q_words)
q_df = pd.DataFrame(
    list(q_counter.items()), columns=["Word", "Frequency"]
).sort_values(by="Frequency", ascending=False)


In [17]:

# Limit the plot to the first 100 rows of the frequency table
top_q_df = q_df.head(100)

# Seeded pseudo-random coordinates (x is drawn before y) plus a size column
# equal to each word's frequency relative to the largest one, times 50
np.random.seed(42)
top_q_df = top_q_df.assign(
    x=np.random.rand(len(top_q_df)),
    y=np.random.rand(len(top_q_df)),
    size=top_q_df["Frequency"] / top_q_df["Frequency"].max() * 50,
)

fig = px.scatter(
    data_frame=top_q_df,
    x="x",
    y="y",
    size="size",
    text="Word",
    color="Frequency",
    color_continuous_scale="rainbow",
    title="Interactive Word Cloud (Questions)",
)

# Put each label above its semi-transparent, outlined marker and hide both axes
fig.update_traces(
    textposition="top center",
    marker={"opacity": 0.7, "line": {"width": 1, "color": "DarkSlateGrey"}},
)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()


In [18]:
# --- ANSWERS WORD CLOUD ---

# Tokenise the cleaned answers and discard stopwords
all_a_words = " ".join(map(clean_text, df["Answer"])).split()
filtered_a_words = [word for word in all_a_words if word not in stop_words]

# Frequency table over every distinct word, most frequent first
a_counter = Counter(filtered_a_words)
a_df = pd.DataFrame(
    list(a_counter.items()), columns=["Word", "Frequency"]
).sort_values(by="Frequency", ascending=False)

# Limit the plot to the first 100 rows of the frequency table
top_a_df = a_df.head(100)

# Seeded pseudo-random coordinates (x is drawn before y) plus a size column
# equal to each word's frequency relative to the largest one, times 50
np.random.seed(99)
top_a_df = top_a_df.assign(
    x=np.random.rand(len(top_a_df)),
    y=np.random.rand(len(top_a_df)),
    size=top_a_df["Frequency"] / top_a_df["Frequency"].max() * 50,
)

fig = px.scatter(
    data_frame=top_a_df,
    x="x",
    y="y",
    size="size",
    text="Word",
    color="Frequency",
    color_continuous_scale="plasma",
    title="Interactive Word Cloud (Answers)",
)

# Put each label above its semi-transparent, outlined marker and hide both axes
fig.update_traces(
    textposition="top center",
    marker={"opacity": 0.7, "line": {"width": 1, "color": "DarkSlateGrey"}},
)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()


In [19]:
import re
from sklearn.model_selection import train_test_split

# 1. Keep only rows where both the question and the answer are present
print("Before NA removal:", df.shape)
df = df[df[["Question", "Answer"]].notna().all(axis=1)].reset_index(drop=True)
print("After NA removal:", df.shape)

# 2. Keep the first occurrence of every (Question, Answer) pair
print("Before duplicate removal:", df.shape)
df = df[~df.duplicated(subset=["Question", "Answer"])].reset_index(drop=True)
print("After duplicate removal:", df.shape)

# ✅ 3. Text cleaning function
def clean_text(text):
    """Normalise ``text`` for modelling.

    The text is lowercased, every character other than ``a-z``, ``0-9``,
    whitespace or a hyphen is replaced by a space (hyphens are kept so
    hyphenated terms survive), and runs of whitespace are collapsed to a
    single space with the ends trimmed.
    """
    lowered = text.lower()
    # Replace rather than delete, so words separated only by punctuation do not merge
    spaced = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    return re.sub(r"\s+", " ", spaced).strip()

# Store the normalised text in new columns; the raw Question/Answer columns are left as they are
df["Question_clean"] = df["Question"].map(clean_text)
df["Answer_clean"] = df["Answer"].map(clean_text)



Before NA removal: (16407, 5)
After NA removal: (16407, 5)
Before duplicate removal: (16407, 5)
After duplicate removal: (16359, 5)


In [20]:
# Set aside rows whose question type occurs fewer than two times; the remaining
# rows are split 80/20 with stratification on qtype
rare = df[df.groupby("qtype")["qtype"].transform("size") < 2]
df_rest = df.drop(index=rare.index)

train_df, val_df = train_test_split(
    df_rest,
    stratify=df_rest["qtype"],
    test_size=0.2,
    random_state=42,
)

# The set-aside rows go to the training set only
train_df = pd.concat([train_df, rare]).reset_index(drop=True)


In [21]:
# Report (rows, columns) for both partitions
print(f"Train size: {train_df.shape}")
print(f"Validation size: {val_df.shape}")


Train size: (13087, 7)
Validation size: (3272, 7)
