In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('../Dataset/Cleaned_Data.csv')

In [3]:
df.isnull().sum()

eng                0
Subject            0
clean_question    10
dtype: int64

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(str(text))
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

df['final_text'] = df['clean_question'].apply(preprocess)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aaru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aaru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df.drop(columns=['clean_question'],inplace=True)

In [6]:
topic_map = {
    "Physics": {
    "Kinematics": ["velocity", "speed", "displacement", "motion", "acceleration", "time"],
    "Laws of Motion": ["force", "newton", "mass", "friction", "inertia", "momentum"],
    "Work Power Energy": ["work", "energy", "power", "kinetic", "potential"],
    "Gravitation": ["gravity", "gravitational", "orbit", "satellite", "planet"],
    "Oscillations": ["oscillation", "shm", "spring", "frequency", "period"],
    "Waves": ["wave", "wavelength", "amplitude", "sound", "doppler"],
    "Thermodynamics": ["heat", "temperature", "entropy", "thermodynamic", "gas"],
    "Current Electricity": ["current", "resistance", "voltage", "ohm", "circuit"],
    "Capacitance": ["capacitor", "capacitance", "dielectric"],
    "Magnetism": ["magnetic", "field", "flux", "coil", "induction"],
    "Electromagnetic Induction": ["emf", "lenz", "faraday", "transformer"],
    "Optics": ["mirror", "lens", "focal", "refraction", "reflection", "image"],
    "Modern Physics": ["photoelectric", "photon", "electron", "nuclear", "radioactive"]
},
    "Mathematics": {
    "Algebra": ["equation", "polynomial", "root", "factor"],
    "Quadratic Equations": ["quadratic", "discriminant"],
    "Sequences and Series": ["series", "progression", "ap", "gp"],
    "Trigonometry": ["sin", "cos", "tan", "angle", "identity"],
    "Inverse Trigonometry": ["inverse", "arcsin", "arccos"],
    "Limits and Continuity": ["limit", "continuous"],
    "Differentiation": ["derivative", "differentiate", "slope"],
    "Integration": ["integral", "integrate", "area under"],
    "Differential Equations": ["differential equation", "solution"],
    "Vectors": ["vector", "magnitude", "direction"],
    "3D Geometry": ["line", "plane", "distance", "coordinates"],
    "Matrices": ["matrix", "determinant", "inverse"],
    "Probability": ["probability", "random", "event", "distribution"]
},
    "Chemistry": {
    "Mole Concept": ["mole", "molar", "avogadro", "stoichiometry"],
    "Atomic Structure": ["electron", "proton", "neutron", "orbital", "shell"],
    "Periodic Table": ["periodic", "group", "period", "atomic number"],
    "Chemical Bonding": ["bond", "covalent", "ionic", "hybridization"],
    "Thermochemistry": ["enthalpy", "heat", "reaction heat"],
    "Chemical Equilibrium": ["equilibrium", "kc", "kp", "constant"],
    "Redox Reactions": ["oxidation", "reduction", "electron transfer"],
    "Acids Bases Salts": ["acid", "base", "ph", "salt"],
    "Electrochemistry": ["cell", "electrode", "potential", "battery"],
    "Chemical Kinetics": ["rate", "order", "reaction rate"],
    "Organic Basics": ["hydrocarbon", "alkane", "alkene", "alkyne"],
    "Alcohols Phenols": ["alcohol", "phenol", "ether"],
    "Aldehyde Ketone": ["aldehyde", "ketone", "carbonyl"],
    "Biomolecules": ["carbohydrate", "protein", "enzyme", "vitamin"]
},
    "Biology": {
    "Cell Biology": ["cell", "mitosis", "meiosis", "organelle", "nucleus"],
    "Biomolecules": ["enzyme", "protein", "carbohydrate", "lipid"],
    "Photosynthesis": ["photosynthesis", "chlorophyll", "photosystem"],
    "Respiration": ["respiration", "glycolysis", "krebs", "atp"],
    "Human Physiology": ["heart", "blood", "kidney", "lung", "digestion"],
    "Plant Physiology": ["transpiration", "xylem", "phloem", "stomata"],
    "Genetics": ["gene", "dna", "chromosome", "inheritance"],
    "Evolution": ["evolution", "variation", "selection"],
    "Reproduction": ["fertilization", "zygote", "gamete", "pollination"],
    "Ecology": ["ecosystem", "population", "food chain", "environment"],
    "Biotechnology": ["plasmid", "recombinant", "cloning", "pcr"]
}
}

def assign_topic(row):
    text = row['final_text']
    Subject = row['Subject']

    for topic, keywords in topic_map.get(Subject, {}).items():
        for k in keywords:
            if k in text:
                return topic
    return "Other"


In [7]:
df.head()

Unnamed: 0,eng,Subject,final_text
0,An anti-forest measure is\nA. Afforestation\nB...,Biology,antiforest measure afforestation b selective g...
1,"Among the following organic acids, the acid pr...",Chemistry,among following organic acid acid present ranc...
2,If the area of two similar triangles are equal...,Maths,area two similar triangle equal equilateral b ...
3,"In recent year, there has been a growing\nconc...",Biology,recent year growing concern gradually increasi...
4,Which of the following statement\nregarding tr...,Physics,following statement regarding transformer inco...


In [8]:
df['topic'] = df.apply(assign_topic, axis=1)
df['topic'].value_counts()


topic
Other                        61940
Kinematics                   12226
Acids Bases Salts             5868
Laws of Motion                4406
Mole Concept                  4023
Work Power Energy             4013
Atomic Structure              3141
Current Electricity           2671
Optics                        2050
Thermodynamics                1865
Chemical Kinetics             1590
Chemical Equilibrium          1557
Chemical Bonding              1542
Cell Biology                  1524
Thermochemistry               1390
Magnetism                     1346
Periodic Table                1231
Waves                         1211
Redox Reactions               1187
Human Physiology              1029
Biomolecules                   844
Electrochemistry               762
Genetics                       757
Oscillations                   665
Modern Physics                 508
Respiration                    489
Alcohols Phenols               445
Gravitation                    390
Ecology       

In [17]:
df['topic'] = df['topic'].replace("Other", "Unlabeled")
df['topic'].value_counts()


topic
Unlabeled                    61940
Kinematics                   12226
Acids Bases Salts             5868
Laws of Motion                4406
Mole Concept                  4023
Work Power Energy             4013
Atomic Structure              3141
Current Electricity           2671
Optics                        2050
Thermodynamics                1865
Chemical Kinetics             1590
Chemical Equilibrium          1557
Chemical Bonding              1542
Cell Biology                  1524
Thermochemistry               1390
Magnetism                     1346
Periodic Table                1231
Waves                         1211
Redox Reactions               1187
Human Physiology              1029
Biomolecules                   844
Electrochemistry               762
Genetics                       757
Oscillations                   665
Modern Physics                 508
Respiration                    489
Alcohols Phenols               445
Gravitation                    390
Ecology       

In [20]:
unlabeled_df = df[df['topic'] == "Unlabeled"].copy()

print("Unlabeled samples:", len(unlabeled_df))
unlabeled_df[['eng','final_text','Subject']].head()


Unlabeled samples: 61940


Unnamed: 0,eng,final_text,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,antiforest measure afforestation b selective g...,Biology
2,If the area of two similar triangles are equal...,area two similar triangle equal equilateral b ...,Maths
3,"In recent year, there has been a growing\nconc...",recent year growing concern gradually increasi...,Biology
5,Fern plants reproduce by\nA. Seeds\nB. Spores\...,fern plant reproduce seed b spore c laying egg...,Biology
7,The sides of a right angled triangle are in A....,side right angled triangle ap ratio side b c c...,Maths


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_cluster = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_unlabeled = tfidf_cluster.fit_transform(unlabeled_df['final_text'])


In [22]:
from sklearn.cluster import KMeans

k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_unlabeled)

unlabeled_df['cluster'] = clusters
unlabeled_df['cluster'].value_counts()


cluster
6    37107
8     6119
7     5747
2     5273
3     1845
1     1697
9     1533
5     1139
4      808
0      672
Name: count, dtype: int64

In [23]:
for i in range(k):
    print("\n==============================")
    print("CLUSTER", i)
    print("==============================")
    display(unlabeled_df[unlabeled_df['cluster']==i][['eng','Subject']].head(10))



CLUSTER 0


Unnamed: 0,eng,Subject
444,Let \( a=2 i+j-2 k \) and \( b=i+j . \) If \( ...,Maths
460,A \( 0.66 \mathrm{kg} \) ball is moving with a...,Chemistry
496,Simplify: \( \frac{\left(3^{3}\right)^{-2} \ti...,Maths
630,In a building programme the event that all the...,Maths
845,Find each of the following products.\n\( 6 a \...,Maths
933,The sum of the series \( 1+\frac{1}{4 \times 2...,Maths
1051,\( 5.63 \times 11 \) is equal to\nA . 62\nB. \...,Maths
1405,The square root of \( 71 \times 72 \times 73 \...,Maths
1931,Two particles \( A \) and \( B \) are in motio...,Chemistry
2036,Which of the following options is not\ncorrect...,Chemistry



CLUSTER 1


Unnamed: 0,eng,Subject
93,\( x \) and \( y \) are two \( +v e \) numbers...,Maths
104,Find the value of \( x \)\nif \( \tan ^{-1} x+...,Maths
105,"If \( (x-1)\left(x^{2}+1\right)>0, \) then fin...",Maths
110,Solve the equations simultaneously to find the...,Maths
126,Find the principal value of:\n\( \tan ^{-1} \s...,Maths
262,Find the value of \( x \) in each of the\nfoll...,Maths
279,\( \operatorname{Let} \tan ^{-1} y=\tan ^{-1} ...,Maths
346,Find the value of a for which the\nfunction \(...,Maths
356,In the adjoining figure \( \boldsymbol{A B}= \...,Maths
544,Find the value of the unknown exterior\nangle ...,Maths



CLUSTER 2


Unnamed: 0,eng,Subject
7,The sides of a right angled triangle are in A....,Maths
18,The product \( B \) can be:\n\( \mathbf{A} \)\...,Chemistry
26,"The value of \( k, \) of the roots of the\nequ...",Maths
31,If \( \frac{1}{a}+\frac{1}{b}+\frac{1}{c}=\fra...,Maths
45,"In onion root tip during mitotic metaphase, th...",Biology
48,Probability of impossible event is\n\( A \cdot...,Maths
53,Three right angled prisms of refractive\nindic...,Physics
76,0 toppr 5\nQ Trpesourr\na.\nв.\n\( c_{0} \cdot...,Chemistry
99,attometer is \( -\ldots- \) nanometer.\nA \( \...,Physics
119,A type of ion found in sodium acetate\nis :\nA...,Chemistry



CLUSTER 3


Unnamed: 0,eng,Subject
37,If the straight line \( a x+b y+p=0 \) and\n\(...,Maths
147,Integrate the function \( x \sin x \),Maths
243,If \( f(x)=\sin ^{-1}\left\{\frac{\sqrt{3}}{2}...,Maths
458,The value of \( \cos ^{4}\left(\frac{\pi}{4}\r...,Maths
490,Prove:\n\( 3 \cos ^{-1} x=\cos ^{-1}\left(4 x^...,Maths
497,Without expanding prove that the determinant\n...,Maths
552,Evaluate \( \cos (A+B) \cdot \cos (A-B)= \)\n\...,Maths
553,What is \( (\sin x \cos y+ \)\n\( \cos x \sin ...,Maths
555,Prove that \( \frac{\sin x-\sin 3 x}{\sin ^{2}...,Maths
652,"If \( \boldsymbol{\alpha}, \boldsymbol{\beta},...",Maths



CLUSTER 4


Unnamed: 0,eng,Subject
283,Assertion: The reaction; \( _{1} \boldsymbol{H...,Physics
447,Assertion\nA tube light emits white light.\nRe...,Chemistry
523,Assertion\nGemmae formation in Funaria occurs ...,Biology
659,Assertion\nForest influences local air circula...,Biology
686,Assertion: Virtual object can't be seen\nby hu...,Physics
883,"ASSERTION: In the extraction of\nAg, complex \...",Chemistry
1155,"Assertion \( f: R \rightarrow\left[0, \frac{\p...",Maths
1344,Assertion \( \operatorname{Let} \boldsymbol{A}...,Maths
1395,Assertion\nnm is not same as mN\nReason\n\( \l...,Physics
1480,Assertion\n\( \operatorname{Let} \boldsymbol{A...,Maths



CLUSTER 5


Unnamed: 0,eng,Subject
146,"The products are :\nProducts \( P, Q, \) and \...",Chemistry
197,Which of the following is anodic\nreaction:\n\...,Chemistry
228,Which of the following is a redox\nreaction?\n...,Chemistry
432,Which of the following is an energy consuming ...,Chemistry
469,\( \lim _{x \rightarrow \infty} \cos (\sqrt{x+...,Maths
539,\( \lim _{x \rightarrow 3^{-}} \frac{|x-3|}{x-...,Maths
610,If \( f(x)=a x^{2}+b x+c \) then show that\n\(...,Maths
648,Which of the following is an endothermic\nreac...,Chemistry
903,Which of the following equations is a\nbalance...,Chemistry
921,represents the graph of the function \( f(x)=\...,Maths



CLUSTER 6


Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
5,Fern plants reproduce by\nA. Seeds\nB. Spores\...,Biology
10,Convert into mixed fractions.\n(a) \( \frac{3}...,Maths
12,The equation of the plane passing through the ...,Maths
14,An ideal gas is compressed in a closed contain...,Chemistry
19,The area of the quadrilateral formed by the po...,Maths
21,What percent of Rs.150 is Rs.30?\nA . 5\%\nB. ...,Maths
22,Fill in the blanks by choosing the appropriate...,Chemistry



CLUSTER 7


Unnamed: 0,eng,Subject
85,Find out which of the following sequences are ...,Maths
91,You know that \( \frac{1}{7}=0.142857 . \) Fin...,Maths
214,If \( \boldsymbol{A}=\left[\begin{array}{ll}\m...,Maths
263,Construct a \( \triangle A B C \) in which \( ...,Maths
265,Find the numerically greatest term in the expa...,Maths
277,The term independent of \( x \) in the expansi...,Maths
287,If the radius of a sphere is measured as\n\( 7...,Maths
294,The radius of a circular plate is increasing a...,Maths
304,An open U-tube contains mercury. When \( 13.6 ...,Physics
333,The sum of the first 7 terms of an A.P is\n63 ...,Maths



CLUSTER 8


Unnamed: 0,eng,Subject
50,Use the identity \( (x+a)(x+b)=x^{2}+ \)\n\( (...,Maths
60,Find the equation of following planes:,Maths
65,The following table shows the scores of\na gro...,Maths
69,The values of coefficients to balance the\nfol...,Chemistry
134,Write the degree of each of the following poly...,Maths
179,I. Increase in mass II. Differentiation III.\n...,Biology
181,State in which of the following examples the f...,Chemistry
211,Which of the following organisms show\nmixotro...,Biology
212,Balance the following reactions:\n\( \boldsymb...,Chemistry
240,Which of the following characteristics can be ...,Biology



CLUSTER 9


Unnamed: 0,eng,Subject
158,An enclosed fluid under pressure exerts that p...,Physics
182,Eccentricity of a hyperbola is always less tha...,Maths
257,Glucose solution is an electrolyte and hence c...,Chemistry
365,State true or false:\nWith reference to the fi...,Maths
451,Assertion (A): Ductile metals are used\nto pre...,Physics
488,Does Euclid' fifth postulate imply the existen...,Maths
498,State whether the following statement is true ...,Maths
588,Statement I: The equation\n\( \left(\sin ^{-1}...,Maths
639,"Water evaporates into air from oceans, rivers ...",Biology
721,State True or False.\nPolystyrene and PVC are ...,Chemistry


In [24]:
cluster_topic_map = {
    0: "Algebra & Series",
    1:"Trigonometry",
    2:"Mixed Concept Questions",
    3:"Trigonometric Identities",
    4:"Conceptual Assertion Reason",
    5:"Chemical Reactions",
    6:"Basic Conceptual Questions",
    7:"Algebra & Geometry",
    8:"Formula-Based Questions",
    9:"Concept Verification Questions"
}

In [25]:
unlabeled_df['topic'] = unlabeled_df['cluster'].map(cluster_topic_map)

df.loc[df['topic']=="Unlabeled", 'topic'] = unlabeled_df['topic'].values


In [26]:
df['topic'].value_counts(normalize=True) * 100


topic
Basic Conceptual Questions        30.286731
Kinematics                         9.978860
Formula-Based Questions            4.994327
Acids Bases Salts                  4.789461
Algebra & Geometry                 4.690701
Mixed Concept Questions            4.303822
Laws of Motion                     3.596177
Mole Concept                       3.283572
Work Power Energy                  3.275410
Atomic Structure                   2.563684
Current Electricity                2.180070
Optics                             1.673210
Thermodynamics                     1.522213
Trigonometric Identities           1.505889
Trigonometry                       1.385091
Chemical Kinetics                  1.297758
Chemical Equilibrium               1.270823
Chemical Bonding                   1.258580
Concept Verification Questions     1.251235
Cell Biology                       1.243889
Thermochemistry                    1.134518
Magnetism                          1.098605
Periodic Table            

In [27]:
def assign_difficulty(text):
    length = len(text.split())

    if length < 8:
        return "Easy"
    elif length < 18:
        return "Medium"
    else:
        return "Hard"

df['difficulty'] = df['final_text'].apply(assign_difficulty)
df['difficulty'].value_counts()


difficulty
Medium    54865
Hard      48852
Easy      18802
Name: count, dtype: int64

In [28]:
df.head()

Unnamed: 0,eng,Subject,final_text,topic,difficulty
0,An anti-forest measure is\nA. Afforestation\nB...,Biology,antiforest measure afforestation b selective g...,Basic Conceptual Questions,Medium
1,"Among the following organic acids, the acid pr...",Chemistry,among following organic acid acid present ranc...,Acids Bases Salts,Hard
2,If the area of two similar triangles are equal...,Maths,area two similar triangle equal equilateral b ...,Basic Conceptual Questions,Medium
3,"In recent year, there has been a growing\nconc...",Biology,recent year growing concern gradually increasi...,Basic Conceptual Questions,Medium
4,Which of the following statement\nregarding tr...,Physics,following statement regarding transformer inco...,Work Power Energy,Hard


In [29]:
df[['eng','Subject','topic','difficulty']].sample(20)


Unnamed: 0,eng,Subject,topic,difficulty
3610,If the diagonal BD of a quadrilateral\nABCD bi...,Maths,Concept Verification Questions,Medium
62994,\( \ln \operatorname{an} A . P . \) if \( \fra...,Maths,Basic Conceptual Questions,Easy
29761,If \( x \) is an even number then the\nconsecu...,Maths,Mixed Concept Questions,Medium
101821,Redox indicators are substances that\nundergo ...,Chemistry,Electrochemistry,Medium
64329,Chlorine is:\nA. Oxidising agent\nB. bleaching...,Chemistry,Basic Conceptual Questions,Medium
116277,What are dimensional variables?,Chemistry,Basic Conceptual Questions,Easy
2961,\( a C u^{2+}(a q)+b I^{-}(a q) \rightarrow c ...,Chemistry,Mixed Concept Questions,Hard
30891,Clinging roots occur in\nA. Trapa\nB. Orchid\n...,Biology,Basic Conceptual Questions,Medium
74098,Platinum forms two different\ncompounds with c...,Chemistry,Basic Conceptual Questions,Hard
65480,A circle has the equation \( (x+1)^{2}+ \)\n\(...,Maths,Basic Conceptual Questions,Medium


In [30]:
df.isnull().sum()

eng           0
Subject       0
final_text    0
topic         0
difficulty    0
dtype: int64

In [31]:
df.to_csv("../Dataset/Revised_Cleaned_Dataset.csv",index=False)