In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import nltk

In [2]:
# Load the dataset; later cells read its 'clean_question' and 'Subject' columns.
df = pd.read_csv('../Dataset/Cleaned_Data.csv')

In [3]:
import nltk

# NLTK data needed by the cells below:
#   stopwords, wordnet, omw-1.4 -> stop-word list and WordNet lemmatizer
#   punkt, punkt_tab            -> tokenizer models loaded by word_tokenize
#                                  (newer NLTK releases look for "punkt_tab",
#                                  older ones for "punkt")
# Without the tokenizer data, word_tokenize raises LookupError on a machine
# where it has not been downloaded before.
for resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aaru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aaru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Aaru\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [5]:
# Import the corpus reader under an alias so this cell can be re-run safely:
# the name `stopwords` is rebound to a plain set below, and calling
# `stopwords.words(...)` on that set a second time would raise AttributeError.
from nltk.corpus import stopwords as _stopwords_corpus

# A set (rather than the list NLTK returns) keeps the per-token membership
# test in preprocess() cheap.
stopwords = set(_stopwords_corpus.words('english'))

In [6]:
# Single shared lemmatizer instance, reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocess(text):
    """Normalise one question into a space-separated string of lemmas.

    Steps: tokenise with ``word_tokenize``, keep purely alphabetic tokens,
    drop tokens found in the notebook-level ``stopwords`` set, then lemmatise
    what is left with the notebook-level ``lemmatizer``.

    Parameters
    ----------
    text : object
        Raw question text. Non-string values are converted with ``str``.
        ``None`` and float NaN (a missing cell in a pandas column) are
        treated as empty text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` when the input
        is missing or no token survives the filters.
    """
    # A missing value would otherwise be stringified to "None"/"nan", pass the
    # isalpha() filter and be emitted as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # One pass over the tokens: alphabetic only, and not a stop word.
    kept = (
        tok for tok in word_tokenize(str(text))
        if tok.isalpha() and tok not in stopwords
    )
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)


# Run every cleaned question through preprocess() and store the result next
# to the input column.
final_text = df['clean_question'].apply(preprocess)
df['final_text'] = final_text

# Side-by-side preview of the input and the processed text.
df[['clean_question', 'final_text']].head()

Unnamed: 0,clean_question,final_text
0,an antiforest measure is a afforestation b sel...,antiforest measure afforestation b selective g...
1,among the following organic acids the acid pre...,among following organic acid acid present ranc...
2,if the area of two similar triangles are equal...,area two similar triangle equal equilateral b ...
3,in recent year there has been a growing concer...,recent year growing concern gradually increasi...
4,which of the following statement regarding tra...,following statement regarding transformer inco...


Now that the text preprocessing is complete, we label-encode the `Subject` column.


In [8]:
from sklearn.preprocessing import LabelEncoder

# Keep the original subject names in their own column the first time this cell
# runs. The encoder is always fitted from that column, so re-running the cell
# does not re-fit it on already-encoded integers (which would leave
# encoder.classes_ holding numbers instead of subject names).
if 'Subject_name' not in df.columns:
    df['Subject_name'] = df['Subject']

encoder = LabelEncoder()

# 'Subject' still ends up holding the integer codes, as the later cells expect.
df['Subject'] = encoder.fit_transform(df['Subject_name'])

In [25]:
# Inspect the processed text column that is used as the model input below.
df['final_text']

0         antiforest measure afforestation b selective g...
1         among following organic acid acid present ranc...
2         area two similar triangle equal equilateral b ...
3         recent year growing concern gradually increasi...
4         following statement regarding transformer inco...
                                ...                        
122514    following group character present chordate sta...
122515    light year light emitted sun one year b time t...
122516    member dipnoi native india b africa australia ...
122517    one averagelife half active nucleus decay b le...
122518    state whether true false magnetic field region...
Name: final_text, Length: 122519, dtype: object

In [26]:
from sklearn.model_selection import train_test_split

# Features are the processed question text; the target is the encoded subject.
X, y = df['final_text'], df['Subject']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Quick look at the training text before it is vectorised.
print(X_train)

28237     ln amino acid sequence h amino acid mutated ua...
100555    mathrmg aqueous solution contain g calcium car...
63738     find velocity disc collision v b v mathrmc cdo...
95540     according oswald tippo angiosperm placed atrac...
84033     n circuit shown figure c source give voltage v...
                                ...                        
119879    write iupac name following structure beginarra...
103694                     sin x equal gx gx equal b c cdot
860                         exactly successive throw cdot c
15795     yeast produce clone naturally process called b...
121958    bond dissociation energy boldsymbolx boldsymbo...
Name: final_text, Length: 98015, dtype: object


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, with the vocabulary capped at 7000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=7000)

In [None]:
# Fit the vectoriser on the training text only, then replace X_train (raw text)
# with its TF-IDF document-term matrix.
# NOTE(review): this rebinds X_train, so the cell must run exactly once after
# the train/test split cell; re-run the split before re-running this cell.
X_train = tfidf.fit_transform(X_train)

In [30]:
# Vectorise the held-out text with the vectoriser fitted on the training split
# (transform only, no refit).
# NOTE(review): this rebinds X_test, so the cell must run exactly once after
# the train/test split cell; re-run the split before re-running this cell.
X_test = tfidf.transform(X_test)

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Candidate classifiers, keyed by the short label printed next to their score.
models = {
    "NB": MultinomialNB(),
    "LR": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(),
}

# Train each candidate on the TF-IDF training matrix and report its accuracy
# on the held-out split.
for name, model in models.items():
    acc = model.fit(X_train, y_train).score(X_test, y_test)
    print(name, acc)


NB 0.9033627162912178
LR 0.9193192948090108
SVM 0.9227881162259223


In [50]:
# Train a fresh LinearSVC with default settings as the final model; the bare
# name on the last line displays the fitted estimator.
svm = LinearSVC().fit(X_train, y_train)
svm

0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


Here I chose not to do hyperparameter tuning because the default LinearSVC already reaches about 92% accuracy on the test set.

In [51]:
import pickle
from pathlib import Path

# Create the output folder if it is missing: open(..., "wb") does not create
# parent directories and would raise FileNotFoundError.
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted vectoriser, the classifier and the label encoder, each
# in its own pickle file.
artifacts = {
    "tfidf.pkl": tfidf,
    "best_model.pkl": svm,
    "label_encoder.pkl": encoder,
}

for filename, obj in artifacts.items():
    with open(MODELS_DIR / filename, "wb") as f:
        pickle.dump(obj, f)