In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df =pd.read_csv("data.csv")

In [3]:
#Importing Libraries
import re
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
#stopwords 
stop_words = set(stopwords.words('english'))

In [5]:
def preprocess_text(text):
    text = str(text)
    text = text.lower() #Converting to lowercase
    #removing punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ',text)
    text = re.sub('<.*?>+',' ',text) #removing HTML Tags
    text = re.sub('[^a-zA-Z0-9\n]', ' ', text) #replacing special character with space
    text = re.sub(r'\s+', ' ',text) #removal of multiple spaces
    
    text_tokens = word_tokenize(text)
    
    #removing stopwords
    tw = [word for word in text_tokens if word not in stop_words]
    text = (" ").join(tw)
    return text

In [6]:
df['Text'] = df['Text'].apply(preprocess_text)

In [7]:
#Replacing the excess space for Gene and variation
df['Gene'] = df['Gene'].str.replace('\s+', '_')
df['Variation'] = df['Variation'].str.replace('\s+', '_')

  df['Gene'] = df['Gene'].str.replace('\s+', '_')
  df['Variation'] = df['Variation'].str.replace('\s+', '_')


In [8]:
df.loc[df['Text'].isnull(),'Text'] = df['Gene'] +' '+df['Variation']

In [9]:
X = df[['Gene', 'Variation', 'Text']]
y = df['Class']

In [10]:
X

Unnamed: 0,Gene,Variation,Text
0,FAM58A,Truncating_Mutations,cyclin dependent kinases cdks regulate variety...
1,CBL,W802*,abstract background non small cell lung cancer...
2,CBL,Q249E,abstract background non small cell lung cancer...
3,CBL,N454D,recent evidence demonstrated acquired uniparen...
4,CBL,L399V,oncogenic mutations monomeric casitas b lineag...
...,...,...,...
3316,RUNX1,D171N,introduction myelodysplastic syndromes mds het...
3317,RUNX1,A122*,introduction myelodysplastic syndromes mds het...
3318,RUNX1,Fusions,runt related transcription factor 1 gene runx1...
3319,RUNX1,R80C,runx1 aml1 gene frequent target chromosomal tr...


In [11]:
y

0       1
1       2
2       2
3       3
4       4
       ..
3316    4
3317    1
3318    1
3319    4
3320    4
Name: Class, Length: 3321, dtype: int64

In [12]:
tfidf=TfidfVectorizer()
X_text=tfidf.fit_transform(X['Text'])

In [13]:
print(X_text.max())

1.0


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size = 0.2, stratify = y, random_state = 0)

In [29]:
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier

In [30]:
neighbours = [5,10,15,40,50,120,150,160,170,180,200,400,500,1000]
Log_Loss = []

for neighbour in neighbours:
    model = KNeighborsClassifier(n_neighbors = neighbour)
    model.fit(X_train,y_train)
    prediction = model.predict_proba(X_test)
    log_loss_cal = log_loss(y_test,prediction,eps=1e-15)
    Log_Loss.append(log_loss_cal)
    print(f'For {neighbour} nearest neighbours the log loss is ',log_loss_cal)

For 5 nearest neighbours the log loss is  5.734585162847762
For 10 nearest neighbours the log loss is  3.5084190842800815
For 15 nearest neighbours the log loss is  2.803618558689596
For 40 nearest neighbours the log loss is  1.9268636963136287
For 50 nearest neighbours the log loss is  1.8677557328741954
For 120 nearest neighbours the log loss is  1.4966364132058771
For 150 nearest neighbours the log loss is  1.43249890992998
For 160 nearest neighbours the log loss is  1.4429885568533891
For 170 nearest neighbours the log loss is  1.4472150556791905
For 180 nearest neighbours the log loss is  1.4555502481488258
For 200 nearest neighbours the log loss is  1.4708294003243485
For 400 nearest neighbours the log loss is  1.488403027918854
For 500 nearest neighbours the log loss is  1.5286841563538203
For 1000 nearest neighbours the log loss is  1.6611279747881897


In [21]:
from sklearn.metrics import accuracy_score
mo = KNeighborsClassifier(n_neighbors = 150)
mo.fit(X_train,y_train)
pred = mo.predict(X_test)
print(accuracy_score(y_test, pred))
prediction = mo.predict_proba(X_test)

0.4496240601503759


In [24]:
mo1 = KNeighborsClassifier(n_neighbors = 15)
mo1.fit(X_train,y_train)
pred = mo1.predict(X_test)
prediction1 = mo1.predict_proba(X_test)
print(accuracy_score(y_test, pred))

0.5458646616541354


In [20]:
from sklearn.metrics import roc_auc_score

In [25]:
macro_roc_auc_ovo = roc_auc_score(
    y_test,
    prediction1,
    multi_class="ovo",
    average="macro",
)

In [26]:
macro_roc_auc_ovo

0.8387975549980073

In [32]:
from sklearn.model_selection import GridSearchCV

#createa a parameter grid
param_grid={'n_neighbors':[1,10,30, 50, 80,100, 120, 150, 180, 200, 250, 300]}

gs = GridSearchCV(KNeighborsClassifier(),param_grid,cv=5)
gs.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 10, 30, 50, 80, 100, 120, 150, 180,
                                         200, 250, 300]})

In [33]:
print("Best parameters: ",gs.best_params_)
print("Best cross-validation score: {:.2f}".format(gs.best_score_))

Best parameters:  {'n_neighbors': 1}
Best cross-validation score: 0.61


In [34]:
macro_roc_auc_ovr = roc_auc_score(
    y_test,
    prediction,
    multi_class="ovr",
    average="macro",
)

In [35]:
macro_roc_auc_ovr

0.7752459055086866