In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
df =pd.read_csv("/content/drive/MyDrive/Dataset/data.csv")  #loading the dataset

In [4]:
df.loc[df['Text'].isnull(),'Text'] = df['Gene'] +' '+df['Variation']  # Fill the null values

In [5]:
df['Text'].iloc[2755]  #checking

'BRAF G596C'

In [6]:
df.drop(['Unnamed: 0','ID'],axis=1,inplace=True)  # Dropping the unnecessary columns

In [7]:
# for text data
#Importing Libraries
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
#stopwords
stop_words = set(stopwords.words('english'))

In [10]:
def preprocess_text(text):
    text = str(text)
    text = text.lower() #Converting to lowercase
    #removing punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ',text)
    text = re.sub('<.*?>+',' ',text) #removing HTML Tags
    text = re.sub('[^a-zA-Z0-9\n]', ' ', text) #replacing special character with space
    text = re.sub(r'\s+', ' ',text) #removal of multiple spaces

    text_tokens = word_tokenize(text)

    #removing stopwords
    tw = [word for word in text_tokens if word not in stop_words]
    text = (" ").join(tw)
    return text

In [11]:
df['Text'] = df['Text'].apply(preprocess_text)  # applying the function to clean the text

In [12]:
#Replacing the excess space for Gene and variation
df['Gene'] = df['Gene'].str.replace('\s+', '_')
df['Variation'] = df['Variation'].str.replace('\s+', '_')

In [13]:
df['Text'].iloc[0] #checking the preprocess data

'cyclin dependent kinases cdks regulate variety fundamental cellular processes cdk10 stands one last orphan cdks activating cyclin identified kinase activity revealed previous work shown cdk10 silencing increases ets2 v ets erythroblastosis virus e26 oncogene homolog 2 driven activation mapk pathway confers tamoxifen resistance breast cancer cells precise mechanisms cdk10 modulates ets2 activity generally functions cdk10 remain elusive demonstrate cdk10 cyclin dependent kinase identifying cyclin activating cyclin cyclin orphan cyclin product fam58a whose mutations cause star syndrome human developmental anomaly whose features include toe syndactyly telecanthus anogenital renal malformations show star syndrome associated cyclin mutants unable interact cdk10 cyclin silencing phenocopies cdk10 silencing increasing c raf conferring tamoxifen resistance breast cancer cells cdk10 cyclin phosphorylates ets2 vitro cells positively controls ets2 degradation proteasome ets2 protein levels increa

In [14]:
gene_frequency = df['Gene'].value_counts(normalize=True)

# Map the frequency values to the 'genes' column
df['gene_frequency_encoded'] = df['Gene'].map(gene_frequency)

In [15]:
variation_frequency = df['Variation'].value_counts(normalize=True)

# Map the frequency values to the 'genes' column
df['variation_frequency_encoded'] = df['Variation'].map(variation_frequency)

In [16]:
df.head(3)

Unnamed: 0,Gene,Variation,Class,Text,gene_frequency_encoded,variation_frequency_encoded
0,FAM58A,Truncating_Mutations,1,cyclin dependent kinases cdks regulate variety...,0.000301,0.028004
1,CBL,W802*,2,abstract background non small cell lung cancer...,0.007528,0.000301
2,CBL,Q249E,2,abstract background non small cell lung cancer...,0.007528,0.000301


In [17]:
X = df.drop(['Class', 'Gene', 'Variation'], axis = 1)
y = df['Class']

In [18]:
X.shape

(3321, 3)

In [19]:
#splitting all the columns
X_text = X['Text']
vectorizer = TfidfVectorizer(min_df = 3)
X_text_vectorized = vectorizer.fit_transform(X_text.astype(str))

In [20]:
X_text_vectorized.max()

1.0

In [21]:
type(X_text)

pandas.core.series.Series

In [22]:
text_df = pd.DataFrame(X_text_vectorized.toarray(), columns=vectorizer.get_feature_names_out())

In [23]:
X.head(2)

Unnamed: 0,Text,gene_frequency_encoded,variation_frequency_encoded
0,cyclin dependent kinases cdks regulate variety...,0.000301,0.028004
1,abstract background non small cell lung cancer...,0.007528,0.000301


In [24]:
X_tem = df[['gene_frequency_encoded', 'variation_frequency_encoded']]
X_data = pd.concat([text_df, X_tem], axis=1)

In [25]:
X_data.shape

(3321, 65957)

In [26]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X_data, y, test_size= 0.3, stratify = y, random_state = 21)


In [27]:
#using smote algorithm
#from imblearn.over_sampling import SMOTE
#from collections import Counter
#sm = SMOTE()
#X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
#counter = Counter(y_train_sm)

In [29]:
#Multinomial NB
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB  #using multinomial NB for this classification

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
y_pred_proba = nb.predict_proba(X_test)
print('Recall score : ', recall_score(y_test, y_pred, average='weighted'))
print('f1 score : ', f1_score(y_test, y_pred, average='weighted'))
print('log loss : ', log_loss(y_test, y_pred_proba))

Recall score :  0.4954864593781344
f1 score :  0.41859957629961314
log loss :  2.1930661072846966


In [31]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 21)
rfc.fit(X_train, y_train)
y_pred_r = rfc.predict(X_test)
y_pred_proba_rfc = rfc.predict_proba(X_test)
print('Recall score : ', recall_score(y_test, y_pred_r, average='weighted'))
print('f1 score : ', f1_score(y_test, y_pred_r, average='weighted'))
print('log loss : ', log_loss(y_test, y_pred_proba_rfc))

Recall score :  0.6409227683049148
f1 score :  0.625606997933317
log loss :  1.9253471352848543


In [32]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_pred_proba_knn = knn.predict_proba(X_test)
print('Recall score : ', recall_score(y_test, y_pred_knn, average='weighted'))
print('f1 score : ', f1_score(y_test, y_pred_knn, average='weighted'))
print('log loss : ', log_loss(y_test, y_pred_proba_knn))

Recall score :  0.5697091273821464
f1 score :  0.552703991597449
log loss :  5.660195956531755


In [34]:
from sklearn.neighbors import KNeighborsClassifier
neighbours = [5,10,15,40,50,120,150,160,170,180,200,400,500,1000]
Log_Loss = []

for neighbour in neighbours:
    model = KNeighborsClassifier(n_neighbors = neighbour)
    model.fit(X_train,y_train)
    prediction = model.predict_proba(X_test)
    log_loss_cal = log_loss(y_test,prediction,eps=1e-15)
    Log_Loss.append(log_loss_cal)
    #print(f'For {neighbour} nearest neighbours the log loss is ', recall_score(y_test, prediction, average='weighted'))
    #print(f'For {neighbour} nearest neighbours the log loss is ', f1_score(y_test, prediction, average='weighted'))
    print(f'For {neighbour} nearest neighbours the log loss is ',log_loss_cal)

For 5 nearest neighbours the log loss is  5.4458604167350515
For 10 nearest neighbours the log loss is  3.5720676328284457
For 15 nearest neighbours the log loss is  2.806216020758064
For 40 nearest neighbours the log loss is  1.9436717681269764
For 50 nearest neighbours the log loss is  1.9519127413920665
For 120 nearest neighbours the log loss is  1.5529138082443101
For 150 nearest neighbours the log loss is  1.516009826889936
For 160 nearest neighbours the log loss is  1.5236028391697012
For 170 nearest neighbours the log loss is  1.5327614680979942
For 180 nearest neighbours the log loss is  1.539717370136619
For 200 nearest neighbours the log loss is  1.5256055594438767
For 400 nearest neighbours the log loss is  1.5784814867653598
For 500 nearest neighbours the log loss is  1.6136085770271378
For 1000 nearest neighbours the log loss is  1.6927013917823373


In [35]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=150)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_pred_proba_knn = knn.predict_proba(X_test)
print('Recall score : ', recall_score(y_test, y_pred_knn, average='weighted'))
print('f1 score : ', f1_score(y_test, y_pred_knn, average='weighted'))
print('log loss : ', log_loss(y_test, y_pred_proba_knn))

Recall score :  0.4292878635907723
f1 score :  0.35994362601461327
log loss :  1.5205380425194424
