In [59]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation / FutureWarning notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [61]:
# Load the dataset CSV from the mounted Drive folder into `df`.
df =pd.read_csv("/content/drive/MyDrive/Dataset/data.csv")

In [62]:
# Rows without any text fall back to "<Gene> <Variation>" as their text.
missing_text = df['Text'].isnull()
df.loc[missing_text, 'Text'] = df['Gene'] + ' ' + df['Variation']

In [63]:
# Spot-check the row at position 2755: it should now hold the "Gene Variation" fallback.
df['Text'].iloc[2755]

'BRAF G596C'

In [64]:
# Remove the 'Unnamed: 0' and 'ID' columns in place; the rest of the notebook does not use them.
df.drop(columns=['Unnamed: 0', 'ID'], inplace=True)

In [65]:
# for text data
#Importing Libraries
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
import nltk
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt' tokenizer data.
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' in word_tokenize —
# confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [67]:
# English stop words, stored as a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

In [68]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lowercase tokens.

    Steps, in order: strip HTML-like tags, lowercase, replace punctuation and
    every other non-alphanumeric character with a space, collapse whitespace,
    tokenize with ``word_tokenize`` and drop the words found in the
    module-level ``stop_words`` set.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        input is processed as its string form.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" when nothing is left).
    """
    text = str(text)

    # Tags have to go before punctuation: '<' and '>' belong to
    # string.punctuation, so stripping punctuation first would make the tag
    # pattern unmatchable and leave the tag names behind as ordinary words.
    # The pattern requires a letter right after '<' (or '</'), so comparisons
    # such as "p < 0.05" are not mistaken for markup.
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)

    text = text.lower()  # converting to lowercase
    # removing punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('[^a-zA-Z0-9\n]', ' ', text)  # replacing special characters with a space
    text = re.sub(r'\s+', ' ', text)  # collapsing runs of whitespace

    text_tokens = word_tokenize(text)

    # removing stopwords
    kept = [word for word in text_tokens if word not in stop_words]
    return " ".join(kept)

In [69]:
# Run preprocess_text over every entry of the Text column.
df['Text'] = df['Text'].map(preprocess_text)

In [70]:
# Replace each run of whitespace in Gene and Variation with a single underscore.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which the pattern would be treated
# as a literal string and the values left untouched. The raw string avoids the
# invalid-escape warning for the backslash.
df['Gene'] = df['Gene'].str.replace(r'\s+', '_', regex=True)
df['Variation'] = df['Variation'].str.replace(r'\s+', '_', regex=True)

In [71]:
# Inspect the first document after preprocessing.
df['Text'].iloc[0]

'cyclin dependent kinases cdks regulate variety fundamental cellular processes cdk10 stands one last orphan cdks activating cyclin identified kinase activity revealed previous work shown cdk10 silencing increases ets2 v ets erythroblastosis virus e26 oncogene homolog 2 driven activation mapk pathway confers tamoxifen resistance breast cancer cells precise mechanisms cdk10 modulates ets2 activity generally functions cdk10 remain elusive demonstrate cdk10 cyclin dependent kinase identifying cyclin activating cyclin cyclin orphan cyclin product fam58a whose mutations cause star syndrome human developmental anomaly whose features include toe syndactyly telecanthus anogenital renal malformations show star syndrome associated cyclin mutants unable interact cdk10 cyclin silencing phenocopies cdk10 silencing increasing c raf conferring tamoxifen resistance breast cancer cells cdk10 cyclin phosphorylates ets2 vitro cells positively controls ets2 degradation proteasome ets2 protein levels increa

In [72]:
# %pip installs into the environment of the running kernel.
%pip install category_encoders



In [73]:
import pandas as pd
import category_encoders as ce


# One TargetEncoder per categorical column; each one is fitted in a later cell.
target_encoder = ce.TargetEncoder(cols=['Gene'])
target_encoder1 = ce.TargetEncoder(cols=['Variation'])




In [74]:


# Target-encode Gene against Class (a target encoding, not a frequency count,
# despite the column name).
# NOTE(review): the encoder is fitted on every row before the train/test split,
# so the labels of the future test rows feed into this feature (target leakage).
# NOTE(review): Class is handed over as a numeric target — confirm that encoding
# against the numeric class id is intended.
df['gene_frequency_encoded'] = target_encoder.fit_transform(df['Gene'], df['Class'])

In [75]:


# Target-encode Variation against Class (a target encoding, not a frequency
# count, despite the column name).
# NOTE(review): the encoder is fitted on every row before the train/test split,
# so the labels of the future test rows feed into this feature (target leakage).
df['variation_frequency_encoded'] = target_encoder1.fit_transform(df['Variation'], df['Class'])

In [76]:
# Preview the first three rows, including the two encoded columns.
df.head(3)

Unnamed: 0,Gene,Variation,Class,Text,gene_frequency_encoded,variation_frequency_encoded
0,FAM58A,Truncating_Mutations,1,cyclin dependent kinases cdks regulate variety...,3.927928,1.152708
1,CBL,W802*,2,abstract background non small cell lung cancer...,3.988734,4.058036
2,CBL,Q249E,2,abstract background non small cell lung cancer...,3.988734,4.058036


In [77]:
# Label vector, and a feature frame without the label and the raw categorical columns.
y = df['Class']
X = df.drop(columns=['Class', 'Gene', 'Variation'])

In [78]:
# (rows, columns) of the feature frame.
X.shape

(3321, 3)

In [79]:
# Build TF-IDF features from the cleaned text; min_df=3 ignores terms that occur
# in fewer than three documents.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so its vocabulary and IDF weights are also learned from the test documents.
X_text = X['Text']
vectorizer = TfidfVectorizer(min_df = 3)
X_text_vectorized = vectorizer.fit_transform(X_text.astype(str))

In [None]:
# Largest weight in the TF-IDF matrix (sanity check).
X_text_vectorized.max()

In [None]:
# Show the type of X_text (debugging aid).
type(X_text)

In [None]:
# Densify the sparse TF-IDF matrix into a DataFrame with one column per vocabulary term.
# NOTE(review): toarray() materializes every zero; with the shape reported further
# down (3321 rows x ~66k columns) that is well over a gigabyte if the values are
# 64-bit floats — confirm the runtime has enough memory.
text_df = pd.DataFrame(X_text_vectorized.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Preview the first two rows of X.
X.head(2)

In [None]:
# Put the two encoded columns next to the TF-IDF columns to form the final feature matrix.
# concat(axis=1) aligns rows on the index: text_df carries a fresh 0..n-1 index,
# so this assumes df still has its default index — TODO confirm no rows were dropped or reordered.
X_tem = df[['gene_frequency_encoded', 'variation_frequency_encoded']]
X_data = pd.concat([text_df, X_tem], axis=1)

In [85]:
# (rows, columns) of the combined feature matrix.
X_data.shape

(3321, 65957)

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the label; the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y,
    test_size=0.3,
    stratify=y,
    random_state=21,
)


In [None]:
# SMOTE oversampling of the training set, currently disabled: the code below is
# wrapped in a string literal, so none of it is executed.
'''from imblearn.over_sampling import SMOTE
from collections import Counter
sm = SMOTE()
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
counter = Counter(y_train_sm)'''

In [None]:
# Multinomial Naive Bayes baseline on the combined TF-IDF + encoded features
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB  #using multinomial NB for this classification

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
y_pred_proba = nb.predict_proba(X_test)  # one column per class, ordered as nb.classes_
print('Recall score : ', recall_score(y_test, y_pred, average='weighted'))
print('f1 score : ', f1_score(y_test, y_pred, average='weighted'))
# Pass the label order explicitly so the probability columns are matched to
# nb.classes_ even when some class does not occur in y_test.
print('log loss : ', log_loss(y_test, y_pred_proba, labels=nb.classes_))