### **Objective:** Develop algorithms to classify genetic mutations into different classes based on clinical evidence (text)

We will start by exploring our data files - training_variants and training_text. Data exploration will help us choose the path for model building more accurately.

In [None]:
#Importing the libraries 
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from textblob import TextBlob as tb
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC


from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import gensim


import nltk


import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam

We will now read all the data files as Pandas dataframes and print their dimensions

In [None]:
# Load the four competition files as DataFrames and print their dimensions.
#
# The dataset directory defaults to the original location but can be overridden
# with the CANCER_DATA_DIR environment variable, so the notebook is not tied to
# a single machine.
DATA_DIR = os.environ.get(
    "CANCER_DATA_DIR",
    "C:/Users/AJ186043/Desktop/WORK/Kaggle Competitions/Cancer Treatment/Dataset",
)

# The text files use "||" as separator. It is a regex for the python engine, so
# it is written as a raw string to avoid invalid-escape warnings. The header
# row is skipped and replaced with explicit column names.
text_read_options = dict(sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])

train_variants_df = pd.read_csv(DATA_DIR + "/training_variants")
test_variants_df = pd.read_csv(DATA_DIR + "/test_variants")
train_text_df = pd.read_csv(DATA_DIR + "/training_text", **text_read_options)
test_text_df = pd.read_csv(DATA_DIR + "/test_text", **text_read_options)

for label, frame in [
    ("Train Variant", train_variants_df),
    ("Train Text", train_text_df),
    ("Test Variant", test_variants_df),
    ("Test Text", test_text_df),
]:
    print(label.ljust(15), frame.shape)

Now we will see the columns present in both training_variants and training_text files

In [None]:
# Preview the first rows of the variants table.
train_variants_df.head()

In [None]:
# Preview the first rows of the text table.
train_text_df.head()

So both files share a common column, ID. The training_variants file has 4 columns - ID, Gene, Variation and Class - while the training_text file has 2 columns - ID and Text.

In [None]:
# Count missing values per column. The raw boolean mask has one row per record,
# which floods the output without showing whether anything is actually missing.
train_variants_df.isnull().sum()

Now in the next step we are going to calculate the unique count for all the columns in the training_variants file.

In [None]:
# Report how many distinct values each column of training_variants holds.
# nunique(dropna=False) counts a missing value as its own entry, which matches
# len(unique()). A single print call keeps the segments separated by spaces.
print(
    "For training data, there are a total of",
    train_variants_df["ID"].nunique(dropna=False), "IDs,",
    train_variants_df["Gene"].nunique(dropna=False), "unique genes,",
    train_variants_df["Variation"].nunique(dropna=False), "unique variations and",
    train_variants_df["Class"].nunique(dropna=False), "classes",
)

So we get that we have 9 unique classes in which the data needs to be categorized. Let us calculate the frequency for all these 9 classes to better understand them.

In [None]:
# Bar chart of how many training records fall into each class.

fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="Class", data=train_variants_df, ax=ax)
ax.set_ylabel('Frequency', fontsize=14)
# The x-axis carries the class labels; the counts are on the y-axis.
ax.set_xlabel('Class', fontsize=12)
plt.xticks(rotation='vertical')
ax.set_title("Frequency of Classes", fontsize=15)
plt.show()

The above graph shows that class 7 has the highest frequency, followed by class 4, class 1 and so on. Class 8 has the least frequency.
Now we are going to see the count for each Gene type present in the data.

In [None]:
# Count how often each gene occurs in the training variants, then order the
# counts from most to least frequent.

gene_group = train_variants_df.groupby("Gene")['Gene'].count()
max_occ_genes = gene_group.sort_values(ascending=False)
# Reuse the sorted series rather than sorting a second time.
print("Genes with maximal occurences\n", max_occ_genes)

So we see that the gene BRCA1 has the highest occurrence count. We will now take the top 10 genes (by occurrence count) and plot them.

In [None]:
# Keep the first ten entries of the descending gene counts.
max_occ_top = max_occ_genes.iloc[:10]

In [None]:
# Show the ten most frequent genes with their occurrence counts.
max_occ_top

In [None]:
#Plotting the top 10 genes according to count

# Explicit axes plus labels and a title so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(12, 8))
max_occ_top.plot(kind='bar', ax=ax)
ax.set_xlabel('Gene')
ax.set_ylabel('Number of occurrences')
ax.set_title('Top 10 genes by number of occurrences')
plt.show()

So the above graph gives us top 10 genes according to their frequencies. We can see that the Gene named BRCA1 has the highest frequency in the overall data followed by TP53, EGFR and so on. 

In [None]:
# Number of non-null Gene entries per class, and the same counts ordered from
# largest to smallest.
class_group = train_variants_df.groupby("Class")['Gene'].count()
occ_genes = class_group.sort_values(ascending=False)

In [None]:
#Plotting the top 10 genes for each of the 9 classes

fig, axs = plt.subplots(ncols=3, nrows=3, figsize=(20,20))

# axs.flat walks the 3x3 grid row by row, so the k-th panel shows class k
# (the same order as the original (i*3+j)+1 indexing).
for class_id, ax in enumerate(axs.flat, start=1):
    gene_count_grp = (
        train_variants_df[train_variants_df["Class"] == class_id]
        .groupby('Gene')["Variation"]
        .count()
        .reset_index()
    )
    sorted_gene_group = gene_count_grp.sort_values('Variation', ascending=False)
    sorted_gene_group_top_10 = sorted_gene_group[:10]
    sns.barplot(x="Gene", y="Variation", data=sorted_gene_group_top_10, ax=ax)
    # Without a title the panels are indistinguishable, yet the conclusions
    # drawn from this figure depend on knowing which class each one shows.
    ax.set_title("Class {}".format(class_id))
    ax.set_ylabel("Number of variations")
    ax.tick_params(axis='x', labelrotation=45)
plt.show()

So we conclude the following :

1. BRCA1 is the gene with highest frequency and it appears the most in Class 5
2. TP53 is the gene with the second highest frequency and it appears the most in Class 1

### Let's now explore the training_text file

In [None]:
# Count missing values per column rather than displaying the full boolean mask.
train_text_df.isnull().sum()

In [None]:
# Number of whitespace-separated tokens in each text.
# The vectorised .str accessor replaces the per-row lambda, and fillna("") makes
# a missing text count as 0 words instead of raising AttributeError on NaN.
train_text_df.loc[:, 'Text_count'] = train_text_df["Text"].fillna("").str.split().str.len()
train_text_df.head()

In [None]:
# Combine variants and texts into one frame; only IDs present in both are kept.
train_full = pd.merge(train_variants_df, train_text_df, on="ID", how="inner")
train_full.head()

In [None]:
# Summary statistics of the per-record word count, grouped by class.
count_grp = train_full.groupby('Class')["Text_count"]
count_grp.describe()

In [None]:
# Inspect the records whose text consists of exactly one token.
train_full[train_full["Text_count"]==1.0]

In [None]:
# Discard records whose text is empty or a single token (Text_count of 0 or 1).
# The original only removed count == 1, leaving empty texts in the frame.
# drop() returns a new frame, so the name is rebound instead of mutating in place.
train_full = train_full.drop(train_full[train_full["Text_count"] <= 1].index)

In [None]:
# Preview the filtered frame; displaying it in full would dump every record.
train_full.head()

In [None]:
# Word cloud built from all texts belonging to class 7.
# The merged frame is named train_full; train_full_df is never defined in this
# notebook and raised NameError on a fresh kernel. Missing texts are skipped so
# that str.join does not fail on NaN.
cloud = WordCloud(width=1440, height=1080).generate(
    " ".join(train_full[train_full.Class == 7]['Text'].dropna())
)
# Draw on one explicit axes. Calling plt.axes() after imshow adds a second,
# empty axes on top of the image instead of returning the existing one.
fig, ax = plt.subplots(figsize=(20, 15))
ax.imshow(cloud)
ax.axis('off')
ax.set_title('Class 7 Text Word Cloud')
plt.show()

In [None]:
# Word cloud built from all texts belonging to class 8.
# The merged frame is named train_full; train_full_df is never defined in this
# notebook and raised NameError on a fresh kernel. Missing texts are skipped so
# that str.join does not fail on NaN.
cloud = WordCloud(width=1440, height=1080).generate(
    " ".join(train_full[train_full.Class == 8]['Text'].dropna())
)
# Draw on one explicit axes. Calling plt.axes() after imshow adds a second,
# empty axes on top of the image instead of returning the existing one.
fig, ax = plt.subplots(figsize=(20, 15))
ax.imshow(cloud)
ax.axis('off')
ax.set_title('Class 8 Text Word Cloud')
plt.show()

In [None]:
def tf(word, train_full_df):
    """Return the term frequency of ``word`` within one document.

    Args:
        word: Token to count.
        train_full_df: Despite its name, a single tokenised document exposing a
            ``words`` sequence with a ``count`` method (for example a TextBlob),
            not a DataFrame. The parameter name is kept so existing keyword
            calls keep working.

    Returns:
        Occurrences of ``word`` divided by the total number of words, or 0.0
        when the document contains no words.
    """
    # The original body read an undefined global ``blob`` and ignored its
    # argument; the document passed in is used instead.
    words = train_full_df.words
    total = len(words)
    if total == 0:
        return 0.0
    return words.count(word) / total

In [None]:
#Code for eliminating Stop Words

import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch only the stop-word corpus. A bare nltk.download() opens the interactive
# downloader and stalls a non-interactive Run All.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# Filter the merged training texts. The original opened a file literally named
# "train_full_df", which is the name of a variable, not a path on disk, and
# never closed it.
# - Matching is case-insensitive, since the NLTK stop-word list is lowercase and
#   capitalised stop words would otherwise survive.
# - The output file is opened once, in write mode, rather than reopened in
#   append mode for every word; re-running the cell no longer duplicates output.
with open('filteredtext.txt', 'w', encoding='utf-8') as filtered_file:
    for document in train_full['Text'].dropna():
        for r in document.split():
            if r.lower() not in stop_words:
                filtered_file.write(" " + r)

In [None]:
#print(appendfile)

In [None]:
#from nltk.tokenize import sent_tokenize, word_tokenize
#from nltk.corpus import stopwords

In [None]:
#data=train_full
#stopWords = set(stopwords.words('english'))
#words = word_tokenize(train_full)
#wordsFiltered = []

In [None]:
#for w in words:
 #   if w not in stopWords:
  #      wordsFiltered.append(w)
 
#print(wordsFiltered)

In [None]:
#import nltk
#nltk.download('stopwords')

In [None]:
# Summary of every column of the merged frame. It is named train_full;
# train_full_df is never defined in this notebook.
train_full.describe(include='all')

In [None]:
#Stop words elimination using Count Vectorizer
# Build a term-count matrix restricted to the n_features most frequent terms,
# ignoring English stop words, terms present in more than 95% of the documents
# and terms present in fewer than 2 documents.
n_features = 50
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,  # was a second hard-coded 50
                                stop_words='english')
# NOTE: `tf` rebinds the name of the tf() function defined in an earlier cell.
# The merged frame is train_full; train_full_df is never defined in this notebook.
tf = tf_vectorizer.fit_transform(train_full['Text'])

In [None]:
# Terms kept by the vectoriser, in column order.
# get_feature_names() is no longer available in recent scikit-learn releases;
# use get_feature_names_out() when present and fall back for older versions.
# tolist() keeps the result a plain list of str, as before.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_names

In [None]:
#In this step we will try to check all the Gene and Variation combinations present in our data
# The column is added to train_full, the frame the following cells read it from;
# train_full_df is never defined in this notebook.
train_full['Gene_And_Variation'] = train_full['Gene'] + ' ' + train_full['Variation']
train_full.head()

In [None]:
# Print the column summary of the merged frame.
train_full.info()

Now we will check the unique values for Gene+Variation combination

In [None]:
# The most frequent Gene + Variation combinations (first rows of the counts).
train_full['Gene_And_Variation'].value_counts().head()

In [None]:
# The value counted is the Gene + Variation combination, not the gene, so the
# label says so; the dangling end='' and trailing comma are dropped as well.
# nunique(dropna=False) counts a missing value as its own entry, like len(unique()).
print(train_full["Gene_And_Variation"].nunique(dropna=False), "unique gene and variation combinations")

In [None]:
# Word-level bag-of-words vectoriser: tokens come from NLTK's word_tokenize,
# English stop words are removed, and the vocabulary size is not capped.
count_vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=nltk.word_tokenize,
    preprocessor=None,
    stop_words='english',
    max_features=None,
)

In [None]:
# Learn the vocabulary from the training texts and build the term-count matrix.
bag_of_words = count_vectorizer.fit_transform(train_full['Text'])

In [None]:
# Size of the learned vocabulary. The fitted vocabulary_ mapping holds one
# entry per feature and is available across scikit-learn versions, unlike
# get_feature_names(), which newer releases no longer provide.
len(count_vectorizer.vocabulary_)