# libraries

In [None]:
import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.grid_search import GridSearchCV 
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn import preprocessing

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
%matplotlib inline
# Apply the 'poster' element scaling and the white-grid theme to every plot
# drawn after this cell. set_context() installs the context globally, whereas
# plotting_context() only returns the parameter dict (or acts as a context
# manager), so calling it and discarding the result changed nothing.
sns.set_context('poster')
sns.set_style('whitegrid')

# data

#### training datasets
variants and text files

In [None]:
# Load the training variants table from 'training_variants.csv'.
train_v = pd.read_csv('training_variants.csv')

In [None]:
# Load the training text file. Fields are separated by a literal '||', which
# is passed as a regex and therefore needs the python parsing engine. The
# file's first line is skipped and the two columns are named explicitly.
# The separator is a raw string so the backslashes reach the regex engine
# without triggering invalid-escape warnings.
train_t = pd.read_csv('training_text', sep=r'\|\|', skiprows=1, engine='python', names=["ID", "text"])

In [None]:
# Join the training variants and training text tables on their shared 'ID'
# column (inner join, the pandas default).
train = train_v.merge(train_t, on='ID')

#### testing datasets
variant and text files

In [None]:
# Load the test variants table from 'test_variants.csv'.
test_v = pd.read_csv('test_variants.csv')

In [None]:
# Load the test text file ('||'-separated, first line skipped, columns named
# explicitly). The result must be bound to test_t: binding it to train_t
# overwrote the training text and left test_t undefined for the merge below.
test_t = pd.read_csv('test_text', sep=r'\|\|', skiprows=1, engine='python', names=["ID", "text"])

In [None]:
# Join the test variants and test text tables on their shared 'ID' column
# (inner join, the pandas default).
test = test_v.merge(test_t, on='ID')

#### combine training and testing datasets

In [None]:
# Stack the training rows on top of the test rows (axis=0) and renumber the
# rows 0..n-1. axis=1 would place the two tables side by side and, together
# with ignore_index=True, replace the column names with integers, so later
# lookups such as df['Variation'] or df['text'] would fail.
# The `keys` argument was dropped: it is discarded when ignore_index=True.
# Columns present in only one of the two tables are filled with NaN.
df = pd.concat([train, test], axis=0, ignore_index=True)

# Features

## variants
Classifies variations into common types and standardizes their names. The order of commands in the function matters because deletions and insertions can happen simultaneously but their coding is inconsistent. Testing: the function codes variations accurately. We might want to change the fusion expression ("fs") because it could be problematic with new data, but the hacky version works for now.

In [None]:
def var_recode(data, colname):
    """Add a ``<colname>_recoded`` column with standardized variation types.

    The new column starts as a copy of ``data[colname]``; values matching one
    of the rules below are replaced by a type label, all others are kept
    unchanged. Rules are applied in order and later rules overwrite earlier
    ones, so the order matters (e.g. a value containing both 'del' and 'ins'
    ends up as 'InDel').

    Args:
        data: DataFrame to modify in place.
        colname: Name of the string column holding the raw variation names.

    Returns:
        None. ``data`` gains (or has overwritten) the column
        ``colname + '_recoded'``.
    """
    recoded_colname = colname + '_recoded'
    variation = data[colname]

    def has(pattern):
        # Case-insensitive regex search; missing values never match, so the
        # mask is always a valid boolean indexer.
        return variation.str.contains(pattern, case=False, na=False)

    data[recoded_colname] = variation.copy()

    # Substitutions: a single letter (or the word 'null'), digits, then a
    # letter or '*', spanning the whole value. A boolean mask is used instead
    # of str.replace so the result does not depend on whether the installed
    # pandas treats the replace pattern as a regex by default.
    is_substitution = variation.str.match(r'^(?:[A-Z]|null)\d+[A-Z*]$', na=False)
    data.loc[is_substitution, recoded_colname] = 'Substitution'

    # Deletions (silencing and hypermethylation are grouped with them).
    data.loc[has('del|silencing|hypermethylation'), recoded_colname] = 'Deletion'
    # Insertions.
    data.loc[has('ins'), recoded_colname] = 'Insertion'
    # Combined deletion/insertion overrides both labels above.
    data.loc[has('del') & has('ins'), recoded_colname] = 'InDel'
    # Truncations.
    data.loc[has('trunc'), recoded_colname] = 'Truncation'
    # Duplications.
    data.loc[has('dup'), recoded_colname] = 'Duplication'
    # Fusions. NOTE(review): 'fs' is matched anywhere in the value and may
    # catch unrelated names — confirm this is the intended rule.
    data.loc[has('fusion|fs'), recoded_colname] = 'Fusion'

In [None]:
# Adds the 'Variation_recoded' column to df in place; var_recode returns nothing.
var_recode(df, 'Variation')

## genes
quick function to calculate gene frequency for a grouping variable 1) count of unique genes 2) total number of genes per group (similar to group frequency) 3) most frequent gene in each group 4) proportional frequency of gene accounting for size of group 5) reset index for easier plotting

In [None]:
def genefreq(data, group):
    """Summarize gene frequencies within each level of a grouping variable.

    Args:
        data: DataFrame with a 'Gene' column and the grouping column(s).
        group: Column name (or list of names) to group by.

    Returns:
        DataFrame with one row per group (the group key is a regular column
        after the index reset) and the columns:
          * Gene                      - number of distinct genes in the group
          * Gene_total                - number of non-missing gene entries
          * Gene_most_frequent        - the most common gene in the group
          * Gene_most_frequent_count  - how often that gene occurs
          * Gene_p_unique             - Gene / Gene_total
          * Gene_p_most_frequent      - Gene_most_frequent_count / Gene_total
    """
    # Build the grouped view once and reuse it for every statistic.
    genes = data.groupby(group)['Gene']

    newdf = genes.nunique().to_frame('Gene')
    newdf['Gene_total'] = genes.count()
    newdf['Gene_most_frequent'] = genes.agg(lambda x: x.value_counts().idxmax())
    newdf['Gene_most_frequent_count'] = genes.agg(lambda x: x.value_counts().max())
    newdf['Gene_p_unique'] = newdf['Gene'] / newdf['Gene_total']
    # Proportions are taken against this frame's own totals.
    newdf['Gene_p_most_frequent'] = newdf['Gene_most_frequent_count'] / newdf['Gene_total']
    # Turn the group key back into a column for easier plotting.
    newdf.reset_index(inplace=True)
    return newdf

In [None]:
# Gene summary per recoded variation type. The result is not assigned, so it
# is only available as this cell's output.
genefreq(df, 'Variation_recoded')

## text

#### word count per entry

In [None]:
# Number of whitespace-separated tokens in each entry's text. The text column
# is named 'text' (lower case) when the text files are read, so looking up
# "Text" raised a KeyError. Missing text is counted as zero words instead of
# failing on .split().
df.loc[:, 'Word_count'] = df['text'].fillna('').apply(lambda x: len(x.split()))

#### top word frequencies

clean the text: keep letters only, lowercase, remove stop words, and stem the remaining words

In [None]:
# Build one cleaned document per row: non-letters become spaces, the text is
# lower-cased and split on whitespace, English stop words are dropped and the
# remaining tokens are Porter-stemmed and re-joined with single spaces.

# NOTE(review): only the first 3321 rows are processed, as in the original
# loop; presumably this is the number of training rows — confirm.
N_CORPUS_ROWS = 3321

# Loop invariants are built once: previously the stop-word list was reloaded
# and converted to a set for every single token, and a new stemmer was
# created for every row.
non_letters = re.compile('[^a-zA-Z]')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Positional slicing does not depend on the index labels and does not fail
# when the frame has fewer rows than the limit.
for raw_text in df['text'].iloc[:N_CORPUS_ROWS]:
    if not isinstance(raw_text, str):
        # Missing text becomes an empty document so corpus stays row-aligned.
        raw_text = ''
    tokens = non_letters.sub(' ', raw_text).lower().split()
    corpus.append(' '.join(ps.stem(tok) for tok in tokens if tok not in stop_words))

find frequent words (from SK)

In [None]:
# Count every single-word token across the cleaned corpus and list the
# vocabulary by total frequency, most frequent first.
word_vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(corpus)

# Column totals computed by the sparse matrix itself; the builtin sum() added
# the rows one at a time in a Python-level loop.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    feature_names = word_vectorizer.get_feature_names_out()
else:
    feature_names = word_vectorizer.get_feature_names()

h = pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])
# Move the words out of the index into a regular 'index' column.
h.reset_index(inplace=True)
h.sort_values('frequency', ascending=False, inplace=True)

In [None]:
# Terms to count in each entry's text, written as one comma-separated string
# and split into a list.
# NOTE(review): a few entries look misspelled (e.g. 'phospharylation',
# 'deleterous', possibly 'sp3b1'); if they are used for literal substring
# matching they would never match the intended words — confirm before
# changing, since the terms may also be used as column names.
words = 'mutat, cancer, patient, protein, express, tumor, variant, kinase, domain, brca1, egfr, activation, ras, p53, exon, growth, clinical, signaling, function, pten, phospharylation, residue, resistance, raf, pathway, alk, disease, receptor, missense, breast, braf, inhibit, lung, tyrosine, acid, induce, survival, proliferation, akt, imatinib, fusion, oncogenic, transcription, deleterous, flt3, melanoma, somatic, gefitinib, brct, myc, amplification, genomic, pathogenic, benign, p21, phosphatase, sp3b1, bat3'.split(', ')


find frequency of top words in each entry

In [None]:
def word_count(df, words):
    """Add one column per term with its occurrence count in each row's text.

    Counting is a case-insensitive, non-overlapping substring count
    (``str.count``) on the lower-cased 'text' column, so a term also matches
    inside longer words. Terms are expected in lower case.

    Args:
        df: DataFrame with a 'text' column; modified in place.
        words: Iterable of terms; each becomes a column name in ``df``.

    Returns:
        The same ``df`` object, with one added (or overwritten) column per
        term.
    """
    # Lower-case each document once instead of once per term. Rows without
    # string text are treated as empty and therefore count zero for every
    # term instead of raising on .lower().
    lowered = [text.lower() if isinstance(text, str) else '' for text in df['text']]
    for word in words:
        df[word] = [text.count(word) for text in lowered]
    return df

In [None]:
# word_count adds one count column per term to df in place and returns that
# same frame, so merged_text and df refer to the same object.
merged_text = word_count(df, words)

"bag of words" from MW

In [None]:
# Bag-of-words frequency table: total count of every single-word token across
# the cleaned corpus, indexed by word and sorted most frequent first.
# NOTE(review): this rebinds `df`, which until this cell held the combined
# train/test table; any later code that still expects that table (e.g.
# df['text']) will break. Presumably a leftover from merging two notebook
# versions — confirm, and consider a separate variable name.
word_vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(corpus)

# Column totals computed by the sparse matrix itself; the builtin sum() added
# the rows one at a time in a Python-level loop.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    feature_names = word_vectorizer.get_feature_names_out()
else:
    feature_names = word_vectorizer.get_feature_names()

df = pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])
df.sort_values('frequency', ascending=False, inplace=True)

#### word count per entry