In [1]:
import pandas as pd
import numpy as np
import feather
from sklearn import preprocessing as pe
from tqdm import tqdm

In [2]:
df_train = feather.read_dataframe('../cache/train_stage2.feather')

In [3]:
df_train.head()

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [4]:
df_test = feather.read_dataframe('../cache/test_stage2.feather')

In [5]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,1,CHEK2,H371Y,The incidence of breast cancer is increasing i...
1,2,AXIN2,Truncating Mutations,An unselected series of 310 colorectal carcino...
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
4,5,BRAF,T599insTT,Pilocytic astrocytoma (PA) is emerging as a tu...


In [6]:
y = df_train['Class'].values
df_train = df_train.drop(['Class'], axis=1)

test_ids = df_test['ID'].values

In [7]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [8]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [9]:
df_all.shape

(4675, 4)

In [10]:
df_all['GeneVar'] = df_all['Gene'] + ' ' + df_all['Variation']

In [11]:
df_all['Gene_Share'] = df_all.apply(lambda row: sum([1 for w in row['Gene'].split(' ') 
                                                   if w in row['Text'].split(' ')]), axis=1)



In [12]:
df_all['Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['Variation'].split(' ') 
                                                        if w in row['Text'].split(' ')]), axis=1)

In [13]:
df_all['Gene_Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['GeneVar'].split(' ') 
                                                        if w in row['Text'].split(' ')]), axis=1)

In [14]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2


In [15]:
df_all['GL'] = df_all['Gene'].apply(lambda x: len(x))

In [16]:
sorted(set(df_all['GL'].values))

[2, 3, 4, 5, 6, 7, 8, 9]

In [17]:
df_all['VL'] = df_all['Variation'].apply(lambda x: len(x))

In [None]:
sorted(set(df_all['VL'].values))

In [68]:
max_len=55

In [69]:
for i in range(max_len+1):
    df_all['Gene_'+str(i)] = df_all['Gene'].map(lambda x: str(x[i]) if len(x)>i else '')
    df_all['Variation_'+str(i)] = df_all['Variation'].map(lambda x: str(x[i]) if len(x)>i else '')
    df_all['GeneVar_'+str(i)] = df_all['GeneVar'].map(lambda x: str(x[i]) if len(x)>i else '')

In [70]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GeneVar_52,Gene_53,Variation_53,GeneVar_53,Gene_54,Variation_54,GeneVar_54,Gene_55,Variation_55,GeneVar_55
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,,,,,,,,,,
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,,,,,,,,,,
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,,,,,,,,,,
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,,,,,,,,,,
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,,,,,,,,,,


In [71]:
gene_var_lst = sorted(list(df_train.Gene.unique()) + list(df_train.Variation.unique()))

gene_var_lst = [x for x in gene_var_lst if len(x.split(' '))==1]

i_ = 0

for el in tqdm(gene_var_lst):
    df_all['GV_'+str(el)] = df_all['Text'].map(lambda x: str(x).count(str(el)))
    i_ += 1

100%|██████████| 3397/3397 [06:11<00:00,  9.15it/s]


In [72]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_Y849S,GV_Y856H,GV_Y87C,GV_Y87N,GV_Y901C,GV_Y931C,GV_Y98H,GV_Y98N,GV_YAP1,GV_p61BRAF
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,0,0,0,0,0,0,0,0,0,0
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0


In [73]:
s = df_all['Gene'].value_counts()

In [74]:
print(s['BRCA1'])

322


In [75]:
df_all['G_VC'] = df_all['Gene'].apply(lambda gene: s[str(gene)])

In [76]:
df_all.G_VC[df_all.Gene == 'BRCA1'].count()

322

In [77]:
s = df_all['Variation'].value_counts()

In [78]:
df_all['V_VC'] = df_all['Variation'].apply(lambda var: s[str(var)])

In [79]:
s = df_all['GeneVar'].value_counts()

In [80]:
df_all['GV_VC'] = df_all['GeneVar'].apply(lambda var: s[str(var)])

In [81]:
D = 2 ** 24

In [82]:
df_all['hash'] = df_all['Gene'] + '_' + df_all['Variation'] + '_' + df_all['Text']
                

In [83]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_Y901C,GV_Y931C,GV_Y98H,GV_Y98N,GV_YAP1,GV_p61BRAF,G_VC,V_VC,GV_VC,hash
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,0,0,0,0,0,0,1,129,1,FAM58A_Truncating Mutations_Cyclin-dependent k...
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,CBL_W802*_ Abstract Background Non-small cell...
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,CBL_Q249E_ Abstract Background Non-small cell...
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,CBL_N454D_Recent evidence has demonstrated tha...
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,CBL_L399V_Oncogenic mutations in the monomeric...


In [84]:
df_all['hash'] = df_all['hash'].apply(lambda x: hash(x) % D)

In [88]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_Y901C,GV_Y931C,GV_Y98H,GV_Y98N,GV_YAP1,GV_p61BRAF,G_VC,V_VC,GV_VC,hash
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,0,0,0,0,0,0,1,129,1,3935774
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,13728644
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,10926447
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,14181478
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,0,0,0,0,0,0,29,1,1,15487294


In [89]:
df_all.shape

(4675, 3579)

In [91]:
for c in tqdm(df_all.columns):
    if df_all[c].dtype == 'object':
        if c in ['Gene','Variation','GeneVar']:
            lbl = pe.LabelEncoder()
            df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)  
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
        elif c != 'Text':
            lbl = pe.LabelEncoder()
            df_all[c] = lbl.fit_transform(df_all[c].values)
        if c=='Text': 
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 



100%|██████████| 3587/3587 [00:02<00:00, 1250.86it/s]


In [92]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [93]:
df_train['Class'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [94]:
df_train.shape

(3689, 3588)

In [95]:
df_test.shape

(986, 3587)

In [96]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,hash,Gene_lbl_enc,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,Class
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,1654,1,1,2,6,20,...,3935774,128,6,1,39672,6105,3460,20,2,1
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,941,1,1,2,3,5,...,13728644,59,3,1,36691,5783,3748,5,1,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,933,1,1,2,3,5,...,10926447,59,3,1,36691,5783,2425,5,1,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,929,1,1,2,3,5,...,14181478,59,3,1,36238,5625,2132,5,1,3
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,927,1,1,2,3,5,...,15487294,59,3,1,41308,6248,1854,5,1,4


In [97]:
df_test.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_VC,hash,Gene_lbl_enc,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words
3689,CHEK2,1,The incidence of breast cancer is increasing i...,H371Y,1080,1,1,2,5,5,...,2,14853815,75,5,1,33403,4991,1379,5,1
3690,AXIN2,2,An unselected series of 310 colorectal carcino...,Truncating Mutations,286,1,1,2,5,20,...,2,6310080,35,5,1,66400,10348,3460,20,2
3691,WNT4,3,Mycosis fungoides and Sézary syndrome are prim...,E216G,4287,0,1,1,4,5,...,1,5658328,394,4,1,58544,8638,672,5,1
3692,SUCLA2,4,Regulated progression through the cell cycle ...,G118R,3841,0,1,1,6,5,...,1,5544628,366,6,1,42023,6221,1038,5,1
3693,BRAF,5,Pilocytic astrocytoma (PA) is emerging as a tu...,T599insTT,430,1,0,1,4,9,...,2,8839571,50,4,1,22499,3280,3402,9,1


In [98]:
feather.write_dataframe(df_train, '../cache/train_stage2_fe.feather')

In [99]:
feather.write_dataframe(df_test, '../cache/test_stage2_fe.feather')