In [None]:
# some parts have been taken from https://www.kaggle.com/the1owl/redefining-treatment-0-57456

In [1]:
import pandas as pd
import numpy as np
import feather
from sklearn import preprocessing as pe
from tqdm import tqdm

In [2]:
df_train = feather.read_dataframe('../cache/train_stage1.feather')

In [3]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
df_test = feather.read_dataframe('../cache/test_stage1.feather')

In [5]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [6]:
y = df_train['Class'].values
df_train = df_train.drop(['Class'], axis=1)

test_ids = df_test['ID'].values

In [7]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [8]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [9]:
df_all.shape

(8989, 4)

In [10]:
df_all['GeneVar'] = df_all['Gene'] + ' ' + df_all['Variation']

In [11]:
df_all['Gene_Share'] = df_all.apply(lambda row: sum([1 for w in row['Gene'].split(' ') 
                                                   if w in row['Text'].split(' ')]), axis=1)



In [12]:
df_all['Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['Variation'].split(' ') 
                                                        if w in row['Text'].split(' ')]), axis=1)

In [13]:
df_all['Gene_Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['GeneVar'].split(' ') 
                                                        if w in row['Text'].split(' ')]), axis=1)

In [14]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2


In [15]:
df_all['GL'] = df_all['Gene'].apply(lambda x: len(x))

In [16]:
sorted(set(df_all['GL'].values))

[2, 3, 4, 5, 6, 7, 8, 9]

In [17]:
df_all['VL'] = df_all['Variation'].apply(lambda x: len(x))

In [18]:
sorted(set(df_all['VL'].values))

[3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 25,
 26,
 28,
 29,
 30,
 31,
 33,
 34,
 36,
 37,
 39,
 40,
 55]

In [19]:
max_len=55

In [20]:
for i in range(max_len+1):
    df_all['Gene_'+str(i)] = df_all['Gene'].map(lambda x: str(x[i]) if len(x)>i else '')
    df_all['Variation_'+str(i)] = df_all['Variation'].map(lambda x: str(x[i]) if len(x)>i else '')
    df_all['GeneVar_'+str(i)] = df_all['GeneVar'].map(lambda x: str(x[i]) if len(x)>i else '')

In [21]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GeneVar_52,Gene_53,Variation_53,GeneVar_53,Gene_54,Variation_54,GeneVar_54,Gene_55,Variation_55,GeneVar_55
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,,,,,,,,,,
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,,,,,,,,,,
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,,,,,,,,,,
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,,,,,,,,,,
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,,,,,,,,,,


In [None]:
# from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
gene_var_lst = sorted(list(df_train.Gene.unique()) + list(df_train.Variation.unique()))

gene_var_lst = [x for x in gene_var_lst if len(x.split(' '))==1]

i_ = 0

for el in tqdm(gene_var_lst):
    df_all['GV_'+str(el)] = df_all['Text'].map(lambda x: str(x).count(str(el)))
    i_ += 1

 16%|█▋        | 503/3091 [01:37<07:30,  5.74it/s]

In [45]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,6,1,39672,6105,3460,20,2,1654,27,3
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,3,1,36691,5783,3748,5,1,941,9,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,3,1,36691,5783,2425,5,1,933,9,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,3,1,36238,5625,2132,5,1,929,9,2
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,3,1,41308,6248,1854,5,1,927,9,2


In [46]:
s = df_all['Gene'].value_counts()

In [47]:
print(s['BRCA1'])

322


In [48]:
df_all['G_VC'] = df_all['Gene'].apply(lambda gene: s[str(gene)])

In [49]:
df_all.G_VC[df_all.Gene == 'BRCA1'].count()

322

In [50]:
s = df_all['Variation'].value_counts()

In [51]:
df_all['V_VC'] = df_all['Variation'].apply(lambda var: s[str(var)])

In [52]:
s = df_all['GeneVar'].value_counts()

In [53]:
df_all['GV_VC'] = df_all['GeneVar'].apply(lambda var: s[str(var)])

In [54]:
D = 2 ** 24

In [55]:
df_all['hash'] = df_all['Gene'] + '_' + df_all['Variation'] + '_' + df_all['Text']
                

In [56]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,6,1,39672,6105,3460,20,2,1654,27,3
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,3,1,36691,5783,3748,5,1,941,9,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,3,1,36691,5783,2425,5,1,933,9,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,3,1,36238,5625,2132,5,1,929,9,2
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,3,1,41308,6248,1854,5,1,927,9,2


In [57]:
df_all['hash'] = df_all['hash'].apply(lambda x: hash(x) % D)

In [58]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,6,1,39672,6105,3460,20,2,1654,27,3
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,3,1,36691,5783,3748,5,1,941,9,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,3,1,36691,5783,2425,5,1,933,9,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,3,1,36238,5625,2132,5,1,929,9,2
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,3,1,41308,6248,1854,5,1,927,9,2


In [59]:
df_all.shape

(4675, 3590)

In [60]:
for c in tqdm(df_all.columns):
    if df_all[c].dtype == 'object':
        if c in ['Gene','Variation','GeneVar']:
            lbl = pe.LabelEncoder()
            df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)  
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
        elif c != 'Text':
            lbl = pe.LabelEncoder()
            df_all[c] = lbl.fit_transform(df_all[c].values)
        if c=='Text': 
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 



100%|██████████| 3590/3590 [00:02<00:00, 1266.84it/s]


In [61]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [62]:
df_train['Class'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
df_train.shape

(3689, 3591)

In [64]:
df_test.shape

(986, 3590)

In [65]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,1,41308,6248,1854,5,1,927,9,2,4


In [66]:
df_train['hash'].value_counts()

11003903    1
6040360     1
15081029    1
1733411     1
15565477    1
4678433     1
627483      1
10924823    1
15004435    1
9486971     1
2616080     1
11574456    1
6007564     1
15795068    1
3959556     1
2734850     1
16427772    1
10441467    1
15895288    1
10248724    1
451317      1
6947607     1
1663729     1
16485104    1
635631      1
235535      1
12611695    1
2675495     1
14523178    1
16349922    1
           ..
10026321    1
14394750    1
8806374     1
9670019     1
6169594     1
7638471     1
5226691     1
12996468    1
3063233     1
15316416    1
16396386    1
1236405     1
16209332    1
16494003    1
4453808     1
8992172     1
3734949     1
2325893     1
6501796     1
7603618     1
166870      1
10728864    1
9426332     1
1153560     1
2155926     1
12588428    1
126347      1
4545929     1
3581318     1
6782976     1
Name: hash, Length: 3689, dtype: int64

In [67]:
df_test.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
3689,CHEK2,1,The incidence of breast cancer is increasing i...,H371Y,CHEK2 H371Y,1,1,2,5,5,...,5,1,33403,4991,1379,5,1,1080,11,2
3690,AXIN2,2,An unselected series of 310 colorectal carcino...,Truncating Mutations,AXIN2 Truncating Mutations,1,1,2,5,20,...,5,1,66400,10348,3460,20,2,286,26,3
3691,WNT4,3,Mycosis fungoides and Sézary syndrome are prim...,E216G,WNT4 E216G,0,1,1,4,5,...,4,1,58544,8638,672,5,1,4287,10,2
3692,SUCLA2,4,Regulated progression through the cell cycle ...,G118R,SUCLA2 G118R,0,1,1,6,5,...,6,1,42023,6221,1038,5,1,3841,12,2
3693,BRAF,5,Pilocytic astrocytoma (PA) is emerging as a tu...,T599insTT,BRAF T599insTT,1,0,1,4,9,...,4,1,22499,3280,3402,9,1,430,14,2


In [68]:
feather.write_dataframe(df_train, '../cache/train_stage1_fe.feather')

In [69]:
feather.write_dataframe(df_test, '../cache/test_stage1_fe.feather')