In [None]:
# some parts have been taken from https://www.kaggle.com/the1owl/redefining-treatment-0-57456

In [1]:
import pandas as pd
import numpy as np
import feather
from sklearn import preprocessing as pe
from tqdm import tqdm

In [2]:
df_train = feather.read_dataframe('../cache/train_stage1.feather')

In [3]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
df_test = feather.read_dataframe('../cache/test_stage1.feather')

In [5]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [6]:
y = df_train['Class'].values
df_train = df_train.drop(['Class'], axis=1)

test_ids = df_test['ID'].values

In [7]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [8]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [9]:
df_all.shape

(8989, 4)

In [10]:
df_all['GeneVar'] = df_all['Gene'] + ' ' + df_all['Variation']

In [11]:
df_all['Gene_Share'] = df_all.apply(lambda row: sum([1 for w in row['Gene'].split(' ') 
                                                   if w in row['Text'].split(' ')]), axis=1)



In [12]:
df_all['Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['Variation'].split(' ') 
                                                        if w in row['Text'].split(' ')]), axis=1)

In [13]:
df_all['Gene_Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['GeneVar'].split(' ') 
                                                        if w in row['Text'].split(' ')]), axis=1)

In [14]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2


In [15]:
df_all['GL'] = df_all['Gene'].apply(lambda x: len(x))

In [16]:
sorted(set(df_all['GL'].values))

[2, 3, 4, 5, 6, 7, 8, 9]

In [17]:
df_all['VL'] = df_all['Variation'].apply(lambda x: len(x))

In [18]:
sorted(set(df_all['VL'].values))

[3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 25,
 26,
 28,
 29,
 30,
 31,
 33,
 34,
 36,
 37,
 39,
 40,
 55]

In [19]:
max_len=55

In [20]:
for i in range(max_len+1):
    df_all['Gene_'+str(i)] = df_all['Gene'].map(lambda x: str(x[i]) if len(x)>i else '')
    df_all['Variation_'+str(i)] = df_all['Variation'].map(lambda x: str(x[i]) if len(x)>i else '')
    df_all['GeneVar_'+str(i)] = df_all['GeneVar'].map(lambda x: str(x[i]) if len(x)>i else '')

In [21]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GeneVar_52,Gene_53,Variation_53,GeneVar_53,Gene_54,Variation_54,GeneVar_54,Gene_55,Variation_55,GeneVar_55
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,,,,,,,,,,
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,,,,,,,,,,
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,,,,,,,,,,
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,,,,,,,,,,
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,,,,,,,,,,


In [22]:
# from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
gene_var_lst = sorted(list(df_train.Gene.unique()) + list(df_train.Variation.unique()))

gene_var_lst = [x for x in gene_var_lst if len(x.split(' '))==1]

i_ = 0

for el in tqdm(gene_var_lst):
    df_all['GV_'+str(el)] = df_all['Text'].map(lambda x: str(x).count(str(el)))
    i_ += 1

100%|██████████| 3091/3091 [10:00<00:00,  5.95it/s]


In [23]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_Y849C,GV_Y849S,GV_Y87C,GV_Y87N,GV_Y901C,GV_Y931C,GV_Y98H,GV_Y98N,GV_YAP1,GV_p61BRAF
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,0,0,0,0,0,0,0,0,0,0
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,0,0,0,0,0,0,0,0,0,0


In [24]:
s = df_all['Gene'].value_counts()

In [25]:
print(s['BRCA1'])

293


In [26]:
df_all['G_VC'] = df_all['Gene'].apply(lambda gene: s[str(gene)])

In [27]:
df_all.G_VC[df_all.Gene == 'BRCA1'].count()

293

In [28]:
s = df_all['Variation'].value_counts()

In [29]:
df_all['V_VC'] = df_all['Variation'].apply(lambda var: s[str(var)])

In [30]:
s = df_all['GeneVar'].value_counts()

In [31]:
df_all['GV_VC'] = df_all['GeneVar'].apply(lambda var: s[str(var)])

In [32]:
D = 2 ** 24

In [33]:
df_all['hash'] = df_all['Gene'] + '_' + df_all['Variation'] + '_' + df_all['Text']
                

In [34]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_Y901C,GV_Y931C,GV_Y98H,GV_Y98N,GV_YAP1,GV_p61BRAF,G_VC,V_VC,GV_VC,hash
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,0,0,0,0,0,0,1,111,1,FAM58A_Truncating Mutations_Cyclin-dependent k...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,CBL_W802*_ Abstract Background Non-small cell...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,CBL_Q249E_ Abstract Background Non-small cell...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,CBL_N454D_Recent evidence has demonstrated tha...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,CBL_L399V_Oncogenic mutations in the monomeric...


In [35]:
df_all['hash'] = df_all['hash'].apply(lambda x: hash(x) % D)

In [36]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,GV_Y901C,GV_Y931C,GV_Y98H,GV_Y98N,GV_YAP1,GV_p61BRAF,G_VC,V_VC,GV_VC,hash
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,0,0,0,0,0,0,1,111,1,16184684
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,15320816
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,4516901
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,12999102
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,0,0,0,0,0,0,27,1,1,6288701


In [37]:
df_all.shape

(8989, 3273)

In [38]:
for c in tqdm(df_all.columns):
    if df_all[c].dtype == 'object':
        if c in ['Gene','Variation','GeneVar']:
            lbl = pe.LabelEncoder()
            df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)  
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
        elif c != 'Text':
            lbl = pe.LabelEncoder()
            df_all[c] = lbl.fit_transform(df_all[c].values)
        if c=='Text': 
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 



100%|██████████| 3273/3273 [00:06<00:00, 521.62it/s]


In [39]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [40]:
df_train['Class'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
df_train.shape

(3321, 3285)

In [42]:
df_test.shape

(5668, 3284)

In [43]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,1,7654,20,2,39672,6105,3213,27,3,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,1,8255,5,1,36691,5783,1680,9,2,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,1,5191,5,1,36691,5783,1672,9,2,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,1,4572,5,1,36238,5625,1668,9,2,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,1,3958,5,1,41308,6248,1666,9,2,4


In [44]:
df_train['hash'].value_counts()

1331621     1
15071974    1
6738695     1
7237415     1
10152704    1
10394366    1
6193913     1
11201272    1
8116978     1
4045550     1
16220905    1
16493288    1
5518070     1
8430345     1
2683619     1
8626913     1
15774431    1
2760339     1
3932889     1
9202392     1
3267286     1
9666727     1
6814418     1
819921      1
6343432     1
1686282     1
2527951     1
5946154     1
4510523     1
13020887    1
           ..
3128716     1
5870940     1
15188709    1
11646296    1
16502147    1
466309      1
12193167    1
3421584     1
7825515     1
10421694    1
16133563    1
138681      1
9188792     1
12369333    1
12158386    1
6274481     1
11380142    1
10807876    1
11066036    1
2114986     1
3593636     1
3089826     1
2591702     1
13516189    1
14368153    1
4920728     1
2641303     1
15458692    1
5123475     1
10160149    1
Name: hash, Length: 3321, dtype: int64

In [45]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
3321,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...,ACSL4 R570S,0,1,1,5,5,...,5,1,6404,5,1,49829,7495,188,11,2
3322,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...,NAGLU P521L,0,1,1,5,5,...,5,1,5005,5,1,31326,4762,5540,11,2
3323,2,PAH,L333F,Vascular endothelial growth factor receptor (V...,PAH L333F,0,1,1,3,5,...,3,1,3915,5,1,75282,11191,6023,9,2
3324,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...,ING1 A148D,0,1,1,4,5,...,4,1,85,5,1,53996,8439,4354,10,2
3325,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...,TMEM216 G77A,0,1,1,7,4,...,7,1,2780,4,1,76967,11226,8211,12,2


In [46]:
feather.write_dataframe(df_train, '../cache/train_stage1_fe.feather')

In [47]:
feather.write_dataframe(df_test, '../cache/test_stage1_fe.feather')