In [1]:
# some parts have been taken from https://www.kaggle.com/the1owl/redefining-treatment-0-57456

In [1]:
import pandas as pd
import numpy as np
import feather
from sklearn import preprocessing as pe
from tqdm import tqdm

from utils import text_to_wordlist, get_fp

In [3]:
def get_wt(cls):
    tot_neg_instances = df_train.shape[0] - p[cls]
    tot_pos_instances = p[cls]
    return float(tot_neg_instances/tot_pos_instances)

In [None]:
D = 2 ** 24
def do_feature_engg(df_all):
    
    # create a new col GeneVar
    df_all['GeneVar'] = df_all['Gene'] + ' ' + df_all['Variation']
    
    # get the share of the gene expression in the corresponding text in the row
    df_all['Gene_Share'] = df_all.apply(lambda row: sum([1 for w in row['Gene'].split(' ') 
                                                       if w in row['Text'].split(' ')]), axis=1)


    # get the share of the variation in the corresponding text in the row
    df_all['Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['Variation'].split(' ') 
                                                            if w in row['Text'].split(' ')]), axis=1)

    # get the share of the gene-variation in the corresponding text in the row
    df_all['Gene_Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['GeneVar'].split(' ') 
                                                            if w in row['Text'].split(' ')]), axis=1)

    # get the gene length
    df_all['GL'] = df_all['Gene'].apply(lambda x: len(x))

    # get the variation length
    df_all['VL'] = df_all['Variation'].apply(lambda x: len(x))

    # print(sorted(df_all['VL'].values))

    max_len=55 # VL has max length 55 
    for i in range(max_len+1):
        df_all['Gene_'+str(i)] = df_all['Gene'].map(lambda x: str(x[i]) if len(x)>i else '')
        df_all['Variation_'+str(i)] = df_all['Variation'].map(lambda x: str(x[i]) if len(x)>i else '')
        df_all['GeneVar_'+str(i)] = df_all['GeneVar'].map(lambda x: str(x[i]) if len(x)>i else '')

    # from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
#     gene_var_lst = sorted(list(df_train.Gene.unique()) + list(df_train.Variation.unique()))
    gene_var_lst = sorted(list(df_all.Gene.unique()) + list(df_all.Variation.unique()))
    gene_var_lst = [x for x in gene_var_lst if len(x.split(' '))==1]

    i_ = 0

    for el in tqdm(gene_var_lst):
        df_all['GV_'+str(el)] = df_all['Text'].map(lambda x: str(x).count(str(el)))
        i_ += 1
    
    # gene expression value counts
    s = df_all['Gene'].value_counts()
    df_all['G_VC'] = df_all['Gene'].apply(lambda gene: s[str(gene)])
    # print(df_all.G_VC[df_all.Gene == 'BRCA1'].count())

    # variation value counts
    s = df_all['Variation'].value_counts()
    df_all['V_VC'] = df_all['Variation'].apply(lambda var: s[str(var)])

    # gene variation value counts
    s = df_all['GeneVar'].value_counts()
    df_all['GV_VC'] = df_all['GeneVar'].apply(lambda var: s[str(var)])

    # hash gene variation and text
    df_all['hash'] = df_all['Gene'] + '_' + df_all['Variation'] + '_' + df_all['Text']
    df_all['hash'] = df_all['hash'].map(lambda x: hash(x) % D)

    # label encoding, length of column and number of words in the column
    for c in tqdm(df_all.columns):
        if df_all[c].dtype == 'object':
            if c in ['Gene','Variation','GeneVar']:
                lbl = pe.LabelEncoder()
                df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)  
                df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
                df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
            elif c != 'Text':
                lbl = pe.LabelEncoder()
                df_all[c] = lbl.fit_transform(df_all[c].values)
            if c=='Text': 
                df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
                df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 

    return df_all


In [4]:
df_train = feather.read_dataframe('../cache/train_stage1.feather')

In [5]:
df_train.shape

(3321, 5)

In [6]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
df_test = feather.read_dataframe('../cache/test_stage1.feather')

In [8]:
df_test.shape

(5668, 4)

In [9]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [10]:
y = df_train['Class'].values
df_train = df_train.drop(['Class'], axis=1)

test_ids = df_test['ID'].values

In [11]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [12]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...


In [13]:
df_all.shape

(8989, 4)

In [14]:
D = 2 ** 24

In [16]:
df_all = do_feature_engg(df_all)

100%|██████████| 9935/9935 [33:38<00:00,  4.71it/s]
100%|██████████| 10117/10117 [00:06<00:00, 1569.88it/s]


In [17]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [18]:
df_train['Class'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


100%|██████████| 3273/3273 [00:06<00:00, 521.62it/s]


In [19]:
df_train.shape

(3321, 10129)

In [20]:
df_test.shape

(5668, 10128)

In [21]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
3321,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...,ACSL4 R570S,0,1,1,5,5,...,5,1,6404,5,1,49829,7495,188,11,2
3322,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...,NAGLU P521L,0,1,1,5,5,...,5,1,5005,5,1,31326,4762,5540,11,2
3323,2,PAH,L333F,Vascular endothelial growth factor receptor (V...,PAH L333F,0,1,1,3,5,...,3,1,3915,5,1,75282,11191,6023,9,2
3324,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...,ING1 A148D,0,1,1,4,5,...,4,1,85,5,1,53996,8439,4354,10,2
3325,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...,TMEM216 G77A,0,1,1,7,4,...,7,1,2780,4,1,76967,11226,8211,12,2


In [22]:
feather.write_dataframe(df_train, '../cache/train_stage1_fe.feather')

In [43]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,1,7654,20,2,39672,6105,3213,27,3,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,1,8255,5,1,36691,5783,1680,9,2,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,1,5191,5,1,36691,5783,1672,9,2,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,1,4572,5,1,36238,5625,1668,9,2,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,1,3958,5,1,41308,6248,1666,9,2,4


In [44]:
df_train['hash'].value_counts()

1331621     1
15071974    1
6738695     1
7237415     1
10152704    1
10394366    1
6193913     1
11201272    1
8116978     1
4045550     1
16220905    1
16493288    1
5518070     1
8430345     1
2683619     1
8626913     1
15774431    1
2760339     1
3932889     1
9202392     1
3267286     1
9666727     1
6814418     1
819921      1
6343432     1
1686282     1
2527951     1
5946154     1
4510523     1
13020887    1
           ..
3128716     1
5870940     1
15188709    1
11646296    1
16502147    1
466309      1
12193167    1
3421584     1
7825515     1
10421694    1
16133563    1
138681      1
9188792     1
12369333    1
12158386    1
6274481     1
11380142    1
10807876    1
11066036    1
2114986     1
3593636     1
3089826     1
2591702     1
13516189    1
14368153    1
4920728     1
2641303     1
15458692    1
5123475     1
10160149    1
Name: hash, Length: 3321, dtype: int64

In [23]:
feather.write_dataframe(df_test, '../cache/test_stage1_fe.feather')

In [24]:
df_train['Text'] = [text_to_wordlist(w) for w in df_train['Text'].values]
df_test['Text'] = [text_to_wordlist(w) for w in df_test['Text'].values]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [25]:
test_id = df_test.ID.values
df = pd.DataFrame()
df['ID'] = test_id
df.to_csv('../cache/stage1_test_id.csv', index=False)

In [26]:
y = df_train.Class.values
df = pd.DataFrame()
df['y'] = y
df.to_csv('../cache/stage1_labels.csv', index=False)

In [27]:
p =pd.value_counts(df_train['Class'])

In [28]:
df_train['wt'] =  df_train['Class'].map(lambda s: get_wt(s))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
wt = df_train.wt.values
df = pd.DataFrame()
df['wt'] = wt
df.to_csv('../cache/stage1_weights.csv', index=False)
np.save('../cache/stage1_train_weights', wt)

In [30]:
my_wt = {}
n_class = 9
for cls in range(n_class):
    my_wt[cls+1] = get_wt(cls+1)
np.save('../cache/stage1_train_weights_per_class', my_wt)

In [31]:
fp = 0
fp = get_fp(fp)

In [32]:
df_train = df_train.drop('Class',axis=1)
df_train = df_train.drop('wt',axis=1)

In [33]:
df_train = fp.fit_transform(df_train)
print (df_train.shape)



(3321, 10483)


In [34]:
df_test = fp.fit_transform(df_test)
print (df_test.shape)



(5668, 10483)


In [35]:
np.save('../cache/train_stage1_fe2', df_train)
np.save('../cache/test_stage1_fe2', df_test)

In [57]:
df_train = feather.read_dataframe('../cache/train_stage1_fe.feather')
df_test = feather.read_dataframe('../cache/test_stage1_fe.feather')

In [58]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Text,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Variation_lbl_enc,Variation_len,Variation_words,Text_len,Text_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A Truncating Mutations,1,1,2,6,20,...,1,7654,20,2,39672,6105,3213,27,3,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,CBL W802*,1,1,2,3,5,...,1,8255,5,1,36691,5783,1680,9,2,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,CBL Q249E,1,1,2,3,5,...,1,5191,5,1,36691,5783,1672,9,2,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,CBL N454D,1,1,2,3,5,...,1,4572,5,1,36238,5625,1668,9,2,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,CBL L399V,1,1,2,3,5,...,1,3958,5,1,41308,6248,1666,9,2,4


In [51]:
print(df_train.shape, df_test.shape)

(3321, 10129) (5668, 10128)


In [52]:
df_stage1_partial_soln = pd.read_csv('../data/stage1_solution_filtered.csv')
cols = []
for cls in range(9):
    cols.append('class' + str(cls+1))
df_stage1_partial_soln['Class'] = df_stage1_partial_soln[cols].idxmax(axis=1).map(lambda x: int(x[-1]))

partial_id = df_stage1_partial_soln['ID'].values
df_partial_test = df_test[df_test.ID.isin(partial_id)].reset_index()

In [53]:
df_stage1_partial_soln.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9,Class
0,12,1,0,0,0,0,0,0,0,0,1
1,19,0,1,0,0,0,0,0,0,0,2
2,21,0,1,0,0,0,0,0,0,0,2
3,55,0,0,0,1,0,0,0,0,0,4
4,64,0,0,0,1,0,0,0,0,0,4


In [54]:
df_all_p = pd.concat((df_train, df_partial_test), axis=0, ignore_index=True)

In [40]:
df_all_p = do_feature_engg(df_all_p)

100%|██████████| 3397/3397 [05:10<00:00, 12.12it/s]
100%|██████████| 10130/10130 [00:02<00:00, 3747.16it/s]


In [41]:
df_train2 = df_all_p.iloc[:len(df_train2)]
df_test2 = df_all_p.iloc[len(df_train2):]

In [42]:
feather.write_dataframe(df_train2, '../cache/train_p_stage1_fe.feather')
feather.write_dataframe(df_test2, '../cache/test_p_stage1_fe.feather')

In [59]:
df_train2 = feather.read_dataframe('../cache/train_p_stage1_fe.feather')
df_test2 = feather.read_dataframe('../cache/test_p_stage1_fe.feather')

In [60]:
print(df_train2.shape, df_test2.shape)

(3321, 10130) (368, 10130)


In [61]:
set(df_train2.columns) - set(df_test2.columns)

set()

In [62]:
set(df_test2.columns) - set(df_train2.columns)

set()

In [63]:
df_train2 = df_train2.drop('Class',axis=1)
df_test2 = df_test2.drop('Class',axis=1)

In [64]:
df_train2 = df_train2.drop('index',axis=1)
df_test2 = df_test2.drop('index',axis=1)

In [65]:
df_train2.head()

Unnamed: 0,GL,GV_1_2009trunc,GV_2010_2471trunc,GV_256_286trunc,GV_385_418del,GV_422_605trunc,GV_533_534del,GV_534_536del,GV_550_592del,GV_560_561insER,...,Variation_55,Variation_6,Variation_7,Variation_8,Variation_9,Variation_Share,Variation_lbl_enc,Variation_len,Variation_words,hash
0,6,0,0,0,0,0,0,0,0,0,...,0,47,40,43,32,1,2904,20,2,16184684
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,3154,5,1,15320816
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2095,5,1,4516901
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1842,5,1,12999102
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1599,5,1,6288701


In [66]:
df_train2['Text'] = [text_to_wordlist(w) for w in df_train2['Text'].values]
df_test2['Text'] = [text_to_wordlist(w) for w in df_test2['Text'].values]

test_id2 = df_test2.ID.values
df2 = pd.DataFrame()
df2['ID'] = test_id2
df2.to_csv('../cache/stage1_p_test_id.csv', index=False)



In [67]:
fp = 0
fp = get_fp(fp)

df_train2 = fp.fit_transform(df_train2)
print (df_train2.shape)

df_test2 = fp.fit_transform(df_test2)
print (df_test2.shape)

tr = df_train2
# te = df_test2

(3321, 10233)


(368, 10233)


In [74]:
print(np.isnan(df_train2).any())
print(np.isnan(df_train2).any())

False
False


In [75]:
print(np.argwhere(np.isnan(df_train2)))

[]


In [76]:
df_train2.shape

(3321, 10233)

In [77]:
df_test2.shape

(368, 10233)

In [71]:
# df_train2[:,~np.all(np.isnan(df_train2), axis=0)].shape

In [72]:
# df_test2[:,~np.all(np.isnan(df_test2), axis=0)].shape

In [78]:
np.save('../cache/train_p_stage1_fe2', df_train2)
np.save('../cache/test_p_stage1_fe2', df_test2)

In [69]:
df_stage1_partial_soln.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9,Class
0,12,1,0,0,0,0,0,0,0,0,1
1,19,0,1,0,0,0,0,0,0,0,2
2,21,0,1,0,0,0,0,0,0,0,2
3,55,0,0,0,1,0,0,0,0,0,4
4,64,0,0,0,1,0,0,0,0,0,4


In [71]:
y1 = df_stage1_partial_soln.Class.values
df2 = pd.DataFrame()
df2['y'] = y1
df2.to_csv('../cache/stage1_p_test_labels.csv', index=False)