In [1]:
import pandas as pd
import numpy as np
import feather
from sklearn import preprocessing as pe
from tqdm import tqdm

from utils import text_to_wordlist, get_fp
from sklearn.model_selection import train_test_split

In [2]:
def get_wt(cls):
    tot_neg_instances = df_train.shape[0] - p[cls]
    tot_pos_instances = p[cls]
    return float(tot_neg_instances/tot_pos_instances)

In [3]:
D = 2 ** 24

In [4]:
def do_feature_engg(df_all):
    
    # create a new col GeneVar
    df_all['GeneVar'] = df_all['Gene'] + ' ' + df_all['Variation']
    
    # get the share of the gene expression in the corresponding text in the row
    df_all['Gene_Share'] = df_all.apply(lambda row: sum([1 for w in row['Gene'].split(' ') 
                                                       if w in row['Text'].split(' ')]), axis=1)


    # get the share of the variation in the corresponding text in the row
    df_all['Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['Variation'].split(' ') 
                                                            if w in row['Text'].split(' ')]), axis=1)

    # get the share of the gene-variation in the corresponding text in the row
    df_all['Gene_Variation_Share'] = df_all.apply(lambda row: sum([1 for w in row['GeneVar'].split(' ') 
                                                            if w in row['Text'].split(' ')]), axis=1)

    # get the gene length
    df_all['GL'] = df_all['Gene'].apply(lambda x: len(x))

    # get the variation length
    df_all['VL'] = df_all['Variation'].apply(lambda x: len(x))

    # print(sorted(df_all['VL'].values))

    max_len=55 # VL has max length 55 
    for i in range(max_len+1):
        df_all['Gene_'+str(i)] = df_all['Gene'].map(lambda x: str(x[i]) if len(x)>i else '')
        df_all['Variation_'+str(i)] = df_all['Variation'].map(lambda x: str(x[i]) if len(x)>i else '')
        df_all['GeneVar_'+str(i)] = df_all['GeneVar'].map(lambda x: str(x[i]) if len(x)>i else '')

    # from https://www.kaggle.com/the1owl/redefining-treatment-0-57456
#     gene_var_lst = sorted(list(df_train.Gene.unique()) + list(df_train.Variation.unique()))
    gene_var_lst = sorted(list(df_all.Gene.unique()) + list(df_all.Variation.unique()))
    gene_var_lst = [x for x in gene_var_lst if len(x.split(' '))==1]

    i_ = 0

    for el in tqdm(gene_var_lst):
        df_all['GV_'+str(el)] = df_all['Text'].map(lambda x: str(x).count(str(el)))
        i_ += 1
    
    # gene expression value counts
    s = df_all['Gene'].value_counts()
    df_all['G_VC'] = df_all['Gene'].apply(lambda gene: s[str(gene)])
    # print(df_all.G_VC[df_all.Gene == 'BRCA1'].count())

    # variation value counts
    s = df_all['Variation'].value_counts()
    df_all['V_VC'] = df_all['Variation'].apply(lambda var: s[str(var)])

    # gene variation value counts
    s = df_all['GeneVar'].value_counts()
    df_all['GV_VC'] = df_all['GeneVar'].apply(lambda var: s[str(var)])

    # hash gene variation and text
    df_all['hash'] = df_all['Gene'] + '_' + df_all['Variation'] + '_' + df_all['Text']
    df_all['hash'] = df_all['hash'].apply(lambda x: hash(x) % D)

    # label encoding, length of column and number of words in the column
    for c in tqdm(df_all.columns):
        if df_all[c].dtype == 'object':
            if c in ['Gene','Variation','GeneVar']:
                lbl = pe.LabelEncoder()
                df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)  
                df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
                df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
            elif c != 'Text':
                lbl = pe.LabelEncoder()
                df_all[c] = lbl.fit_transform(df_all[c].values)
            if c=='Text': 
                df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
                df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 

    return df_all


In [5]:
df_train = feather.read_dataframe('../cache/train_stage2.feather')

In [6]:
df_train.head()

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [7]:
df_test = feather.read_dataframe('../cache/test_stage2.feather')

In [8]:
df_test.head()

Unnamed: 0,ID,Gene,Variation,Text
0,1,CHEK2,H371Y,The incidence of breast cancer is increasing i...
1,2,AXIN2,Truncating Mutations,An unselected series of 310 colorectal carcino...
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
4,5,BRAF,T599insTT,Pilocytic astrocytoma (PA) is emerging as a tu...


In [9]:
y = df_train['Class'].values
df_train = df_train.drop(['Class'], axis=1)

test_ids = df_test['ID'].values

In [10]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [11]:
df_all.head()

Unnamed: 0,Gene,ID,Text,Variation
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [12]:
df_all.shape

(4675, 4)

In [13]:
df_all = do_feature_engg(df_all)

100%|██████████| 4141/4141 [08:05<00:00,  8.53it/s]
100%|██████████| 4323/4323 [00:03<00:00, 1254.73it/s]


In [14]:
df_train = df_all.iloc[:len(df_train)]
df_test = df_all.iloc[len(df_train):]

In [15]:
df_train['Class'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
df_train.shape

(3689, 4335)

In [17]:
df_test.shape

(986, 4334)

In [18]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,1,41308,6248,1854,5,1,927,9,2,4


In [19]:
df_train['hash'].value_counts()

1331621     1
7282969     1
9202392     1
3267286     1
9666727     1
6814418     1
819921      1
12552912    1
2527951     1
7240398     1
5086951     1
3502795     1
9524488     1
8667849     1
13005511    1
15358662    1
111299      1
13212354    1
13564606    1
15065789    1
11491229    1
5554869     1
11662003    1
10748593    1
7850119     1
11756205    1
8179693     1
3932889     1
16409309    1
16317094    1
           ..
11363650    1
2692468     1
7855477     1
9405814     1
5123475     1
10807876    1
11066036    1
2114986     1
3593636     1
3089826     1
2591702     1
13516189    1
14368153    1
4920728     1
2641303     1
15458692    1
3421584     1
9720221     1
12193167    1
7619982     1
10757514    1
466309      1
7067445     1
16502147    1
2073986     1
6462849     1
9821565     1
839035      1
6499813     1
16441348    1
Name: hash, Length: 3689, dtype: int64

In [20]:
df_test.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_len,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words
3689,CHEK2,1,The incidence of breast cancer is increasing i...,H371Y,CHEK2 H371Y,1,1,2,5,5,...,5,1,33403,4991,1379,5,1,1080,11,2
3690,AXIN2,2,An unselected series of 310 colorectal carcino...,Truncating Mutations,AXIN2 Truncating Mutations,1,1,2,5,20,...,5,1,66400,10348,3460,20,2,286,26,3
3691,WNT4,3,Mycosis fungoides and Sézary syndrome are prim...,E216G,WNT4 E216G,0,1,1,4,5,...,4,1,58544,8638,672,5,1,4287,10,2
3692,SUCLA2,4,Regulated progression through the cell cycle ...,G118R,SUCLA2 G118R,0,1,1,6,5,...,6,1,42023,6221,1038,5,1,3841,12,2
3693,BRAF,5,Pilocytic astrocytoma (PA) is emerging as a tu...,T599insTT,BRAF T599insTT,1,0,1,4,9,...,4,1,22499,3280,3402,9,1,430,14,2


In [21]:
feather.write_dataframe(df_train, '../cache/train_stage2_fe.feather')

In [22]:
feather.write_dataframe(df_test, '../cache/test_stage2_fe.feather')

In [23]:
df_train['Text'] = [text_to_wordlist(w) for w in df_train['Text'].values]
df_test['Text'] = [text_to_wordlist(w) for w in df_test['Text'].values]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [24]:
test_id = df_test.ID.values
df = pd.DataFrame()
df['ID'] = test_id
df.to_csv('../cache/stage2_test_id.csv', index=False)

In [25]:
y = df_train.Class.values
df = pd.DataFrame()
df['y'] = y
df.to_csv('../cache/stage2_labels.csv', index=False)

In [26]:
df_train = df_train.drop('Class',axis=1)

In [29]:
fp = 0
fp = get_fp(fp)

In [30]:
df_train = fp.fit_transform(df_train)
print (df_train.shape)



(3689, 4689)


In [31]:
df_test = fp.fit_transform(df_test)
print (df_test.shape)



(986, 4689)


In [32]:
np.save('../cache/train_stage2_fe2', df_train)
np.save('../cache/test_stage2_fe2', df_test)

In [33]:
# now claculate weights

In [34]:
df_train = feather.read_dataframe('../cache/train_stage2_fe.feather')

In [35]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Gene_words,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,1,39672,6105,3460,20,2,1654,27,3,1
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,1,36691,5783,3748,5,1,941,9,2,2
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,1,36691,5783,2425,5,1,933,9,2,2
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,1,36238,5625,2132,5,1,929,9,2,3
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,1,41308,6248,1854,5,1,927,9,2,4


In [36]:
df_train.shape

(3689, 4335)

In [37]:
pd.value_counts(df_train['Class'])

7    1054
4     751
1     662
2     498
6     297
5     267
3      96
9      43
8      21
Name: Class, dtype: int64

In [38]:
p =pd.value_counts(df_train['Class'])

In [39]:
print(p[7])

1054


In [40]:
print(p[8])

21


In [41]:
df_train['wt'] =  df_train['Class'].map(lambda s: get_wt(s))

In [42]:
df_train.head()

Unnamed: 0,Gene,ID,Text,Variation,GeneVar,Gene_Share,Variation_Share,Gene_Variation_Share,GL,VL,...,Text_len,Text_words,Variation_lbl_enc,Variation_len,Variation_words,GeneVar_lbl_enc,GeneVar_len,GeneVar_words,Class,wt
0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A Truncating Mutations,1,1,2,6,20,...,39672,6105,3460,20,2,1654,27,3,1,4.572508
1,CBL,1,Abstract Background Non-small cell lung canc...,W802*,CBL W802*,1,1,2,3,5,...,36691,5783,3748,5,1,941,9,2,2,6.407631
2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,CBL Q249E,1,1,2,3,5,...,36691,5783,2425,5,1,933,9,2,2,6.407631
3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,CBL N454D,1,1,2,3,5,...,36238,5625,2132,5,1,929,9,2,3,37.427083
4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL L399V,1,1,2,3,5,...,41308,6248,1854,5,1,927,9,2,4,3.912117


In [43]:
wt = df_train.wt.values
df = pd.DataFrame()
df['wt'] = wt
df.to_csv('../cache/stage2_weights.csv', index=False)

In [44]:
np.save('../cache/stage2_train_weights', wt)

In [45]:
my_wt = {}
n_class = 9

In [46]:
for cls in range(n_class):
    my_wt[cls+1] = get_wt(cls+1)

In [47]:
my_wt

{1: 4.572507552870091,
 2: 6.407630522088353,
 3: 37.427083333333336,
 4: 3.912117177097204,
 5: 12.816479400749063,
 6: 11.42087542087542,
 7: 2.5,
 8: 174.66666666666666,
 9: 84.79069767441861}

In [48]:
np.save('../cache/stage2_train_weights_per_class', my_wt)

In [None]:
# now create validation data to be used by all stage2 classifers

In [3]:
df_train = np.load('../cache/train_stage2_fe2.npy')
df1 = pd.read_csv('../cache/stage2_labels.csv')
y = df1['y'].values

In [5]:
x1,x2,y1,y2 = train_test_split(df_train, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
np.save('../cache/train_stage2_x1', x1)
np.save('../cache/train_stage2_x2', x2)
np.save('../cache/train_stage2_y1', y1)
np.save('../cache/train_stage2_y2', y2)

In [18]:
p =pd.value_counts(y1)
df_train = pd.DataFrame(x1)
my_wt = {}
n_class = 9

for cls in range(n_class):
    my_wt[cls+1] = get_wt(cls+1)
print(my_wt)

np.save('../cache/stage2_x1_weights_per_class', my_wt)

{1: 4.578449905482041, 2: 6.414572864321608, 3: 37.324675324675326, 4: 3.9101497504159735, 5: 12.789719626168225, 6: 11.399159663865547, 7: 2.5005931198102016, 8: 172.58823529411765, 9: 85.79411764705883}


In [20]:
df_train = pd.DataFrame(x2)
p =pd.value_counts(y2)

my_wt = {}
n_class = 9

for cls in range(n_class):
    my_wt[cls+1] = get_wt(cls+1)
print(my_wt)

{1: 4.548872180451128, 2: 6.38, 3: 37.8421052631579, 4: 3.92, 5: 12.924528301886792, 6: 11.508474576271187, 7: 2.4976303317535544, 8: 183.5, 9: 81.0}


In [21]:
np.save('../cache/stage2_x2_weights_per_class', my_wt)