### 2. Data Preprocessing and Feature Engineering

In [None]:
#!pip install py7zr

In [None]:
#import nltk
#nltk.download('stopwords')

In [1]:
import numpy as np
import pandas as pd
import os
import py7zr
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD

In [2]:
import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so this also hides deprecation warnings.
warnings.filterwarnings('ignore')

#### Load MSKCC cancer treatment dataset [Kaggle 2017]

In [3]:
# List every file shipped with the dataset, one path per line.
data_root = 'msk-redefining-cancer-treatment'
dataset_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk(data_root)
    for name in names
]
for path in dataset_paths:
    print(path)

msk-redefining-cancer-treatment/stage_2_private_solution.csv.7z
msk-redefining-cancer-treatment/stage2_test_text.csv.7z
msk-redefining-cancer-treatment/stage2_test_variants.csv.7z
msk-redefining-cancer-treatment/stage1_solution_filtered.csv.7z
msk-redefining-cancer-treatment/test_variants.zip
msk-redefining-cancer-treatment/training_text.zip
msk-redefining-cancer-treatment/test_text.zip
msk-redefining-cancer-treatment/stage2_sample_submission.csv.7z
msk-redefining-cancer-treatment/training_variants.zip


#### Load data

In [5]:
# The *_text files separate ID and Text with a literal "||". The separator is a
# regular expression, so the pipes are escaped and the pattern is written as a
# raw string (a plain '\|' is an invalid escape sequence in a normal literal).
# skiprows=1 drops the file's own header line; header=None + names supplies the
# column names explicitly.
df_train = pd.read_csv(
    'msk-redefining-cancer-treatment/training_text.zip',
    engine='python', sep=r'\|\|', header=None, skiprows=1, names=["ID", "Text"],
).set_index('ID')
df_train2 = pd.read_csv('msk-redefining-cancer-treatment/training_variants.zip').set_index('ID')

df_test = pd.read_csv(
    'msk-redefining-cancer-treatment/test_text.zip',
    engine='python', sep=r'\|\|', header=None, skiprows=1, names=["ID", "Text"],
).set_index('ID')
df_test2 = pd.read_csv('msk-redefining-cancer-treatment/test_variants.zip').set_index('ID')

df_train

Unnamed: 0_level_0,Text
ID,Unnamed: 1_level_1
0,Cyclin-dependent kinases (CDKs) regulate a var...
1,Abstract Background Non-small cell lung canc...
2,Abstract Background Non-small cell lung canc...
3,Recent evidence has demonstrated that acquired...
4,Oncogenic mutations in the monomeric Casitas B...
...,...
3316,Introduction Myelodysplastic syndromes (MDS) ...
3317,Introduction Myelodysplastic syndromes (MDS) ...
3318,The Runt-related transcription factor 1 gene (...
3319,The RUNX1/AML1 gene is the most frequent targe...


In [6]:
df_train2

Unnamed: 0_level_0,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,FAM58A,Truncating Mutations,1
1,CBL,W802*,2
2,CBL,Q249E,2
3,CBL,N454D,3
4,CBL,L399V,4
...,...,...,...
3316,RUNX1,D171N,4
3317,RUNX1,A122*,1
3318,RUNX1,Fusions,1
3319,RUNX1,R80C,4


#### Merge data: Gene, Variation, Class and Text

In [7]:
# Attach the article text to each variant row, matching on ID.
# Missing values are replaced with empty strings.
train = df_train2.merge(df_train, how='inner', on='ID').fillna('')
test = df_test2.merge(df_test, how='inner', on='ID').fillna('')
train

Unnamed: 0_level_0,Gene,Variation,Class,Text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...
...,...,...,...,...
3316,RUNX1,D171N,4,Introduction Myelodysplastic syndromes (MDS) ...
3317,RUNX1,A122*,1,Introduction Myelodysplastic syndromes (MDS) ...
3318,RUNX1,Fusions,1,The Runt-related transcription factor 1 gene (...
3319,RUNX1,R80C,4,The RUNX1/AML1 gene is the most frequent targe...


In [8]:
test

Unnamed: 0_level_0,Gene,Variation,Text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...
...,...,...,...
5663,SLC46A1,R113S,The realization in the late 1970s that RAS har...
5664,FOXC1,L130F,Hemizygous deletions are common molecular abno...
5665,GSS,R267W,All most R267W of has with to SMARTpool invest...
5666,CTSK,G79E,Abstract Blood samples from 125 unrelated fami...


#### Load stage 2 test data and clean text (NLP)

In [10]:
# Unpack the stage 2 test archives into the current working directory.
for archive_name in ('stage2_test_text.csv.7z', 'stage2_test_variants.csv.7z'):
    archive_path = 'msk-redefining-cancer-treatment/' + archive_name
    with py7zr.SevenZipFile(archive_path, mode='r') as archive:
        archive.extractall()

In [11]:
# Replace the stage 1 test frames with the stage 2 files extracted above.
# The separator is a regex for a literal "||", written as a raw string so the
# backslashes are not treated as (invalid) string escape sequences.
df_test = pd.read_csv(
    './stage2_test_text.csv',
    engine='python', sep=r'\|\|', header=None, skiprows=1, names=["ID", "Text"],
).set_index('ID')
df_test2 = pd.read_csv('./stage2_test_variants.csv').set_index('ID')
test = pd.merge(df_test2, df_test, how='inner', on='ID').fillna('')
test

Unnamed: 0_level_0,Gene,Variation,Text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,CHEK2,H371Y,The incidence of breast cancer is increasing i...
2,AXIN2,Truncating Mutations,An unselected series of 310 colorectal carcino...
3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
4,SUCLA2,G118R,Regulated progression through the cell cycle ...
5,BRAF,T599insTT,Pilocytic astrocytoma (PA) is emerging as a tu...
...,...,...,...
982,TP63,S580P,IκB kinase β (IKKβ) is involved in tumor devel...
983,SCN4A,R672G,he identification of subtype-specific transloc...
984,BRAF,N581H,Cardio-facio-cutaneous (CFC) syndrome (MIM 115...
985,TSHR,S281N,Screening for tumor suppressor genes in breast...


In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
np.array(stopwords.words('english'))

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

In [14]:
# NLTK's English stop-word list, held as a set for fast membership checks.
stop_words = set(stopwords.words('english')) 

In [15]:
def preprocessing(text: str) -> str:
    """Lower-case ``text`` and remove ASCII punctuation.

    Every character in ``string.punctuation`` is deleted outright (not
    replaced by a space), so hyphenated words are fused, e.g.
    ``"cyclin-dependent"`` becomes ``"cyclindependent"``. Whitespace is left
    untouched and stop words are not removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lower-cased text without punctuation characters.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_table)

In [16]:
train['Text'].loc[0][:1000]

'Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. Previous work has shown that CDK10 silencing increases ETS2 (v-ets erythroblastosis virus E26 oncogene homolog 2)-driven activation of the MAPK pathway, which confers tamoxifen resistance to breast cancer cells. The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10. Cyclin M silencing phenocopies CDK1

In [17]:
preprocessing(train['Text'].loc[0])[:1000]

'cyclindependent kinases cdks regulate a variety of fundamental cellular processes cdk10 stands out as one of the last orphan cdks for which no activating cyclin has been identified and no kinase activity revealed previous work has shown that cdk10 silencing increases ets2 vets erythroblastosis virus e26 oncogene homolog 2driven activation of the mapk pathway which confers tamoxifen resistance to breast cancer cells the precise mechanisms by which cdk10 modulates ets2 activity and more generally the functions of cdk10 remain elusive here we demonstrate that cdk10 is a cyclindependent kinase by identifying cyclin m as an activating cyclin cyclin m an orphan cyclin is the product of fam58a whose mutations cause star syndrome a human developmental anomaly whose features include toe syndactyly telecanthus and anogenital and renal malformations we show that star syndromeassociated cyclin m mutants are unable to interact with cdk10 cyclin m silencing phenocopies cdk10 silencing in increasing

#### TF-IDF (term frequency–inverse document frequency)

In [19]:
# TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), with the vocabulary
# capped at 500 features; min_df=1 keeps terms seen in at least one document.
tfidf = TfidfVectorizer(min_df=1, ngram_range=(1, 2), max_features=500)

In [20]:
# Fit the vectorizer on the training text only, then reuse it for the test text.
# Both results are converted to dense arrays.
# NOTE(review): the raw `Text` column is vectorized here; the `preprocessing`
# helper defined earlier is never applied -- confirm that this is intended.
text_train = tfidf.fit_transform(train['Text'].values).toarray()
text_test = tfidf.transform(test['Text'].values).toarray()

In [21]:
text_train

array([[0.03222031, 0.00399878, 0.00711412, ..., 0.00644296, 0.00608174,
        0.02743203],
       [0.03167719, 0.01757558, 0.00781705, ..., 0.00707957, 0.00891022,
        0.01644138],
       [0.03167719, 0.01757558, 0.00781705, ..., 0.00707957, 0.00891022,
        0.01644138],
       ...,
       [0.01836215, 0.00865975, 0.02310948, ..., 0.00697642, 0.0065853 ,
        0.        ],
       [0.01577341, 0.        , 0.0070898 , ..., 0.00285375, 0.01077503,
        0.00331373],
       [0.01203382, 0.00128983, 0.01721026, ..., 0.0055419 , 0.00915461,
        0.00321759]])

In [22]:
# Wrap the dense TF-IDF arrays in DataFrames that reuse the index of the frame
# each array was computed from.
train2 = pd.DataFrame(text_train, index=train.index)  
test2 = pd.DataFrame(text_test, index=test.index)

#### SVD - dimensionality reduction

In [23]:
n_components = 70
component_names = [f'component №{i}' for i in range(1, n_components + 1)]

# Fit the SVD on the training TF-IDF features only and project the test
# features with the same fitted transform.
svd_truncated = TruncatedSVD(n_components=n_components, n_iter=40, random_state=42)
truncated_train = pd.DataFrame(
    svd_truncated.fit_transform(train2),
    columns=component_names,
)
truncated_test = pd.DataFrame(
    svd_truncated.transform(test2),
    columns=component_names,
)

# Both frames keep the default 0..n-1 row index; the ID index of train/test is
# deliberately not copied over here.

truncated_train.head()

Unnamed: 0,component №1,component №2,component №3,component №4,component №5,component №6,component №7,component №8,component №9,component №10,...,component №61,component №62,component №63,component №64,component №65,component №66,component №67,component №68,component №69,component №70
0,0.798709,-0.052967,-0.00911,-0.048373,-0.021367,0.018489,-0.042512,-0.012434,-0.093022,0.023448,...,0.018818,-0.024228,-0.016149,0.016322,0.042955,0.003837,-0.018348,0.010838,-0.014163,-0.001972
1,0.922718,-0.107192,-0.067519,0.096123,0.095367,-0.048336,0.065608,-0.025439,-0.037674,0.025788,...,-0.032124,-0.026275,0.011463,-0.019337,0.000811,-0.000102,0.035715,-0.01675,-0.039072,-0.046151
2,0.922718,-0.107192,-0.067519,0.096123,0.095367,-0.048336,0.065608,-0.025439,-0.037674,0.025788,...,-0.032124,-0.026275,0.011463,-0.019337,0.000811,-0.000102,0.035715,-0.01675,-0.039072,-0.046151
3,0.919961,-0.078922,-0.050858,0.058433,0.001345,0.022259,-0.076216,-0.006971,-0.020254,0.036479,...,0.057067,-0.034414,0.030904,-0.032662,0.044584,0.00445,-0.007508,0.024411,-0.044299,-0.02342
4,0.934481,-0.027762,0.01716,0.046482,0.069179,-0.08012,0.031766,0.041763,-0.042904,-0.008979,...,-0.043995,-0.024464,-0.024551,-0.024677,0.037975,-0.008402,-0.012181,0.019736,-0.011271,0.036321


In [24]:
truncated_test.head()

Unnamed: 0,component №1,component №2,component №3,component №4,component №5,component №6,component №7,component №8,component №9,component №10,...,component №61,component №62,component №63,component №64,component №65,component №66,component №67,component №68,component №69,component №70
0,0.838542,0.014929,-0.060907,0.034164,-0.010582,-0.00485,0.000705,-0.034006,0.034505,0.004785,...,0.051223,-0.007877,0.098975,-0.004006,-0.026758,0.040917,-0.00132,0.02979,-0.025627,-0.059728
1,0.95554,-0.060327,0.049973,0.038471,-0.000872,-0.013533,-0.045491,0.013191,-0.023428,0.017536,...,0.007183,0.009029,-0.005982,-0.033475,-0.010884,0.030223,-0.001515,0.015433,0.00811,0.009957
2,0.960923,-0.045953,-0.037609,0.01807,0.003177,0.002015,-0.049345,0.02854,-0.051081,0.061721,...,-0.012245,-0.032944,-0.034175,0.004805,0.029055,-0.03821,-0.007114,-0.008025,0.037869,0.015962
3,0.871541,-0.046223,-0.006951,-0.12817,-0.098044,-0.029713,0.03388,-0.119566,0.073448,-0.048588,...,-0.01202,0.021971,-0.010027,0.004155,-0.027371,0.003023,-0.031493,-0.02509,-0.016807,0.047101
4,0.873243,-0.063641,0.016393,-0.049943,-0.04424,-0.04371,-0.064435,0.027757,-0.224386,-0.092244,...,0.019562,-0.030203,0.006132,-0.002582,0.002216,0.005077,0.002767,-0.022842,0.017765,-0.006686


In [25]:
# Stack train on top of test so both get the same dummy columns, one-hot encode
# Gene and Variation (dropping the first level of each), then discard the raw text.
stacked = pd.concat([train, test]).reset_index(drop=True)
all_data = (
    pd.get_dummies(stacked, columns=['Gene', 'Variation'], drop_first=True)
    .drop(columns='Text')
)
all_data.head()

Unnamed: 0,Class,Gene_ABCC6,Gene_ABL1,Gene_ACVR1,Gene_ADAMTS13,Gene_ADGRG1,Gene_AGO2,Gene_AGXT,Gene_AKAP9,Gene_AKT1,...,Variation_YAP1-TFE3 Fusion,Variation_YWHAE-ROS1 Fusion,Variation_ZC3H7B-BCOR Fusion,Variation_ZNF198-FGFR1 Fusion,Variation_null1313Y,Variation_null189Y,Variation_null262Q,Variation_null267R,Variation_null399R,Variation_p61BRAF
0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Split the combined frame back into its train and test parts by position.
# `all_data` was rebuilt with a fresh 0..N-1 index and holds the training rows
# first, so slicing on the training row count is correct regardless of the
# original ID values. A label lookup with the old ID index only works when
# the IDs happen to equal the row positions.
n_train = len(train)
assert len(truncated_train) == n_train, "SVD train features do not match the training rows"
assert len(truncated_test) == len(all_data) - n_train, "SVD test features do not match the test rows"

# rename_axis keeps the index name of the incoming train frame.
train = all_data.iloc[:n_train].rename_axis(train.index.name)
test = all_data.iloc[n_train:]

# Row labels of the test part; the SVD test features get the same labels so
# the join below lines them up row by row.
ind = list(test.index)
truncated_test.index = ind

# truncated_train still has its default 0..n_train-1 index, which matches the
# labels of the first n_train rows of all_data.
train = train.join(truncated_train)
test = test.join(truncated_test)

train.shape, test.shape

((3321, 4391), (986, 4391))

In [27]:
train.head()

Unnamed: 0_level_0,Class,Gene_ABCC6,Gene_ABL1,Gene_ACVR1,Gene_ADAMTS13,Gene_ADGRG1,Gene_AGO2,Gene_AGXT,Gene_AKAP9,Gene_AKT1,...,component №61,component №62,component №63,component №64,component №65,component №66,component №67,component №68,component №69,component №70
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0,0,0,0,0,0,0,0,0,...,0.018818,-0.024228,-0.016149,0.016322,0.042955,0.003837,-0.018348,0.010838,-0.014163,-0.001972
1,2.0,0,0,0,0,0,0,0,0,0,...,-0.032124,-0.026275,0.011463,-0.019337,0.000811,-0.000102,0.035715,-0.01675,-0.039072,-0.046151
2,2.0,0,0,0,0,0,0,0,0,0,...,-0.032124,-0.026275,0.011463,-0.019337,0.000811,-0.000102,0.035715,-0.01675,-0.039072,-0.046151
3,3.0,0,0,0,0,0,0,0,0,0,...,0.057067,-0.034414,0.030904,-0.032662,0.044584,0.00445,-0.007508,0.024411,-0.044299,-0.02342
4,4.0,0,0,0,0,0,0,0,0,0,...,-0.043995,-0.024464,-0.024551,-0.024677,0.037975,-0.008402,-0.012181,0.019736,-0.011271,0.036321


In [28]:
test.head()

Unnamed: 0,Class,Gene_ABCC6,Gene_ABL1,Gene_ACVR1,Gene_ADAMTS13,Gene_ADGRG1,Gene_AGO2,Gene_AGXT,Gene_AKAP9,Gene_AKT1,...,component №61,component №62,component №63,component №64,component №65,component №66,component №67,component №68,component №69,component №70
3321,,0,0,0,0,0,0,0,0,0,...,0.051223,-0.007877,0.098975,-0.004006,-0.026758,0.040917,-0.00132,0.02979,-0.025627,-0.059728
3322,,0,0,0,0,0,0,0,0,0,...,0.007183,0.009029,-0.005982,-0.033475,-0.010884,0.030223,-0.001515,0.015433,0.00811,0.009957
3323,,0,0,0,0,0,0,0,0,0,...,-0.012245,-0.032944,-0.034175,0.004805,0.029055,-0.03821,-0.007114,-0.008025,0.037869,0.015962
3324,,0,0,0,0,0,0,0,0,0,...,-0.01202,0.021971,-0.010027,0.004155,-0.027371,0.003023,-0.031493,-0.02509,-0.016807,0.047101
3325,,0,0,0,0,0,0,0,0,0,...,0.019562,-0.030203,0.006132,-0.002582,0.002216,0.005077,0.002767,-0.022842,0.017765,-0.006686


#### Export data – ready for model training

In [29]:
# Feature matrix: every column except the label.
X = train.drop(columns='Class')
# Shift the labels down by one so they are zero-based
# (the prediction target range is 0..8).
class_labels = train['Class'].values
y_array = class_labels - 1

In [30]:
# Single-column frame holding the zero-based labels.
y = pd.DataFrame({'Class': y_array})

In [31]:
# Write the training features and labels to the working directory.
# index=False leaves the row index out of the files.
X.to_csv('X.csv', index=False)
y.to_csv('y.csv', index=False)

In [32]:
# Test features: same columns as X; the Class column of the test rows is dropped.
X_test = test.drop(columns='Class')

In [33]:
# Write the test features to the working directory without the row index.
X_test.to_csv('X_test.csv', index=False)

In [34]:
X.columns

Index(['Gene_ABCC6', 'Gene_ABL1', 'Gene_ACVR1', 'Gene_ADAMTS13', 'Gene_ADGRG1',
       'Gene_AGO2', 'Gene_AGXT', 'Gene_AKAP9', 'Gene_AKT1', 'Gene_AKT2',
       ...
       'component №61', 'component №62', 'component №63', 'component №64',
       'component №65', 'component №66', 'component №67', 'component №68',
       'component №69', 'component №70'],
      dtype='object', length=4390)