In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

# para evitarmos a exibição dos dados em notacao científica
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from numpy import mean
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [4]:
#Reload dataset cleaned
dataset = pd.read_csv("sb_leuk.csv", index_col=0)
df = pd.DataFrame(dataset)
df.shape

(1332, 14097)

In [5]:
df.head()

Unnamed: 0_level_0,Status,Leukemia,Age_group,SCYL3,C1orf112,FGR,CFH,STPG1,NIPAL3,KDM1A,...,DDX3Y,ZFY,TBL1Y,USP9Y,RPS4Y1,TMSB4Y,RPS4Y2,NLGN4Y,UTY,EIF1AY
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-20-PABHET,KMT2A-WT,AML,pediatric,9.786,4.306,5.928,0.07,3.187,6.529,40.698,...,0.013,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.125
TARGET-20-PABHKY,KMT2A-WT,AML,pediatric,7.87,8.228,18.701,1.093,1.534,7.152,64.805,...,0.005,0.003,0.0,0.001,0.058,0.0,0.0,0.0,0.007,0.177
TARGET-20-PABLDZ,KMT2A-WT,AML,pediatric,3.125,5.045,44.346,2.269,1.538,5.008,35.962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TARGET-20-PACDZR,KMT2A-WT,AML,pediatric,4.207,1.937,8.015,0.537,1.769,7.645,64.097,...,0.01,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.059
TARGET-20-PADDXZ,KMT2A-WT,AML,pediatric,2.184,2.131,8.995,3.976,0.492,1.286,43.094,...,56.868,17.092,0.23,2.892,260.731,1.875,0.04,0.0,7.159,12.708


In [6]:
df.Status.value_counts()

KMT2A-WT    1232
KMT2A-r      100
Name: Status, dtype: int64

In [7]:
df.Leukemia.value_counts()

AML      748
T-ALL    271
B-ALL    242
ALAL      71
Name: Leukemia, dtype: int64

In [8]:
df1 = df.drop(['Leukemia', 'Age_group'],axis = 1)

In [9]:
df1.head()

Unnamed: 0_level_0,Status,SCYL3,C1orf112,FGR,CFH,STPG1,NIPAL3,KDM1A,TTC22,ST7L,...,DDX3Y,ZFY,TBL1Y,USP9Y,RPS4Y1,TMSB4Y,RPS4Y2,NLGN4Y,UTY,EIF1AY
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-20-PABHET,KMT2A-WT,9.786,4.306,5.928,0.07,3.187,6.529,40.698,0.464,3.676,...,0.013,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.125
TARGET-20-PABHKY,KMT2A-WT,7.87,8.228,18.701,1.093,1.534,7.152,64.805,0.496,8.175,...,0.005,0.003,0.0,0.001,0.058,0.0,0.0,0.0,0.007,0.177
TARGET-20-PABLDZ,KMT2A-WT,3.125,5.045,44.346,2.269,1.538,5.008,35.962,0.113,7.624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TARGET-20-PACDZR,KMT2A-WT,4.207,1.937,8.015,0.537,1.769,7.645,64.097,0.065,3.648,...,0.01,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.059
TARGET-20-PADDXZ,KMT2A-WT,2.184,2.131,8.995,3.976,0.492,1.286,43.094,0.085,3.689,...,56.868,17.092,0.23,2.892,260.731,1.875,0.04,0.0,7.159,12.708


# 2) Desenvolvimento dos Modelos

### Defino a classe a ser predita (Y)


In [10]:
def joinCategories(row):
    if row['Status']== 'KMT2A-r'  :
        val = 1
    else:
        val = 0
    return val

In [11]:
df1['Status'] = df1.apply(joinCategories, axis=1)

In [12]:
df1.Status.value_counts()

0    1232
1     100
Name: Status, dtype: int64

In [13]:
df1.head()

Unnamed: 0_level_0,Status,SCYL3,C1orf112,FGR,CFH,STPG1,NIPAL3,KDM1A,TTC22,ST7L,...,DDX3Y,ZFY,TBL1Y,USP9Y,RPS4Y1,TMSB4Y,RPS4Y2,NLGN4Y,UTY,EIF1AY
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-20-PABHET,0,9.786,4.306,5.928,0.07,3.187,6.529,40.698,0.464,3.676,...,0.013,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.125
TARGET-20-PABHKY,0,7.87,8.228,18.701,1.093,1.534,7.152,64.805,0.496,8.175,...,0.005,0.003,0.0,0.001,0.058,0.0,0.0,0.0,0.007,0.177
TARGET-20-PABLDZ,0,3.125,5.045,44.346,2.269,1.538,5.008,35.962,0.113,7.624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TARGET-20-PACDZR,0,4.207,1.937,8.015,0.537,1.769,7.645,64.097,0.065,3.648,...,0.01,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.059
TARGET-20-PADDXZ,0,2.184,2.131,8.995,3.976,0.492,1.286,43.094,0.085,3.689,...,56.868,17.092,0.23,2.892,260.731,1.875,0.04,0.0,7.159,12.708


In [14]:
X=df1.drop(['Status'],axis = 1)
y=df1[['Status']] 

In [15]:
from sklearn.feature_selection import VarianceThreshold
threshold = 0 

selector = VarianceThreshold(threshold)
selector.fit_transform(X)
for i,s in enumerate(selector.get_support()):
  if s:
    print(X.columns[i] + " - keep " + "["+ str(selector.variances_[i]) + "]")
  else:
    print("*** " + X.columns[i] + " - remove " + "["+ str(selector.variances_[i]) + "]")

SCYL3 - keep [7.586023273402314]
C1orf112 - keep [33.62023351237185]
FGR - keep [1815.341559451235]
CFH - keep [211.00464453876498]
STPG1 - keep [1.2805969703144824]
NIPAL3 - keep [22.752079002749415]
KDM1A - keep [143.6234854328836]
TTC22 - keep [0.43857460055662206]
ST7L - keep [9.188663799074796]
DNAJC11 - keep [54.76953172760125]
E2F2 - keep [319.26818265076804]
NADK - keep [216.53598598011354]
CSDE1 - keep [690.7994386732216]
MASP2 - keep [11.130712742183304]
FAM76A - keep [8.498651400324183]
TRAF3IP3 - keep [204.63606997259578]
SPRTN - keep [10.268563749954668]
METTL13 - keep [72.62896283597595]
SCMH1 - keep [121.15441839603078]
TCEB3 - keep [163.96558582672898]
LYPLA2 - keep [94.08282977509673]
CLCN6 - keep [38.85394912569324]
MTMR11 - keep [268.573659193709]
NCDN - keep [32.74445862939728]
RUNX3 - keep [499.0019863794747]
GLRX2 - keep [16.474591682144467]
PLEKHO1 - keep [324.09875409022186]
GCLM - keep [28.142879843920234]
DEPDC1 - keep [12.442458385185219]
KPNA6 - keep [59.672

In [16]:
constant_features = [
    feat for feat in X.columns if X[feat].std() == 0
]
constant_features

[]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [18]:
y_train.Status.value_counts()

0    853
1     79
Name: Status, dtype: int64

In [19]:
y_test.Status.value_counts()

0    379
1     21
Name: Status, dtype: int64

In [20]:
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
X_train = pd.DataFrame(std_scale.fit_transform(X_train), columns = X_train.columns)
X_test  = pd.DataFrame(std_scale.transform(X_test), columns = X_test.columns)

In [21]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# define random forest classifier
forest = RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=5)
forest.fit(X_train, y_train.values.ravel())

boruta = BorutaPy(
   estimator = forest, 
   n_estimators = 'auto',
   verbose=2,
   random_state=42,
   max_iter = 100)

### fit Boruta
boruta.fit(np.array(X_train), np.array(y_train.values.ravel()))

### print results
green_area = X_train.columns[boruta.support_].to_list()
blue_area = X_train.columns[boruta.support_weak_].to_list()

print('features in the green area:', green_area) #Keep
print('features in the blue area:', blue_area)   #Inconclusive 

print('ranking: ', boruta.ranking_)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	14094
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	686
Rejected: 	13408
Iteration: 	9 / 100
Confirmed: 	193
Tentative: 	493
Rejected: 	13408
Iteration: 	10 / 100
Confirmed: 	193
Tentative: 	493
Rejected: 	13408
Iteration: 	11 / 100
Confirmed: 	193
Tentative: 	493
Rejected: 	13408
Iteration: 	12 / 100
Confirmed: 	204
Tentative: 	387
Rejected: 	13503
Iteration: 	13 / 100
Confirmed: 	204
Tentative: 	387
Rejected: 	13503
Iteration: 	14 / 100
Confirmed: 	204
Tentative: 	387
Rejected: 	13503
Iteration: 	15 / 100
Confirmed: 	204
Tentative: 	387
Reject

In [22]:
#green area only
df = df[['Status', 'Leukemia', 'Age_group', 'PQLC2', 'COL9A2', 'AKR7A2', 'PTPRU', 'IL12RB2', 'BMP8B', 'STMN1', 'TXNDC12', 'C1orf54', 'FCGR1A', 'MCOLN2', 'FAM213B', 'CACHD1', 'SV2A', 'WDR63', 'SLC22A15', 'SERINC2', 'LPAR3', 'RNPEP', 'CLCNKA', 'RNF220', 'SPAG6', 'NEBL', 'MLLT10', 'DNTT', 'LZTS2', 'CISD1', 'DNAJC1', 'COMMD3', 'ZEB1', 'IPMK', 'REEP3', 'ZNF503', 'SLC35G1', 'FZD8', 'SKIDA1', 'KIAA1598', 'CASC10', 'DNAJC9', 'TMEM109', 'HIPK3', 'KIAA1549L', 'DPF2', 'ZC3H12C', 'FEZ1', 'TPP1', 'VPS37C', 'PRKCDBP', 'PLA2G16', 'RAB39A', 'CADM1', 'SVIP', 'SLC6A13', 'CD4', 'BCAT1', 'NUAK1', 'CLEC2B', 'LIN7A', 'ACRBP', 'SOCS2', 'BHLHE41', 'ITGA7', 'FGD4', 'CACNA2D4', 'PLBD2', 'CRADD', 'WNT10B', 'POLE', 'SMAD9', 'DACH1', 'PRKCH', 'SLC22A17', 'HIF1A', 'RIPK3', 'PARP2', 'OXA1L', 'NDRG2', 'CDCA4', 'NEDD4', 'CTSH', 'TGM5', 'MYO5C', 'GOLGA8I', 'LIPC', 'SPINT1', 'NIPA1', 'CSPG4', 'THSD4', 'KIF22', 'SYT17', 'ITGAX', 'NUDT7', 'ADCY9', 'LDHD', 'VAT1L', 'MT1F', 'CES1', 'GPR56', 'ALDH3A1', 'SCPEP1', 'C17orf53', 'KAT7', 'SLC47A1', 'CCL23', 'CBX2', 'PPP1R27', 'LRRC37B', 'ZNF532', 'MAPRE2', 'ZNF521', 'PAFAH1B3', 'TNNT1', 'PLD3', 'GRIN2D', 'ZNF85', 'ZFP30', 'ZSCAN18', 'C3', 'IGFLR1', 'TRPM4', 'ZNF331', 'KCNC3', 'ZNF776', 'ZNF256', 'BRSK1', 'FBXO27', 'ZNF91', 'HSD11B1L', 'ZNF571', 'ZNF329', 'LILRB4', 'ZNF681', 'ZNF418', 'ZNF682', 'ZNF772', 'ZNF461', 'ZNF544', 'ZNF43', 'ZNF254', 'ZNF134', 'ZNF844', 'ZNF737', 'ADAM23', 'TANC1', 'SPR', 'TGFBRAP1', 'DTNB', 'MEIS1', 'HNMT', 'AP1S3', 'B3GNT7', 'BRE', 'DAPL1', 'NMUR1', 'WDSUB1', 'MRPL33', 'PHACTR3', 'BFSP1', 'LAMP5', 'BTBD3', 'ZNF512B', 'SLC2A10', 'ZBTB21', 'UMODL1', 'CLTCL1', 'HMGXB4', 'UPK3A', 'CDC42EP1', 'IGF2BP2', 'PLCH1', 'ATG3', 'MBNL1', 'MRAS', 'CLSTN2', 'MAGEF1', 'HTR1F', 'VGLL3', 'PROM1', 'GUCY1B3', 'GLRB', 'TMEM156', 'CPEB2', 'MMRN1', 'OCIAD2', 'STK32B', 'GUCY1A3', 'RHOH', 'SMAD1', 'UNC5C', 'HEXB', 'MEF2C', 'NPR3', 'WWC1', 'MSX2', 'NRG2', 'RHOBTB3', 'IRX1', 'S100Z', 'ANXA2R', 'PRR16', 'BVES', 'WASF1', 'GPR126', 'QKI', 'SENP6', 'RUNX2', 'IL22RA2', 'NKAIN2', 'SUPT3H', 'MYO6', 'CD2AP', 'L3MBTL3', 'PPP1R3G', 'ETV1', 'HOXA9', 'ZC3HAV1', 'HOXA3', 'HOXA5', 'HOXA6', 'ABHD11', 'HOXA7', 'DYNC1I1', 'CLCN1', 'HOXA4', 'HOXA10', 'TRPS1', 'DNAJC5B', 'CPA6', 'FAM110B', 'PENK', 'CLDN23', 'VLDLR', 'GNA14', 'CDK20', 'PPAPDC3', 'PSIP1', 'FBP1', 'PBX3', 'GAS1', 'TDRD7', 'TKTL1', 'NXT2', 'FAM127A', 'CCNB3', 'RAB39B', 'KCNE1L', 'SAGE1']]
df.head()

Unnamed: 0_level_0,Status,Leukemia,Age_group,PQLC2,COL9A2,AKR7A2,PTPRU,IL12RB2,BMP8B,STMN1,...,PBX3,GAS1,TDRD7,TKTL1,NXT2,FAM127A,CCNB3,RAB39B,KCNE1L,SAGE1
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-20-PABHET,KMT2A-WT,AML,pediatric,11.026,23.265,6.362,0.169,0.627,2.334,123.779,...,6.632,0.072,18.485,0.762,16.824,8.576,0.044,3.244,7.811,0.0
TARGET-20-PABHKY,KMT2A-WT,AML,pediatric,7.953,47.429,17.014,0.236,1.153,2.059,66.008,...,7.3,0.079,17.879,1.607,10.848,5.108,0.186,2.101,24.633,0.0
TARGET-20-PABLDZ,KMT2A-WT,AML,pediatric,8.799,6.63,44.009,0.077,0.029,0.246,187.226,...,11.419,0.14,6.316,0.055,70.217,7.595,0.092,0.481,0.143,12.506
TARGET-20-PACDZR,KMT2A-WT,AML,pediatric,9.322,15.78,27.391,0.1,0.256,1.246,186.547,...,3.193,3.208,6.189,0.617,9.23,0.467,0.026,1.522,0.138,0.0
TARGET-20-PADDXZ,KMT2A-WT,AML,pediatric,6.879,30.013,16.351,0.169,0.024,1.062,91.189,...,12.377,0.008,7.966,0.99,4.378,0.264,0.082,0.538,2.031,0.0


In [23]:
df.to_csv("leukemia247.csv")