In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
plt.style.use('seaborn-dark')
plt.style.use('seaborn-poster')
import warnings
warnings.filterwarnings('ignore')

  plt.style.use('seaborn-dark')
  plt.style.use('seaborn-poster')


Let's begin by reading the expression matrix:

In [2]:
expression_matrix = pd.read_csv("../R_code/datasets/exp_matrix_cancer.csv")
expression_matrix.drop(['Unnamed: 0'], axis=1, inplace=True)
expression_matrix.head(5)

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,type,diagnosis,sample,source
0,9.846184,8.148296,6.422754,7.357572,3.041305,6.878362,4.963093,4.408153,8.889147,3.269462,...,6.280482,5.055927,4.977227,2.984777,3.515967,3.300484,Basal,Breast cancer,GSM1116084,Human Basal Tumor Sample
1,9.860176,8.250261,7.097815,7.692816,3.169582,7.559728,5.095169,4.579266,7.81668,3.574471,...,6.762712,5.284717,5.410978,2.93841,3.685822,3.06185,Basal,Breast cancer,GSM1116085,Human Basal Tumor Sample
2,10.302871,8.298517,7.358199,7.697099,3.094102,7.520407,5.037056,4.436952,8.717094,3.714278,...,8.161149,6.135511,6.866922,2.971874,3.559325,3.140796,Basal,Breast cancer,GSM1116085,Human Basal Tumor Sample
3,10.268402,7.590026,6.915557,7.125229,3.394204,7.438678,5.02154,4.76927,7.791342,3.718379,...,6.918603,5.491241,5.567928,2.842062,3.690852,3.243485,Her2,Breast cancer,GSM1116086,Human Her2 Tumor Sample
4,10.113765,8.961569,5.771326,7.695747,3.120667,6.598305,4.77991,4.409205,9.429646,3.555235,...,6.360527,5.167412,5.235059,2.845738,3.540291,3.161066,Basal,Breast cancer,GSM1116087,Human Basal Tumor Sample


And then also the annotation matrix:

In [3]:
annotation = pd.read_csv("../R_code/datasets/annotation.csv", encoding='utf-8')
annotation.head(5)

Unnamed: 0,ensembl_gene_id,affy_hg_u133_plus_2,hgnc_symbol,description,refseq_mrna,refseq_ncrna
0,ENSG00000198888,1553551_s_at,MT-ND1,mitochondrially encoded NADH:ubiquinone oxidor...,,
1,ENSG00000210100,1553551_s_at,MT-TI,mitochondrially encoded tRNA-Ile (AUU/C) [Sour...,,
2,ENSG00000210112,1553551_s_at,MT-TM,mitochondrially encoded tRNA-Met (AUA/G) [Sour...,,
3,ENSG00000198763,1553551_s_at,MT-ND2,mitochondrially encoded NADH:ubiquinone oxidor...,,
4,ENSG00000198804,1553538_s_at,MT-CO1,mitochondrially encoded cytochrome c oxidase I...,,


We filter the expression matrix with only the genes of interest:

In [4]:
genes_of_interest = ["SNORA1", "SNORA12", "SNORA14B", "SNORA16A", "SNORA21", "SNORA23", "SNORA24", "SNORA32", "SNORA38", "SNORA44", "SNORA48", "SNORA49", "SNORA52", "SNORA53", "SNORA57", "SNORA61", "SNORA63", "SNORA64", "SNORA65", "SNORA70", "SNORA71C", "SNORA73A", "SNORA73B", "SNORA75", "SNORA78", "SNORA8", "SNORD10", "SNORD102", "SNORD104", "SNORD105", "SNORD105B", "SNORD108", "SNORD110", "SNORD111B", "SNORD113-3", "SNORD113-4", "SNORD114-1", "SNORD114-13", "SNORD114-14", "SNORD114-19", "SNORD114-20", "SNORD114-21", "SNORD115-23", "SNORD115-32", "SNORD116-13", "SNORD119", "SNORD12", "SNORD12B", "SNORD13", "SNORD14A", "SNORD14C", "SNORD14D", "SNORD15B", "SNORD16", "SNORD17", "SNORD18A", "SNORD1B", "SNORD20", "SNORD22", "SNORD26", "SNORD28", "SNORD29", "SNORD32A", "SNORD33", "SNORD34", "SNORD35A", "SNORD36A", "SNORD36B", "SNORD38A", "SNORD3A", "SNORD3D", "SNORD41", "SNORD42A", "SNORD42B", "SNORD44", "SNORD45A", "SNORD47", "SNORD49B", "SNORD4B", "SNORD5", "SNORD50A",
                     "SNORD52", "SNORD53", "SNORD54", "SNORD55", "SNORD56", "SNORD57", "SNORD58A", "SNORD58C", "SNORD59A", "SNORD60", "SNORD63", "SNORD64", "SNORD65", "SNORD68", "SNORD69", "SNORD71", "SNORD74", "SNORD76", "SNORD8", "SNORD81", "SNORD82", "SNORD84", "SNORD86", "SNORD87", "SNORD89", "SNORD94", "SNORD96A", "SNORD97", "SNORD99", "TMX1", "ZFAS1", "CDKN2B-AS1", "CWF19L1", "EIF4A1", "EP400", "HIF1A-AS2", "MEG8", "NOP56", "RACK1", "RPL13A", "RPS13", "SCARNA12", "SNHG12", "SNHG20", "SNHG5", "SNHG6", "SNHG7", "SNHG8", "AP1G1", "CFDP1", "CHD8", "DDX39B", "EEF2", "EIF4A2", "EIF4G2", "GAS5", "GNL3", "HSPA8", "HSPA9", "IPO7", "MYRIP", "NAN", "NCL", "NFATC3", "PCAT4", "PPAN", "PRKAA1", "PRRC2A", "PTCD3", "RABGGTB", "RNF149", "RPL10", "RPL12", "RPL13", "RPL17", "RPL21", "RPL23", "RPL23A", "RPL4", "RPL7A", "RPLP2", "RPS11", "RPS2", "RPS20", "RPS3", "RPS8", "SF3B3", "SLC25A3", "SNHG1", "SNHG3", "SNHG9", "SNORD1C", "SNORD35B", "SNORD37", "SNRPB", "SNX5", "TAF1D", "TNPO2", "TOMM20", "WDR43"]
ids_of_interest = annotation[annotation['hgnc_symbol'].isin(genes_of_interest)]
ids_of_interest = ids_of_interest['affy_hg_u133_plus_2'].unique()

expression_matrix_filtered = expression_matrix[ids_of_interest]

In [5]:
expression_matrix_filtered["type"] = expression_matrix["type"]
expression_matrix_filtered.head(5)

Unnamed: 0,1555996_s_at,1554591_at,1558619_at,1552729_at,1555177_at,1558954_at,1557964_at,1566340_at,1560023_x_at,1560021_at,...,237945_at,236588_at,239266_at,235536_at,242856_at,241840_at,244669_at,241448_at,241892_at,type
0,8.572623,4.541497,5.549578,5.998607,3.454522,6.546487,5.903515,5.190982,3.840987,3.523424,...,3.44466,6.765612,6.872891,6.743925,3.710025,4.330195,7.111778,5.895615,4.197069,Basal
1,8.98906,4.545225,5.882288,5.765459,3.203928,6.134798,5.979569,5.899465,4.12179,3.568873,...,3.791548,5.347673,7.274763,6.194496,4.029982,4.115736,7.673906,6.654325,4.032583,Basal
2,8.22257,4.817119,6.005374,6.608301,3.146863,6.373683,6.03411,5.686189,3.97806,3.531802,...,3.465757,5.412203,6.461432,5.795265,3.597531,4.434329,7.363095,6.600451,4.249542,Basal
3,8.116295,4.599514,5.843843,5.708726,3.707219,6.368689,5.885922,5.504504,3.620644,3.707258,...,4.225795,5.007519,6.990521,7.133525,5.197916,4.675879,6.957246,5.727644,4.121604,Her2
4,9.313135,4.68162,5.593466,5.363386,3.405048,6.162777,5.792534,5.310325,4.010464,3.74155,...,3.763927,6.163665,7.647568,6.627222,3.446529,4.364641,8.031034,5.336647,4.380835,Basal


In [8]:
exp_filtered_2 = expression_matrix[["1566749_at", "1553266_at", "237520_x_at", "1561177_at", "1560540_x_at", "243520_x_at", "240463_at", "1563055_at", "1558708_at", "1564272_a_at", "1561343_a_at", "1559612_at", "1562480_at", "1566202_at", "1561564_at", "234755_x_at", "1567386_at", "233822_x_at", "231717_s_at", "1568366_at", "234601_x_at", "1560383_at", "1560516_at", "1561331_at", "1552974_at", "1560884_at", "1560528_at", "1559842_at", "1561025_at", "220837_at", "1553963_at", "1569810_at", "216974_at", "204704_s_at", "217306_at", "1565755_at", "1553607_at", "242296_x_at", "1556889_s_at", "1561230_at", "241647_x_at", "234686_at", "1552661_at", "1569610_at", "220656_at", "1568925_at", "1561030_at", "221438_s_at", "1562628_at", "1561156_at", "1555939_at", "1560202_at", "1557566_at", "1554920_at", "1570115_at", "1562998_at", "1552675_at", "241200_x_at", "240619_at", "1570452_at", "1561775_at", "241760_x_at", "1560458_s_at", "1555357_at", "241674_s_at", "1567696_at", "244283_x_at", "1568673_s_at", "1563040_s_at", "1552975_x_at", "1558893_a_at", "1556282_at", "211093_at", "233847_x_at", "1561998_at", "1553383_at", "1557017_at", "1552858_at", "91580_at", "1563465_at", "214899_at", "234833_at", "1561959_x_at", "1561765_at", "242623_x_at", "1552904_at", "242955_x_at", "1563660_at", "234706_x_at", "1567287_at", "1559275_x_at", "1569663_at", "1564631_at", "1564662_at", "1568926_x_at", "243644_at", "237477_at", "233683_at", "1569981_at", "1558793_at", "1563254_a_at", "1561021_at", "1556592_at", "1567182_at", "1553441_at", "241796_x_at", "1569406_at", "1569873_at", "1553362_at", "207607_at", "1559342_a_at", "1558857_at", "216136_at", "1552372_at", "1563101_at", "241082_at", "238391_at", "1568603_at", "1559621_at", "1570244_at", "1558519_at", "240586_at", "217694_at", "1553075_a_at", "1559274_at", "1564128_at", "1554298_a_at", "208192_at", "1556912_at", "240093_x_at", "1554125_a_at", "1553868_a_at", "1569719_at", "1561110_at", "1564950_at", "1554142_at", "1569494_at", "1562607_at", "1562431_x_at", "1567173_at", "1557871_at", "241587_at", "215901_at", "1555558_at", "1560885_x_at", "1567702_at", "1563186_at", "1563351_at", "1562223_at", "1560791_at", "233662_at", "1566865_at", "244277_at", "1567656_at", "1567387_at", "217479_at", "1557692_a_at", "1556786_at", "1562582_at", "1560980_a_at", "1566930_at", "242164_s_at", "1553651_at", "234022_at", "237788_at", "232638_at", "1561855_x_at", "244039_x_at", "1565820_x_at", "1560992_at", "234662_at", "215803_at", "1553519_at", "1556497_a_at", "1561151_a_at", "1563133_at", "1557890_at", "1566439_at", "1560950_at", "237998_at", "1569929_at", "240713_s_at", "1569168_at", "1565657_at", "1567379_at", "type"]]
exp_filtered_2.head(5)

Unnamed: 0,1566749_at,1553266_at,237520_x_at,1561177_at,1560540_x_at,243520_x_at,240463_at,1563055_at,1558708_at,1564272_a_at,...,1557890_at,1566439_at,1560950_at,237998_at,1569929_at,240713_s_at,1569168_at,1565657_at,1567379_at,has_cancer
0,2.559442,2.726784,2.58563,2.597794,2.520453,2.559858,2.491014,2.685144,2.496566,2.416953,...,2.87799,2.427743,2.714462,2.851813,2.62301,2.564426,2.722807,2.68937,2.723331,0
1,2.685335,2.932854,2.58617,2.792947,2.571018,2.562747,2.500382,2.838283,2.430896,2.723997,...,2.929473,3.06997,2.628544,3.079739,2.790806,3.050911,2.498076,2.635403,2.560476,0
2,2.608813,3.242867,2.788735,2.909878,2.836168,2.513369,2.576594,2.637237,2.413338,2.575517,...,2.928679,2.970361,2.944181,2.921951,3.008012,2.565859,3.111847,2.923108,2.586501,0
3,2.644717,2.613194,2.728292,2.720819,2.730368,3.024656,2.783768,2.82401,2.714252,2.543371,...,3.285727,2.845168,2.694164,3.19376,3.111656,2.6345,2.763755,2.758103,2.984773,0
4,2.528156,2.83711,2.51685,2.470301,2.50562,2.572569,2.426244,2.759874,2.697897,2.578138,...,3.231105,2.531692,2.627596,2.887021,2.973099,2.693099,3.116446,2.489445,2.951391,0


In [8]:
sample_names = expression_matrix['sample_names']
#expression_matrix.drop(['sample_names'], axis=1, inplace=True)

Güt. Now we can start doing some ML. First, we fit a Multinomial Naive Bayes:

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(
    expression_matrix_filtered.drop(['has_cancer'], axis=1), expression_matrix_filtered['has_cancer'], test_size=0.2, random_state=226506)

tuned_nb = GridSearchCV(MultinomialNB(), {}, refit=True, cv=10, verbose=4)
tuned_nb.fit(X_train, Y_train)

preds_mnb = tuned_nb.predict(X_test)

preds_mnb = preds_mnb.astype(int)
Y_test = Y_test.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_mnb)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_mnb)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_mnb)}")
print(f"F1: {metrics.f1_score(Y_test, preds_mnb)}")
print(metrics.classification_report(Y_test, preds_mnb))


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END .................................., score=0.800 total time=   0.0s
[CV 2/10] END .................................., score=0.960 total time=   0.0s
[CV 3/10] END .................................., score=0.840 total time=   0.0s
[CV 4/10] END .................................., score=0.920 total time=   0.0s
[CV 5/10] END .................................., score=0.880 total time=   0.0s
[CV 6/10] END .................................., score=0.958 total time=   0.0s
[CV 7/10] END .................................., score=1.000 total time=   0.0s
[CV 8/10] END .................................., score=0.875 total time=   0.0s
[CV 9/10] END .................................., score=0.875 total time=   0.0s
[CV 10/10] END ................................., score=0.917 total time=   0.0s
Accuracy: 0.8548387096774194
Precision: 0.8979591836734694
Recall: 0.9166666666666666
F1: 0.9072164948453607
              precis

Decent performance, but let's check if an SVM can do better:

In [6]:
tune_grid = {
    'gamma': [0.00001, 0.0001, 0.001, 0.1, 1, 'scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3, 4],
    'C': [0.1, 1, 10, 100]
}

tuned_svm = GridSearchCV(SVC(random_state=226506), tune_grid, refit=True, cv=10, verbose=4)
tuned_svm.fit(X_train, Y_train)

preds_svm = tuned_svm.predict(X_test)

preds_svm = preds_svm.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_svm)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_svm)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_svm)}")
print(f"F1: {metrics.f1_score(Y_test, preds_svm)}")
print(metrics.classification_report(Y_test, preds_svm))


TypeError: '<' not supported between instances of 'float' and 'str'

Much better. We can also check what are the best parameters:

In [11]:
print(tuned_svm.best_estimator_)

SVC(C=0.1, degree=1, gamma=1e-05, kernel='linear', random_state=226506)


Now we can try to fit a ridge regression:

In [90]:
tune_grid = {
    'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 1, 10, 100],
    'penalty': ['l2']
}

tuned_ridge = GridSearchCV(LogisticRegression(max_iter=1000), tune_grid, cv=10, verbose=4)

tuned_ridge.fit(X_train, Y_train)

preds_ridge = tuned_ridge.predict(X_test)

preds_ridge = preds_ridge.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_ridge)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_ridge)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_ridge)}")
print(f"F1: {metrics.f1_score(Y_test, preds_ridge)}")
print(metrics.classification_report(Y_test, preds_ridge))

Fitting 10 folds for each of 23 candidates, totalling 230 fits
[CV 1/10] END ...............C=0.01, penalty=l2;, score=0.920 total time=   0.0s
[CV 2/10] END ...............C=0.01, penalty=l2;, score=0.920 total time=   0.0s
[CV 3/10] END ...............C=0.01, penalty=l2;, score=0.960 total time=   0.0s
[CV 4/10] END ...............C=0.01, penalty=l2;, score=0.960 total time=   0.0s
[CV 5/10] END ...............C=0.01, penalty=l2;, score=0.960 total time=   0.0s
[CV 6/10] END ...............C=0.01, penalty=l2;, score=0.958 total time=   0.0s
[CV 7/10] END ...............C=0.01, penalty=l2;, score=1.000 total time=   0.0s
[CV 8/10] END ...............C=0.01, penalty=l2;, score=0.917 total time=   0.0s
[CV 9/10] END ...............C=0.01, penalty=l2;, score=1.000 total time=   0.0s
[CV 10/10] END ..............C=0.01, penalty=l2;, score=0.958 total time=   0.0s
[CV 1/10] END ...............C=0.02, penalty=l2;, score=0.920 total time=   0.0s
[CV 2/10] END ...............C=0.02, penalty=l

It also gives us very good results. Let's check the parameters:

In [92]:
print(tuned_ridge.best_estimator_)

LogisticRegression(C=10, max_iter=1000)


Let's also check the coefficients:

In [105]:
ridge = LogisticRegression(C=10, penalty='l2', max_iter=1000)

ridge.fit(X_train, Y_train)
print(ridge.coef_)

[[ 0.05438692  0.08170905  0.04891421 -0.49478602 -0.04299994  0.00377561
  -0.3985913   0.10359035 -0.53726929 -0.41060791  0.14682986 -0.06087096
  -0.0659303   0.15681673 -0.07257855 -0.29644671 -0.05060175 -0.00371523
   0.33665877  0.14137493  0.05607593 -0.67116759 -0.32813481 -0.32123246
  -0.15666301 -0.35089702 -0.02097143  0.19845362 -0.70320283  0.17206215
  -0.37050835 -0.1629628  -0.00849114 -0.09558741  0.01726735 -0.0280372
  -0.14580425 -0.53323225  0.48242882 -0.1909962  -0.32154261 -0.2479101
  -0.00932726  0.11493101  0.22670531  0.32042563  0.41636539  0.30316587
   0.02871651  0.09003023 -0.51075295  0.33629659  0.05476497  0.16801595
   0.80984145  0.80028437  0.17649455 -0.07957763 -0.12684077 -0.03803103
  -0.20800291  0.0028514  -0.04489775 -0.48601379 -0.03316757  0.07545592
  -0.07954453 -0.02874781  0.31192839 -0.89762969 -0.20999263  0.42145435
  -0.13523484 -0.13469599  0.19743305  0.15478522 -0.07429535 -0.06117743
  -0.32581825  1.1433579   0.05793961  0

A lasso regression can also be fit to see if there is a comparable difference with the ridge:

In [25]:
tune_grid = {
    #'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 1, 10, 100],
    'C': [0.01, 0.2, 1, 5, 10, 20, 50, 80, 100],
    'penalty': ['l1']
}

tuned_lasso = GridSearchCV(LogisticRegression(max_iter=10000, solver='saga'), tune_grid, cv=10, verbose=4)

tuned_lasso.fit(X_train, Y_train)

preds_lasso = tuned_lasso.predict(X_test)

preds_lasso = preds_lasso.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_lasso)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_lasso)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_lasso)}")
print(f"F1: {metrics.f1_score(Y_test, preds_lasso)}")
print(metrics.classification_report(Y_test, preds_lasso))

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV 1/10] END ...............C=0.01, penalty=l1;, score=0.800 total time=   1.0s
[CV 2/10] END ...............C=0.01, penalty=l1;, score=0.800 total time=   1.0s
[CV 3/10] END ...............C=0.01, penalty=l1;, score=0.800 total time=   1.0s
[CV 4/10] END ...............C=0.01, penalty=l1;, score=0.800 total time=   1.0s
[CV 5/10] END ...............C=0.01, penalty=l1;, score=0.800 total time=   1.1s
[CV 6/10] END ...............C=0.01, penalty=l1;, score=0.833 total time=   1.0s
[CV 7/10] END ...............C=0.01, penalty=l1;, score=0.833 total time=   1.0s
[CV 8/10] END ...............C=0.01, penalty=l1;, score=0.833 total time=   1.0s
[CV 9/10] END ...............C=0.01, penalty=l1;, score=0.833 total time=   1.0s
[CV 10/10] END ..............C=0.01, penalty=l1;, score=0.792 total time=   1.2s
[CV 1/10] END ................C=0.2, penalty=l1;, score=0.920 total time=   1.6s
[CV 2/10] END ................C=0.2, penalty=l1;

KeyboardInterrupt: 

Same exact performance. Let's see the parameters and coefficients:

In [11]:
print(tuned_lasso.best_estimator_)

LogisticRegression(C=10, max_iter=10000, penalty='l1', solver='saga')


In [12]:
lasso = LogisticRegression(C=10, penalty='l1', solver='saga', max_iter=10000)

lasso.fit(X_train, Y_train)
print(lasso.coef_)

[[ 8.20202969e-02  3.05768050e-02  0.00000000e+00 -1.72353760e-01
   0.00000000e+00  0.00000000e+00 -7.89849392e-02  9.58305692e-02
  -2.71920425e-01 -1.95240956e-01  6.32143674e-02 -1.19937099e-02
  -1.95753372e-02  8.12473534e-02 -3.71721874e-02 -1.53307923e-01
   0.00000000e+00  0.00000000e+00  1.39358371e-01  4.53490793e-02
   1.87897231e-02 -5.00171166e-01 -1.35236012e-01 -2.43697667e-01
  -3.47479008e-03 -9.70692127e-02  0.00000000e+00  8.30219821e-02
  -4.01956593e-01  1.26230685e-01 -1.42852645e-01 -4.28983234e-02
   0.00000000e+00 -1.97661395e-02  0.00000000e+00 -2.02111010e-02
  -9.55662722e-02 -2.52673846e-01  3.58693174e-01 -2.03202180e-01
  -2.56211499e-01 -1.29882452e-01 -5.59847015e-02  1.14260375e-01
   2.38212529e-01  1.96580738e-01  2.26893793e-01  1.93756594e-01
   0.00000000e+00  0.00000000e+00 -2.38267622e-01  2.09244053e-01
   7.00023464e-02  2.11828786e-01  4.80387212e-01  6.04780726e-01
   9.68710563e-02  4.49774610e-02 -2.58827384e-04  6.21936456e-03
  -1.42042

In [18]:
gene_names = []
for col in X_test.columns:
    gene_names.append(annotation[annotation['affy_hg_u133_plus_2'] == col].iloc[0]['hgnc_symbol'])

coefficients = pd.concat([pd.DataFrame(X_test.columns),pd.DataFrame(np.transpose(lasso.coef_)), pd.DataFrame(gene_names)], axis = 1)
coefficients

Unnamed: 0,0,0.1,0.2
0,1555996_s_at,0.082020,EIF4A2
1,1554591_at,0.030577,PCAT4
2,1558619_at,0.000000,SNHG7
3,1552729_at,-0.172354,SNORA17B
4,1555177_at,0.000000,PRKAA1
...,...,...,...
180,242856_at,-0.146354,MEG8
181,241840_at,-0.156390,HSPA9
182,244669_at,-0.041213,SNHG5
183,241448_at,0.000000,TOMM20


In [21]:
annotation[annotation['affy_hg_u133_plus_2'] == '200858_s_at']

Unnamed: 0,ensembl_gene_id,affy_hg_u133_plus_2,hgnc_symbol,description,refseq_mrna,refseq_ncrna
22281,ENSG00000240376,200858_s_at,,ribosomal protein S8 (RPS8) pseudogene,,
37187,ENSG00000142937,200858_s_at,RPS8,ribosomal protein S8 [Source:HGNC Symbol;Acc:H...,,
37188,ENSG00000142937,200858_s_at,RPS8,ribosomal protein S8 [Source:HGNC Symbol;Acc:H...,NM_001012,
37207,ENSG00000200913,200858_s_at,SNORD46,"small nucleolar RNA, C/D box 46 [Source:HGNC S...",,NR_000024
37208,ENSG00000202031,200858_s_at,SNORD38A,"small nucleolar RNA, C/D box 38A [Source:HGNC ...",,NR_001456


In [20]:
coefficients.to_csv('../R_code/lasso_coeff.csv')

The coefficients are different, but the performance remains the same and the ridge is also quicker to fit.

Lastly, we try a RandomForest:

In [23]:
tune_grid = {
    'bootstrap': [True],
    'criterion': ['gini'],
    'max_features': ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000],
    'n_jobs': [-1]
}

tuned_rf = GridSearchCV(RandomForestClassifier(
    random_state=226506), tune_grid, refit=True, cv=10, verbose=4)
tuned_rf.fit(X_train, Y_train)

preds_rf = tuned_rf.predict(X_test)

preds_rf = preds_rf.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_rf)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_rf)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_rf)}")
print(f"F1: {metrics.f1_score(Y_test, preds_rf)}")
print(metrics.classification_report(Y_test, preds_rf))


Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV 1/10] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100, n_jobs=-1;, score=0.880 total time=   0.1s
[CV 2/10] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100, n_jobs=-1;, score=0.920 total time=   0.1s
[CV 3/10] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100, n_jobs=-1;, score=0.960 total time=   0.1s
[CV 4/10] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100, n_jobs=-1;, score=0.960 total time=   0.1s
[CV 5/10] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100, n_jobs=-1;, score=0.920 total time=   0.1s
[CV 6/10] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split

In [24]:
print(metrics.confusion_matrix(Y_test, preds_rf))

[[ 9  5]
 [ 0 48]]


Testing with only least variant genes:

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(
    exp_filtered_2.drop(['has_cancer'], axis=1), exp_filtered_2['has_cancer'], test_size=0.2, random_state=226506)

tuned_nb = GridSearchCV(MultinomialNB(), {}, refit=True, cv=10, verbose=2)
tuned_nb.fit(X_train, Y_train)

preds_mnb = tuned_nb.predict(X_test)

preds_mnb = preds_mnb.astype(int)
Y_test = Y_test.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_mnb)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_mnb)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_mnb)}")
print(f"F1: {metrics.f1_score(Y_test, preds_mnb)}")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
Accuracy: 0.7741935483870968
Precision: 0.7741935483870968
Recall: 1.0
F1: 0.8727272727272727


In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(
    exp_filtered_2.drop(['has_cancer'], axis=1), exp_filtered_2['has_cancer'], test_size=0.2, random_state=226506)

tune_grid = {
    'gamma': [0.00001, 0.0001, 0.001, 0.1, 1, 'scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3, 4],
    'C': [0.1, 1, 10, 100]
}

tuned_svm = GridSearchCV(SVC(random_state=226506), tune_grid, refit=True, cv=10, verbose=2)
tuned_svm.fit(X_train, Y_train)

preds_svm = tuned_svm.predict(X_test)

preds_svm = preds_svm.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_svm)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_svm)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_svm)}")
print(f"F1: {metrics.f1_score(Y_test, preds_svm)}")
print(f"Confidence interval (95%): {tuned_svm.cv_results_.get('mean_test_score') + 1.96 * tuned_svm.cv_results_.get('std_test_score')}, {tuned_svm.cv_results_.get('mean_test_score') - 1.96 * tuned_svm.cv_results_.get('std_test_score')}")

Fitting 10 folds for each of 448 candidates, totalling 4480 fits
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ..........C=0.1, degree=1, gamma=1e-05, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=1, gamma=1e

In [49]:
print(tuned_svm.best_estimator_.kernel)
print(tuned_svm.best_estimator_.gamma)

rbf
scale


In [50]:
tune_grid = {
    'gamma': ['scale'],
    'kernel': ['rbf'],
    'degree': [1],
    'C': [10]
}

tuned_svm = GridSearchCV(SVC(random_state=226506), tune_grid, refit=True, cv=10, verbose=2)
tuned_svm.fit(X_train, Y_train)

preds_svm = tuned_svm.predict(X_test)

preds_svm = preds_svm.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_svm)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_svm)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_svm)}")
print(f"F1: {metrics.f1_score(Y_test, preds_svm)}")
print(f"Confidence interval (95%): {tuned_svm.cv_results_.get('mean_test_score') + 1.96 * tuned_svm.cv_results_.get('std_test_score')}, {tuned_svm.cv_results_.get('mean_test_score') - 1.96 * tuned_svm.cv_results_.get('std_test_score')}")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END ............C=10, degree=1, gamma=scale, kernel=rbf; total time=   0.0s
Accuracy: 0.7903225806451613
Precision: 0.7966101694915254
Recall: 0.9791666666666666
F1: 0.8785046728971962
Confidence interval 

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(
    expression_matrix_filtered.drop(['has_cancer'], axis=1), expression_matrix_filtered['has_cancer'], test_size=0.2, random_state=226506)

tune_grid = {
    'gamma': [0.00001],
    'kernel': ['linear'],
    'degree': [1],
    'C': [0.1]
}

tuned_svm = GridSearchCV(SVC(random_state=226506), tune_grid, refit=True, cv=10, verbose=2)
tuned_svm.fit(X_train, Y_train)

preds_svm = tuned_svm.predict(X_test)

preds_svm = preds_svm.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_svm)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_svm)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_svm)}")
print(f"F1: {metrics.f1_score(Y_test, preds_svm)}")
print(f"Confidence interval (95%): {tuned_svm.cv_results_.get('mean_test_score') + 1.96 * tuned_svm.cv_results_.get('std_test_score')}, {tuned_svm.cv_results_.get('mean_test_score') - 1.96 * tuned_svm.cv_results_.get('std_test_score')}")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
[CV] END ........C=0.1, degree=1, gamma=1e-05, kernel=linear; total time=   0.0s
Accuracy: 0.9838709677419355
Precision: 0.9795918367346939
Recall: 1.0
F1: 0.9896907216494846
Confidence interval (95%): [1.03600

In [4]:
exp_transpose = expression_matrix.transpose()

In [5]:
exp_transpose

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,297,298,299,300,301,302,303,304,305,306
1007_s_at,8.496339,7.354706,6.898585,9.640469,8.175582,8.364415,7.575356,8.185822,7.804674,9.353249,...,9.329418,8.898559,9.222869,8.441316,8.19473,8.437055,7.590776,7.767889,9.015564,9.351943
1053_at,5.944231,5.633305,5.871544,5.893211,6.151961,6.333614,6.274335,6.347031,6.576194,5.890703,...,6.143179,5.921702,5.161781,5.985828,5.628126,5.03582,5.54115,5.358043,4.946309,5.261728
117_at,5.838808,6.325852,5.514367,5.668953,6.065677,5.803095,5.479506,6.616528,5.950515,5.368573,...,6.163103,5.8522,5.197433,5.568367,5.83527,4.723362,5.14351,5.419263,5.607157,5.348134
121_at,7.180569,7.901653,7.71879,7.640794,7.563042,7.505144,7.26672,7.612678,7.21536,7.64588,...,8.131638,6.8562,7.934532,7.567247,7.669761,7.480491,7.93893,7.377218,7.433125,7.529486
1255_g_at,2.759084,4.077464,3.175172,3.067836,2.980283,2.762957,2.761245,2.904411,2.832273,2.999111,...,3.073488,2.953088,3.490833,2.651213,3.070625,3.245203,3.26833,3.854395,3.103448,3.665095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AFFX-TrpnX-3_at,2.69305,2.702545,2.722887,2.867993,2.770782,2.794066,2.736256,2.87649,2.63731,2.855903,...,2.816209,2.737058,3.028836,2.842158,2.868383,2.853172,3.211069,2.808535,2.941784,3.82393
AFFX-TrpnX-5_at,3.063041,3.845348,3.705345,3.215214,3.170171,3.096422,2.945528,3.058819,3.061541,3.355273,...,3.141304,3.014076,3.388969,3.407304,3.389115,3.170252,4.11423,3.692275,3.665263,3.614414
AFFX-TrpnX-M_at,3.117905,3.677584,3.173042,3.42067,3.164002,3.408591,3.672628,3.121292,3.373403,3.392454,...,3.176052,3.049913,3.124641,3.418521,3.484033,3.248964,4.296599,3.392462,3.356391,3.294695
sample_names,GSM1045191_N1_15_12_04.CEL,GSM1045192_N4_14_12_04.CEL,GSM1045193_N5_15_12_04.CEL,GSM1045194_N6_14_12_04.CEL,GSM1045195_P1_15_12_04.CEL,GSM1045196_P10_26_1_05.CEL,GSM1045197_P11_25_1_05.CEL,GSM1045198_P12_25_1_05.CEL,GSM1045199_P13_25_1_05.CEL,GSM1045200_P14_25_1_05.CEL,...,GSM728667.CEL,GSM728668.CEL,GSM728669.CEL,GSM728670.CEL,GSM728671.CEL,GSM728672.CEL,GSM728673.CEL,GSM728674.CEL,GSM728675.CEL,GSM728676.CEL


In [None]:
exp_transpose.columns

In [6]:
exp_transpose.to_csv("trasposta.csv", header=False, index_label=True)

Now some small test on Clarke's dataset

In [3]:
expression_clarke = pd.read_csv("dataset_clark.csv", sep=" ")
expression_clarke.head(5)

Unnamed: 0,AFFX-TrpnX-3_at,1560383_at,1569663_at,1561222_at,241796_x_at,1560992_at,1554920_at,1566930_at,1561177_at,1556282_at,...,243613_at,242474_s_at,237477_at,1557363_a_at,207934_at,1554133_at,244283_x_at,242948_x_at,1555055_at,has_cancer
1,2.368188,2.299416,2.249327,2.323346,2.421872,2.382523,2.182615,2.313856,2.292708,2.39207,...,2.62445,2.478789,2.293508,2.445813,2.331599,2.28565,2.237884,2.707792,2.248191,0
2,2.39458,2.299247,2.320025,2.550657,2.773975,2.400077,2.410844,2.486348,2.418126,2.745695,...,2.379285,2.711583,2.440026,2.420784,2.556292,2.282652,2.296605,2.585675,2.342204,0
3,2.41884,2.450094,2.353575,2.376876,2.360638,2.292805,2.256739,2.382371,2.561871,2.394036,...,2.604558,2.352347,2.606063,2.511967,2.554176,2.218655,2.24961,2.986171,2.325991,0
4,2.58049,2.254073,2.410144,2.52337,2.492478,2.779718,2.302139,2.624737,2.375422,2.706434,...,2.522257,2.790318,2.346151,2.70737,2.719193,2.359547,2.356542,2.397825,2.506958,0
5,2.331745,2.418706,2.491957,2.269967,2.349177,2.338383,2.34751,2.122751,2.12555,2.534821,...,2.59753,2.517993,2.69068,2.71207,2.46791,2.319397,2.301513,2.51366,2.36821,0


In [9]:
expression_clarke[["AFFX-TrpnX-3_at","1560383_at","1569663_at","1561222_at","241796_x_at","1560992_at","1554920_at","1566930_at","1561177_at","1556282_at","224231_at","234907_x_at","1561331_at","1559612_at","1564272_a_at","1561564_at","1560647_at","1554354_at","233372_at","1554778_at","240093_x_at","1559842_at","1565799_at","234686_at","1561151_a_at","1564950_at","243520_x_at","216974_at","1558387_at","1561959_x_at","1557017_at","1562329_at","1554142_at","1553441_at","1564128_at","241587_at","1570452_at","91580_at","1553868_a_at","240463_at","1560528_at","1565755_at","237520_x_at","244318_at","1561765_at","1552661_at","1560494_a_at","234755_x_at","1563055_at","243274_x_at","1553519_at","243784_s_at","241061_at","1553963_at","1566749_at","1558893_a_at","231898_x_at","1552379_at","1568603_at","1560884_at","1561775_at","208343_s_at","216258_s_at","1552904_at","234601_x_at","1561156_at","241200_x_at","1556786_at","1563254_a_at","1567386_at","1553698_a_at","1553776_at","231294_at","233620_at","207815_at","1562480_at","1569948_at","233822_x_at","1567656_at","1552939_at","1561343_a_at","1557217_a_at","241760_x_at","1552372_at","1562998_at","1565657_at","1561855_x_at","1554071_at","1560885_x_at","242522_at","242603_x_at","221040_at","1558691_a_at","1553261_x_at","1562997_a_at","1561025_at","220656_at","241979_x_at","1554706_at","1564705_at","1558009_at","231544_s_at","1565820_x_at","1558372_at","1559833_at","210121_at","1555538_s_at","240512_x_at","1568366_at","1557062_at","216987_at","1560424_at","1561230_at","1562566_at","234432_at","1561021_at","242164_s_at","224296_x_at","1558708_at","1554793_at","214899_at","1560980_a_at","1552878_at","221438_s_at","240586_at","233683_at","231717_s_at","1567387_at","1560516_at","207607_at","1553266_at","204704_s_at","207849_at","1560458_s_at","1562628_at","1554619_at","1570115_at","244190_at","1560197_at","230344_x_at","1559274_at","217306_at","244067_x_at","237804_at","1552974_at","234706_x_at","1570491_at","1568449_at","1556889_s_at","1565446_at","238391_at","215312_at","1564631_at","1554328_at","208245_at","1554012_at","1552858_at","1562823_at","1562054_at","207465_at","1556873_at","206977_at","237937_x_at","241781_at","239767_at","1552722_at","1560540_x_at","1560111_at","242623_x_at","1568925_at","1561030_at","242063_s_at","1569756_at","231678_s_at","239776_at","1554125_a_at","243613_at","242474_s_at","237477_at","1557363_a_at","207934_at","1554133_at","244283_x_at","242948_x_at","1555055_at","has_cancer"]]
expression_clarke.head(5)

Unnamed: 0,AFFX-TrpnX-3_at,1560383_at,1569663_at,1561222_at,241796_x_at,1560992_at,1554920_at,1566930_at,1561177_at,1556282_at,...,243613_at,242474_s_at,237477_at,1557363_a_at,207934_at,1554133_at,244283_x_at,242948_x_at,1555055_at,has_cancer
1,2.368188,2.299416,2.249327,2.323346,2.421872,2.382523,2.182615,2.313856,2.292708,2.39207,...,2.62445,2.478789,2.293508,2.445813,2.331599,2.28565,2.237884,2.707792,2.248191,0
2,2.39458,2.299247,2.320025,2.550657,2.773975,2.400077,2.410844,2.486348,2.418126,2.745695,...,2.379285,2.711583,2.440026,2.420784,2.556292,2.282652,2.296605,2.585675,2.342204,0
3,2.41884,2.450094,2.353575,2.376876,2.360638,2.292805,2.256739,2.382371,2.561871,2.394036,...,2.604558,2.352347,2.606063,2.511967,2.554176,2.218655,2.24961,2.986171,2.325991,0
4,2.58049,2.254073,2.410144,2.52337,2.492478,2.779718,2.302139,2.624737,2.375422,2.706434,...,2.522257,2.790318,2.346151,2.70737,2.719193,2.359547,2.356542,2.397825,2.506958,0
5,2.331745,2.418706,2.491957,2.269967,2.349177,2.338383,2.34751,2.122751,2.12555,2.534821,...,2.59753,2.517993,2.69068,2.71207,2.46791,2.319397,2.301513,2.51366,2.36821,0


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(
    expression_clarke.drop(['has_cancer'], axis=1), expression_clarke['has_cancer'], test_size=0.2, random_state=226506)

tune_grid = {
    'bootstrap': [True],
    'criterion': ['gini'],
    'max_features': ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000],
    #'n_jobs': [-1]
}

tuned_rf = GridSearchCV(RandomForestClassifier(
    random_state=226506), tune_grid, refit=True, cv=10, verbose=2)
tuned_rf.fit(X_train, Y_train)

preds_rf = tuned_rf.predict(X_test)

preds_rf = preds_rf.astype(int)
print(f"Accuracy: {metrics.accuracy_score(Y_test, preds_rf)}")
print(f"Precision: {metrics.precision_score(Y_test, preds_rf)}")
print(f"Recall: {metrics.recall_score(Y_test, preds_rf)}")
print(f"F1: {metrics.f1_score(Y_test, preds_rf)}")

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.3s
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estima

In [2]:
print(f"Confidence interval (95%): {0.7496133 + 1.96 * 0.04652349}, {0.7496133 - 1.96 * 0.04652349}")

Confidence interval (95%): 0.8407993404, 0.6584272596
