# Decision Tree

In [1]:
###########################################################################################
# Read all the dimensionally reduced files.
###########################################################################################

#read from data with reduced dimensions from the csv file in chunks (as the file is too big to be read with read_csv) 
import pandas as pd
import os
filename = "../../output-data/PCA-ALL/PCA-ALL.csv"
dfPCAList = []
for chunkTemp in pd.read_csv(filename, chunksize=20):
    dfPCAList.append(chunkTemp)
dataFramePCA = pd.concat(dfPCAList) 
dataFramePCA.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1191,1192,1193,1194,1195,1196,1197,1198,1199,CancerType
0,-55.343285,-126.190156,-179.740709,176.617508,105.096771,29.470689,-49.969141,107.428589,83.472864,-65.010527,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
1,-76.564778,-45.758051,-152.585784,120.916344,40.464931,-29.667521,-13.329516,28.324559,82.610741,-46.02887,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
2,-549.502366,-273.469092,-118.915857,118.495907,0.477319,70.932045,77.167022,-5.171697,-62.889904,-39.829417,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
3,-502.357654,-266.772564,-27.700195,93.084931,91.809505,41.085247,41.356804,-23.764993,-7.872551,-1.030746,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
4,-501.416866,-256.686756,-121.397588,74.965747,83.945138,40.309418,32.916611,-44.554687,-6.541575,-52.891518,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
5,-542.118981,-216.085578,-154.588438,100.631594,18.214736,-23.380787,140.103672,-58.381645,15.78634,2.199567,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
6,70.668553,-55.01279,-196.714484,141.9102,19.26704,-48.83281,-9.866192,35.978007,130.560999,-32.349642,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
7,-714.672213,-313.091486,-57.246248,68.318735,90.897546,70.038473,37.320502,-43.969219,-50.290974,-13.035979,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
8,-684.092996,-245.672665,-77.421551,37.953425,72.493762,-7.681309,21.934606,-42.482929,1.44012,-19.775625,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG
9,-581.427671,-229.024076,-102.420707,65.293164,96.776663,12.888462,21.762938,-39.500445,-15.623715,-18.121467,...,7.575449e-16,3.903185e-16,1.130541e-15,3.310056e-16,5.701954000000001e-17,8.893892e-15,4.492858e-16,9.309029000000001e-17,7.739631e-15,LGG


In [42]:
###########################################################################################
#Train the model using decision tree and stratified 10-fold cross validation to get the best parameters 
#https://www.ritchieng.com/machine-learning-cross-validation/
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
###########################################################################################

#Get all the features i.e. all the columns names
finalCols = dataFramePCA.columns
#Take all columns names of features i.e. cg00001 etc. except last column name which has label
finalFeatures = finalCols[:-1]
# Separating out the feature values
x = dataFramePCA.loc[:, finalFeatures].values
# Separating out the label values
y = dataFramePCA.loc[:,['CancerType']].values

#import the decision tree classifier module
from sklearn.tree import DecisionTreeClassifier

#Create a decision tree classifier
dct = DecisionTreeClassifier()

#import the cross_validate modules
from sklearn.model_selection import cross_validate, StratifiedKFold

#Import GridSearchCV module, whcih is used for picking the best parameter
from sklearn.model_selection import GridSearchCV

#Specify all the parameter values to be tested
parameter_grid = {'max_depth': [50, 100, 200, 300, 400]}

#Build the stratified k-fold object, specify splits as 10
cross_validation_stratified = StratifiedKFold(n_splits=10)

#Do the gridsearch to get the best parameters for the fit
grid_search = GridSearchCV(dct, param_grid=parameter_grid, cv=cross_validation_stratified) 
grid_search.fit(x, y.ravel())

#Print the best score
print('Best Accuracy score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

#print the parameters used for best training
dtc = grid_search.best_estimator_
dtc

Best Accuracy score: 0.8805970149253731
Best parameters: {'max_depth': 100}


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [41]:
###########################################################################################
#Get the other metrics for the best parameters
###########################################################################################
modelCV = DecisionTreeClassifier(max_depth=100)

#import the cross_validate module
from sklearn.model_selection import cross_validate, StratifiedKFold

#Use StratifiedKFold to select equal samples from all cancer types
skf = StratifiedKFold(n_splits=10)

#Cross validate on the entire dataset 
scores = cross_validate(modelCV, x, y.ravel(), cv=skf,
                           scoring=('precision_weighted','recall_weighted','f1_weighted'), return_train_score=True)

#Print the results of cross-validate
print('K-fold cross-validation results:')
print('precision:',scores['test_precision_weighted'].mean())
print('recall:',scores['test_recall_weighted'].mean())
print('f1:',scores['test_f1_weighted'].mean())

K-fold cross-validation results:
precision: 0.8823579135603659
recall: 0.8781424242424242
f1: 0.8776836967410839


In [6]:
#################################################################################
# Use cross_val_predict to build the classification report, gives per cancer type
#################################################################################
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
predictions_cv = cross_val_predict(modelCV, x, y.ravel(), cv=skf)
print('K-fold cross-validation results from classification report:')
print(classification_report(y, predictions_cv))

K-fold cross-validation results:
Accuracy: 0.87893864013267
              precision    recall  f1-score   support

        LAML       0.99      1.00      0.99       201
         LGG       0.92      0.94      0.93       202
        LUSC       0.80      0.80      0.80       200
        SARC       0.78      0.77      0.78       201
        SKCM       0.88      0.90      0.89       201
        STAD       0.89      0.87      0.88       201

   micro avg       0.88      0.88      0.88      1206
   macro avg       0.88      0.88      0.88      1206
weighted avg       0.88      0.88      0.88      1206

