# 0) Settings

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read the project files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/Colab\ Notebooks/MOW2/magisterka

/content/drive/MyDrive/Colab Notebooks/MOW2/magisterka


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy import stats
from scipy.stats import randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [None]:
# Load the cleaned thyroid dataset from the working directory and preview the first rows.
df = pd.read_csv('thyroid_clean.csv')
df.head()

Unnamed: 0,id,age,gender,FT3,FT4,TSH,TPO,TGAb,site,echo_pattern,multifocality,size,shape,margin,calcification,echo_strength,blood_flow,composition,mal,multilateral
0,1,46,1,4.34,12.41,1.677,0.43,0.98,0,0,0,4.6,0,0,0,4,0,1,1,1
1,2,61,1,5.4,16.26,2.905,0.45,1.91,0,0,0,4.2,0,1,1,4,1,2,1,1
2,3,44,1,3.93,13.39,1.823,9.15,26.25,0,0,0,0.7,0,1,0,4,0,2,0,1
3,5,29,0,3.7,13.98,1.293,0.15,0.81,0,0,1,1.0,1,1,1,4,0,2,1,1
4,6,37,1,3.6,14.56,0.938,0.13,21.22,0,0,0,0.7,0,1,1,4,0,2,1,1


# 1) Normalization

In [None]:
# Min-max scale every column of `df` and keep the result as `df_scaled`.
# NOTE(review): the scaler is fitted on the whole frame (all rows, and all columns
# including the `mal` target) before any train/test split is made further down.
x = df.values  # all columns as one numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Build the scaled frame once, keeping the original column labels and row index.
df_scaled = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

In [None]:
def testCrossValidation(data,model):
  """Cross-validate `model` on `data` with 10 unshuffled folds and several metrics.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the target column 'mal'; every other column is used as a feature.
  model : estimator
      Unfitted scikit-learn classifier handed to `cross_validate`. It must expose
      `decision_function` or `predict_proba`, which the 'roc_auc' scorer needs.

  Returns
  -------
  dict
      The `cross_validate` result; per-fold scores are stored under 'test_f1_score',
      'test_accuracy', 'test_precision', 'test_recall', 'test_roc_auc' and
      'test_specifity'.
  """
  X = data.drop(['mal'],axis=1)
  Y = data['mal']

  scoring = { 'f1_score' : make_scorer(f1_score),
              'accuracy' : make_scorer(accuracy_score),
              'precision' : make_scorer(precision_score),
              'recall' : make_scorer(recall_score),
              # Built-in scorer: ranks samples by decision_function / predict_proba.
              # make_scorer(roc_auc_score) would feed hard 0/1 predictions to the
              # metric, which collapses the ROC curve to a single operating point.
              'roc_auc' : 'roc_auc',
              # Specificity is the recall of the negative class (label 0).
              'specifity' : make_scorer(recall_score, pos_label=0),
            }

  kfold = model_selection.KFold(n_splits=10)
  results = model_selection.cross_validate(estimator=model,
                                          X=X,
                                          y=Y,
                                          cv=kfold,
                                          scoring=scoring)
  return results


In [None]:
# Display names of the compared classifiers; count_average_all pairs them with
# the cross-validation results by position.
algorithms_names = [
    'SVM_linear',
    'Naive Bayes',
    'DecisionTree',
    'KNeighbors',
    'SVM_radial',
    'GBM',
    'Logistic Regression',
    'LDA',
    'Random Forest',
]

# Metric names used as column labels of the summary table.
test_names = [
    'accuracy',
    'precision',
    'recall',
    'f1_score',
    'roc_auc',
    'specifity',
    'sensivity',
]

def count_average_all(crossResult, num, algorithms_names, test_names):
  """Average the per-fold scores of each algorithm into one summary table.

  Parameters
  ----------
  crossResult : object with an `Accuracy` attribute supporting `.tolist()`
      `crossResult.Accuracy.tolist()` must yield one cross-validation result per
      algorithm: a mapping with 'test_<metric>' keys whose values have `.mean()`.
  num : int
      Number of leading results / algorithm names to summarise.
  algorithms_names : sequence of str
      Row labels, matched to the results by position.
  test_names : sequence of str
      Metric names. Each name is read from the 'test_<name>' key of the results;
      names without such a key in every result (e.g. 'sensivity') are skipped.

  Returns
  -------
  pandas.DataFrame
      Rows are algorithms, columns are the metrics found, values are fold means.
  """
  crossResult_list = crossResult.Accuracy.tolist()

  averages = dict()
  for metric in test_names:
    key = 'test_' + metric
    # Every column is read from the key that carries its own name, so a label can
    # never end up on top of another metric's values.
    if not all(key in crossResult_list[i] for i in range(num)):
      continue
    averages[metric] = {algorithms_names[i]: crossResult_list[i][key].mean()
                        for i in range(num)}

  test_all = pd.DataFrame(averages)
  return test_all

# 2) Choice of hyperparameters

## a) Algorithms

In [None]:
# Estimators tuned on the min-max scaled frame (df_scaled). The order must match
# parameters1_list_gs / parameters1_list_rs and the indices in check_algorithm1;
# the two SVC entries (index 0 and 3) get their kernel from the search grids.
algorithms_1 = [svm.SVC(),
                GaussianNB(),
                KNeighborsClassifier(), 
                svm.SVC(),
                LogisticRegression(),
                LinearDiscriminantAnalysis()]

In [None]:
# Tree-based estimators tuned on the unscaled frame (df). The order must match
# parameters2_list_gs / parameters2_list_rs and the indices in check_algorithm2.
algorithms_2 = [tree.DecisionTreeClassifier(),
                GradientBoostingClassifier(),
                RandomForestClassifier(random_state=0)]

In [None]:
def check_algorithm1(num, gs):
  """Build a fresh estimator of the `num`-th kind in algorithms_1 from a finished search.

  Parameters
  ----------
  num : int
      Position in algorithms_1: 0 SVC, 1 GaussianNB, 2 KNeighborsClassifier,
      3 SVC, 4 LogisticRegression, 5 LinearDiscriminantAnalysis.
  gs : fitted search object
      Its `best_params_` are passed to the estimator constructor as keyword arguments.

  Returns
  -------
  An unfitted estimator configured with `gs.best_params_`.

  Raises
  ------
  ValueError
      If `num` is not one of 0..5 (previously the function silently returned None).
  """
  if num not in range(6):
    raise ValueError("check_algorithm1: num must be in 0..5, got %r" % (num,))

  estimator_classes = (svm.SVC,
                       GaussianNB,
                       KNeighborsClassifier,
                       svm.SVC,
                       LogisticRegression,
                       LinearDiscriminantAnalysis)
  return estimator_classes[int(num)](**gs.best_params_)

def check_algorithm2(num, gs):
  """Build a fresh estimator of the `num`-th kind in algorithms_2 from a finished search.

  Parameters
  ----------
  num : int
      Position in algorithms_2: 0 DecisionTreeClassifier,
      1 GradientBoostingClassifier, 2 RandomForestClassifier.
  gs : fitted search object
      Its `best_params_` are passed to the estimator constructor as keyword arguments.

  Returns
  -------
  An unfitted estimator configured with `gs.best_params_`.

  Raises
  ------
  ValueError
      If `num` is not one of 0..2 (previously the function silently returned None).
  """
  if num not in range(3):
    raise ValueError("check_algorithm2: num must be in 0..2, got %r" % (num,))

  if int(num) == 2:
    # algorithms_2 searches a forest with random_state=0; keep the same seed on the
    # rebuilt model unless the search itself tuned random_state.
    return RandomForestClassifier(**{'random_state': 0, **gs.best_params_})

  estimator_classes = (tree.DecisionTreeClassifier, GradientBoostingClassifier)
  return estimator_classes[int(num)](**gs.best_params_)

## b) GridSearch

### Hyperparameters

In [None]:
# Search grids for GridSearchCV, one per estimator.

params_svc_linear = {'C': [0.1, 1, 10, 50, 100],
                     'kernel': ['linear']}

# var_smoothing is a fraction of the largest feature variance (library default 1e-9),
# so the grid runs from 1 down to 1e-9. logspace(0, 9) searched 1..1e9 instead.
params_nb = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

params_tree = {
    'criterion':('entropy', 'gini'),
    'splitter':('best','random'),
    'max_depth':np.arange(1,6),
    'min_samples_split':np.arange(3,8),
    'min_samples_leaf':np.arange(1,5)}

params_knn = { 'leaf_size': np.arange(20,40,1),
                'n_neighbors': np.arange(1,20),
                'p' : [1,2]}

params_svc_radial = {'C': [0.1,1, 10, 100],
                     'kernel': ['rbf'],
                     'gamma': ['scale', 'auto']}

params_GBC = {
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth":[3,4,5,8],
    "subsample":[0.5, 0.8, 1.0],
    'n_estimators' : [100,500,1000]
    }

# Two sub-grids because 'newton-cg' supports only the l2 penalty; crossing it with
# 'l1' makes those candidates fail to fit.
params_logistic = [
    {'solver'  : ['liblinear'],
     'penalty' : ['l1','l2'],
     'C'       : np.logspace(-3,3,7)},
    {'solver'  : ['newton-cg'],
     'penalty' : ['l2'],
     'C'       : np.logspace(-3,3,7)}]

params_LDA = {
    'solver' : ['svd', 'eigen']
}

# For classifiers 'auto' was an alias of 'sqrt' and has been removed from recent
# scikit-learn releases, so only 'sqrt' is listed.
params_forest = {'n_estimators': [10, 56, 80, 200],
               'max_features': ['sqrt'],
               'max_depth': [4,5,8,10],
               'bootstrap': [True, False]}

# Same order as algorithms_1.
parameters1_list_gs = [params_svc_linear,
                       params_nb,
                       params_knn,
                       params_svc_radial,
                       params_logistic,
                       params_LDA,]

# Same order as algorithms_2.
parameters2_list_gs = [params_tree,
                       params_GBC,
                       params_forest]

### Function

In [None]:
def parameters_optimalization_gs(params, algorithm, num, df, scaled, n_repeats=5):
  """Tune `algorithm` with repeated grid searches and check the winner on a hold-out set.

  The data are split 80/20 (random_state=0). A 10-fold GridSearchCV scored by
  accuracy is fitted on the training part `n_repeats` times. The search with the
  highest `best_score_` is kept, an estimator is rebuilt from its `best_params_`,
  refitted on the training part and scored on the held-out part.

  Parameters
  ----------
  params : dict or list of dict
      Parameter grid passed to GridSearchCV.
  algorithm : estimator
      Estimator to tune.
  num : int
      Index forwarded to check_algorithm1 / check_algorithm2.
  df : pandas.DataFrame
      Must contain the target column 'mal'; all other columns are features.
  scaled : {0, 1}
      0 rebuilds the model with check_algorithm1, 1 with check_algorithm2.
  n_repeats : int, default 5
      How many times the grid search is repeated.

  Returns
  -------
  GridSearchCV
      The fitted search with the best cross-validated score among the repeats.

  Raises
  ------
  ValueError
      If `scaled` is not 0 or 1, or `n_repeats` is smaller than 1.
  """
  # Validate before the expensive searches. An unknown `scaled` used to surface only
  # afterwards, as a NameError on `model_1`.
  if scaled not in (0, 1):
    raise ValueError("scaled must be 0 (check_algorithm1) or 1 (check_algorithm2), got %r" % (scaled,))
  if n_repeats < 1:
    raise ValueError("n_repeats must be at least 1, got %r" % (n_repeats,))

  X = df.drop(['mal'],axis=1)
  Y = df['mal']
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

  cv_score = []
  best_gs = None

  for i in range(0, n_repeats):
    print('Iteracja: ' + str(i))
    gs = GridSearchCV(algorithm, cv = 10, param_grid = params, scoring = 'accuracy', n_jobs = -1)
    gs.fit(X_train, y_train)
    cv_score.append(gs.best_score_)
    # Keep the best repeat rather than whichever one happened to run last.
    if best_gs is None or gs.best_score_ > best_gs.best_score_:
      best_gs = gs

  # Hold-out check of a model rebuilt from the winning parameters.
  if scaled==0:
    model_1 = check_algorithm1(num, best_gs)
  else:
    model_1 = check_algorithm2(num, best_gs)

  model_1.fit(X_train, y_train)
  final_score = model_1.score(X_test, y_test)

  # Report the scores instead of dropping them when the function returns.
  print('CV accuracy: mean %.4f, best %.4f | hold-out accuracy: %.4f'
        % (sum(cv_score) / len(cv_score), best_gs.best_score_, final_score))

  return best_gs


### Test df_scaled

In [None]:
# Grid-search every estimator of algorithms_1 on the scaled data and collect the
# best score and parameters under a key derived from the estimator's repr.
best_parameters = {}
kernel_suffix = {0: "_linear", 3: "_radial"}  # tells the two SVC() entries apart
for i in range(6):
  result = parameters_optimalization_gs(parameters1_list_gs[i], algorithms_1[i], i, df_scaled, 0)
  key = str(algorithms_1[i]) + kernel_suffix.get(i, "_")
  best_parameters[key] = {'best_score' : result.best_score_, 'parameters': result.best_params_}

In [None]:
# Tabulate the grid-search winners on the scaled data: one column per estimator,
# with rows 'best_score' and 'parameters'.
res_gs1 = pd.DataFrame(best_parameters)
res_gs1

Unnamed: 0,SVC()_linear,GaussianNB()_,KNeighborsClassifier()_,SVC()_radial,LogisticRegression()_,LinearDiscriminantAnalysis()_
best_score,0.781674,0.770501,0.75435,0.766481,0.76545,0.766461
parameters,"{'C': 1, 'kernel': 'linear'}",{'var_smoothing': 1.5199110829529336},"{'leaf_size': 20, 'n_neighbors': 10, 'p': 2}","{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}","{'C': 1.0, 'penalty': 'l1', 'solver': 'libline...",{'solver': 'svd'}


### Test df

In [None]:
# Grid-search the tree-based estimators of algorithms_2 on the unscaled data.
best_parameters = {}
for i in range(3):
  result = parameters_optimalization_gs(parameters2_list_gs[i], algorithms_2[i], i, df, 1)
  summary = dict(best_score=result.best_score_, parameters=result.best_params_)
  best_parameters[str(algorithms_2[i])+ "_"] = summary

Iteracja: 0
Iteracja: 1
Iteracja: 2
Iteracja: 3
Iteracja: 4
Iteracja: 0
Iteracja: 1
Iteracja: 2
Iteracja: 3
Iteracja: 4
Iteracja: 0
Iteracja: 1
Iteracja: 2
Iteracja: 3
Iteracja: 4


In [None]:
# Tabulate the grid-search winners for the tree-based estimators (unscaled data).
res_gs2 = pd.DataFrame(best_parameters)
res_gs2

## c) Random Search

### Hyperparameters

In [None]:
# Search spaces for RandomizedSearchCV, one per estimator. Lists are sampled
# uniformly; scipy distributions are sampled through their rvs() method.

# NOTE: stats.uniform(loc, scale) covers [loc, loc + scale], so C is drawn from [2, 12].
params_svc_linear = {'C': stats.uniform(2, 10),
                     'kernel': ['linear']}

params_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

params_tree = {'criterion':('entropy', 'gini'),
                'splitter':('best','random'),
                'max_depth':randint(1,6),
                'min_samples_split':randint(3,8),
                'min_samples_leaf':randint(1,5)}

# randint(low, high) draws integers from low..high-1. A third positional argument is
# `loc` (a shift), not a step: randint(20, 40, 1) sampled 21..40, not 20..39.
params_knn = { 'leaf_size': randint(20,40),
                'n_neighbors': randint(1,20),
                'p' : [1,2]}

params_svc_radial = {'C': [0.1,1, 10, 100],
                     'kernel': ['rbf'],
                     'gamma': ['scale', 'auto']}

# "loss" is left at the library default: 'deviance' was its only candidate and that
# spelling has been removed from recent scikit-learn releases (renamed 'log_loss').
# "criterion" lists split-quality criteria accepted by GradientBoostingClassifier;
# 'absolute_error' is not one of them and made those candidates fail to fit.
params_GBC = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "squared_error"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }

# Two sub-spaces because 'newton-cg' supports only the l2 penalty.
params_logistic = [
    {'solver'  : ['liblinear'],
     'penalty' : ['l1','l2'],
     'C'       : np.logspace(-3,3,7)},
    {'solver'  : ['newton-cg'],
     'penalty' : ['l2'],
     'C'       : np.logspace(-3,3,7)}]

params_LDA = {
    'solver' : ['svd', 'eigen']
}

# For classifiers 'auto' was an alias of 'sqrt' and has been removed from recent
# scikit-learn releases, so only 'sqrt' is listed.
params_forest = {'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80],
               'max_features': ['sqrt'],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

# Same order as algorithms_1.
parameters1_list_rs = [params_svc_linear,
                       params_nb,
                       params_knn,
                       params_svc_radial,
                       params_logistic,
                       params_LDA]

# Same order as algorithms_2.
parameters2_list_rs = [params_tree,
                       params_GBC,
                       params_forest]

### Function

In [None]:
def parameters_optimalization_rs(params, algorithm, num, df, scaled, n_repeats=20):
  """Tune `algorithm` with repeated random searches and check the winner on a hold-out set.

  The data are split 80/20 (random_state=0). A 10-fold RandomizedSearchCV (20
  sampled candidates, accuracy scoring) is fitted on the training part `n_repeats`
  times, each repeat seeded with its own index. The search with the highest
  `best_score_` is kept, an estimator is rebuilt from its `best_params_`, refitted
  on the training part and scored on the held-out part.

  Parameters
  ----------
  params : dict or list of dict
      Parameter distributions passed to RandomizedSearchCV.
  algorithm : estimator
      Estimator to tune.
  num : int
      Index forwarded to check_algorithm1 / check_algorithm2.
  df : pandas.DataFrame
      Must contain the target column 'mal'; all other columns are features.
  scaled : {0, 1}
      0 rebuilds the model with check_algorithm1, 1 with check_algorithm2.
  n_repeats : int, default 20
      How many independent random searches are run.

  Returns
  -------
  RandomizedSearchCV
      The fitted search with the best cross-validated score among the repeats.

  Raises
  ------
  ValueError
      If `scaled` is not 0 or 1, or `n_repeats` is smaller than 1.
  """
  # Validate before the expensive searches. An unknown `scaled` used to surface only
  # afterwards, as a NameError on `model_1`.
  if scaled not in (0, 1):
    raise ValueError("scaled must be 0 (check_algorithm1) or 1 (check_algorithm2), got %r" % (scaled,))
  if n_repeats < 1:
    raise ValueError("n_repeats must be at least 1, got %r" % (n_repeats,))

  X = df.drop(['mal'],axis=1)
  Y = df['mal']
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

  cv_score = []
  best_rs = None

  for i in range(0, n_repeats):
    print('Iteracja: ' + str(i))
    # random_state=i gives every repeat a different but reproducible sample of
    # candidates; scoring is named explicitly to match the grid-search function.
    rs = RandomizedSearchCV(algorithm, cv = 10, n_iter = 20, param_distributions = params,
                            scoring = 'accuracy', n_jobs = -1, random_state = i)
    rs.fit(X_train, y_train)
    cv_score.append(rs.best_score_)
    # Keep the best repeat rather than whichever one happened to run last.
    if best_rs is None or rs.best_score_ > best_rs.best_score_:
      best_rs = rs

  # Hold-out check of a model rebuilt from the winning parameters.
  if scaled==0:
    model_1 = check_algorithm1(num, best_rs)
  else:
    model_1 = check_algorithm2(num, best_rs)

  model_1.fit(X_train, y_train)
  final_score = model_1.score(X_test, y_test)

  # Report the scores instead of dropping them when the function returns.
  print('CV accuracy: mean %.4f, best %.4f | hold-out accuracy: %.4f'
        % (sum(cv_score) / len(cv_score), best_rs.best_score_, final_score))

  return best_rs

### Test df_scaled

In [None]:
# Random-search every estimator of algorithms_1 on the scaled data and collect the
# best score and parameters under a key derived from the estimator's repr.
best_parameters = dict()
for i in range(0,6):
  result = parameters_optimalization_rs(parameters1_list_rs[i], algorithms_1[i], i, df_scaled, 0)
  if(i==0):
    best_parameters[str(algorithms_1[i])+"_linear"] = {'best_score' : result.best_score_, 'parameters': result.best_params_}
  elif(i==3):
    best_parameters[str(algorithms_1[i])+"_radial"] = {'best_score' : result.best_score_, 'parameters': result.best_params_}
  else:
    # str() is required: an estimator object cannot be concatenated with a string.
    best_parameters[str(algorithms_1[i])+ "_"] = {'best_score' : result.best_score_, 'parameters': result.best_params_}

In [None]:
# Tabulate the random-search winners on the scaled data, one column per estimator.
res_rs1 = pd.DataFrame(best_parameters)
res_rs1

### Test df

In [None]:
# Random-search the tree-based estimators of algorithms_2 on the unscaled data.
best_parameters = dict()
for i in range(0,3):
  # Use the random-search spaces (parameters2_list_rs), not the grid-search grids.
  result = parameters_optimalization_rs(parameters2_list_rs[i], algorithms_2[i], i, df, 1)
  # str() is required: an estimator object cannot be concatenated with a string.
  best_parameters[str(algorithms_2[i])+ '_'] = {'best_score' : result.best_score_, 'parameters': result.best_params_}

In [None]:
# Tabulate the random-search winners for the tree-based estimators (unscaled data).
res_rs2 = pd.DataFrame(best_parameters)
res_rs2

# 3) Final results

## After RandomSearch

svm.SVC(kernel='linear', C=2.299),

GaussianNB(var_smoothing=0.43),  

KNeighborsClassifier(n_neighbors=10, leaf_size=21, p=2), 

svm.SVC(kernel='rbf', gamma='auto', C=10),

LogisticRegression(solver= 'newton-cg', penalty= 'l2', C= 10.0),

LinearDiscriminantAnalysis(solver='svd'),
              
tree.DecisionTreeClassifier(criterion='entropy', max_depth= 4, min_samples_leaf = 3, min_samples_split = 3, splitter = 'random' ),

GradientBoostingClassifier(subsample= 0.5, n_estimators= 10, min_samples_split= 0.21, min_samples_leaf= 0.1, max_features= 'log2', max_depth= 8, loss= 'deviance', learning_rate= 0.2, criterion= 'friedman_mse'),

RandomForestClassifier(n_estimators= 56, min_samples_split= 2, min_samples_leaf= 4, max_features= 'auto', max_depth= 40, bootstrap= False)






## After GridSearch

svm.SVC(kernel='linear', C=1),

GaussianNB(var_smoothing=0.002310),

KNeighborsClassifier(n_neighbors=10, leaf_size=20, p=2), 

svm.SVC(kernel='rbf', gamma='auto', C=10),

LogisticRegression(solver= 'liblinear', penalty= 'l1', C= 1.0),

LinearDiscriminantAnalysis(solver='svd')
              
tree.DecisionTreeClassifier(criterion='gini', max_depth= 4, min_samples_leaf = 4, min_samples_split = 7, splitter = 'random' ),

GradientBoostingClassifier(subsample= 0.5, n_estimators= 500, max_depth= 3, learning_rate= 0.01),

RandomForestClassifier(n_estimators= 80, max_features= 'auto', max_depth= 5, bootstrap= False)