# Lung cancer full features models with balancing
**Solution notebook by Reem Abdel-Salam, Reviewed and Updated by Shweta Chandole**

Date: 10-June-2022

## Environment preparation

### Import Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import sys
import argparse
import csv
import regex 
import sklearn
import imblearn
import xgboost as xgb
from sklearn import utils
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from scipy.sparse import *
from sklearn.base import TransformerMixin
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.util import ngrams
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

  "Since version 1.0, "


### Define model and metrics creation fuctions

In [2]:
# define custom functions to run classification modules, build classifier models and calculate metrics

def print_statistics(y, y_pred):
    accuracy = metrics.accuracy_score(y, y_pred)
    weighted_precision = metrics.precision_score(y, y_pred, average='weighted')
    weighted_recall = metrics.recall_score(y, y_pred, average='weighted')
    weighted_f1_score = metrics.f1_score(y, y_pred, average='weighted')
    balanced_accuracy = balanced_accuracy_score(y, y_pred)

    print('Accuracy: %.3f\nWeighted_Precision: %.3f\nWeighted_Recall: %.3f\nWeighted_F1_score: %.3f\n'
          % (accuracy, weighted_precision, weighted_recall, weighted_f1_score))
    print('Balanced Accuracy: %.3f\n' % balanced_accuracy)
    print(metrics.classification_report(y, y_pred))
    
    return accuracy, weighted_precision, weighted_recall, weighted_f1_score, balanced_accuracy

# updated print_statistics function to include 'weighted_' label for weighted metrics, BACC  - Shweta C. 06/10

def plot_coefficients(classifier, feature_names, top_features=20, plot_name="/bow_models/bow_binary_"):
    # Get the top most positive/negative coefficients
    coef = classifier.coef_.ravel()
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    x_names = [feature_names[feature] for feature in top_coefficients]

    # Plot the coefficients
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    plt.xticks(np.arange(0, 2 * top_features), x_names, rotation=30, ha='right')
    plt.ylabel("Coefficient Value")
    plt.title("Visualising the top %d features taken up by an SVM model" % top_features)
    to_save_filename = path + "/plots/" + plot_name + "top%d_coefficients.png" % top_features
    plt.savefig(to_save_filename)
    print("Coefficients' visualisation saved to %s\n" % to_save_filename)

def get_regularization_params(a=-1, b=1, c=3, d=1, e=5):
    reg_range = np.outer(np.logspace(a, b, c), np.array([d, e]))
    reg_range = reg_range.flatten()
    return reg_range


def grid_classifier(x_train, y_train, x_test, y_test, model, parameters,
                    make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    grid = GridSearchCV(estimator=model, param_grid=parameters, verbose=0)
    grid.fit(x_train, y_train)
    sorted(grid.cv_results_.keys())
    classifier = grid.best_estimator_
    if make_feature_analysis:
        plot_coefficients(classifier, feature_names, top_features, plot_name)
    y_hat = classifier.predict(x_test)
    print_statistics(y_test, y_hat)

# Method to print the header of the currently running model
def print_model_title(name):
    print("\n==================================================================")
    print('{:>20}'.format(name))
    print("==================================================================\n")


def linear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
               make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    print_model_title("Linear SVM")
    C_range = get_regularization_params()
    parameters = {'C': C_range}
    linear_svm = LinearSVC(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, linear_svm, parameters,
                    make_feature_analysis, feature_names, top_features, plot_name)


def nonlinear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
                  make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    print_model_title("Nonlinear SVM")
    C_range = get_regularization_params(a=-1, b=0, c=2, d=1, e=5)
    gamma_range = get_regularization_params(a=-2, b=-1, c=2, d=1, e=5)
    parameters = {'kernel': ['rbf'], 'C': C_range, 'gamma': gamma_range}
    nonlinear_svm = SVC(class_weight=class_ratio)
    grid_classifier(x_train, y_train, x_test, y_test, nonlinear_svm, parameters,
                    make_feature_analysis, feature_names, top_features, plot_name)


def logistic_regression_grid(x_train, y_train, x_test, y_test, class_ratio,
                        make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    print_model_title("Logistic Regression")
    C_range = [0.001, 0.01, 0.1, 1, 10, 100]
    parameters = {'C': C_range}
    log_regr = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(x_train, y_train, x_test, y_test, log_regr, parameters,
                    make_feature_analysis, feature_names, top_features, plot_name)


def linear_svm(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    print_model_title("Linear SVM")
    svm = LinearSVC(C=0.01, class_weight=class_ratio, penalty='l2')
    svm.fit(x_train, y_train)
    y_hat = svm.predict(x_test)
    print_statistics(y_test, y_hat)



def logistic_regression(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    print_model_title("Logistic Regression")
    regr = LogisticRegression(C=0.01, class_weight=class_ratio, penalty='l2')
    regr.fit(x_train, y_train)
    y_hat = regr.predict(x_test)
    print_statistics(y_test, y_hat)



def random_forest(x_train, y_train, x_test, y_test, class_ratio='balanced'):
  print_model_title("Random Forest")
  rf = RandomForestClassifier(n_estimators=400, random_state=11)
  rf.fit(x_train, y_train)
  y_hat = rf.predict(x_test)
  print_statistics(y_test, y_hat)


def xg_boost(x_train, y_train, x_test, y_test):
  print_model_title("XGBoost")
  xgb_model =XGBClassifier(max_depth=50, n_estimators=1000)
  xgb_model .fit(x_train, y_train)
  y_hat = xgb_model .predict(x_test)
  print_statistics(y_test, y_hat)



def xg_boost_focal_loss(x_train, y_train, x_test, y_test):
  print_model_title("XGBoost Focal")
  xgboster_focal = imb_xgb(special_objective='focal')
  CV_focal_booster = GridSearchCV(xgboster_focal, {"focal_gamma":[1.0,1.5,2.0,2.5,3.0]})
  CV_focal_booster.fit(x_train, y_train)
  opt_focal_booster = CV_focal_booster.best_estimator_
  # xgb_model .fit(x_train, y_train)
  y_hat = opt_focal_booster.predict_determine(x_test)
  print_statistics(y_test, y_hat)


def xg_boost_weighted_loss(x_train, y_train, x_test, y_test):
  print_model_title("XGBoost Weighted")
  xgboster_focal = imb_xgb(special_objective='weighted')
  CV_focal_booster = GridSearchCV(xgboster_focal, {"imbalance_alpha":[1.5,2.0,2.5,3.0,4.0]})
  CV_focal_booster.fit(x_train, y_train)
  opt_focal_booster = CV_focal_booster.best_estimator_
  # xgb_model .fit(x_train, y_train)
  y_hat = opt_focal_booster.predict_determine(x_test)
  print_statistics(y_test, y_hat)



def feature_selection(x_train, y_train, x_test, y_test):
    print("Feature selection with LinearSVC")
    model = LinearSVC(C=0.1, penalty='l2')
    rfe = RFE(model, 5)
    best_features_model = rfe.fit(x_train, y_train)
    y_hat = best_features_model.predict(x_test)
    print_statistics(y_test, y_hat)


def ensemble_stacked(x_train, y_train, x_test, y_test):
  print_model_title("Ensemble Stacked Classifiers")
  estimators = [ ('lr',LogisticRegression(C=0.01, class_weight='balanced', penalty='l2')),('xgb',XGBClassifier(max_depth=16, n_estimators=1000)),('svm_linear',LinearSVC(C=0.01, class_weight='balanced', penalty='l2')),('rf', RandomForestClassifier(n_estimators=10, random_state=42))]
  from sklearn.ensemble import StackingClassifier
  clf = StackingClassifier(
      estimators=estimators )
  clf.fit(x_train, y_train)
  y_hat = clf .predict(x_test)
  print_statistics(y_test, y_hat)




from sklearn.ensemble import RandomForestClassifier, VotingClassifier
def voting_classifiers(x_train, y_train, x_test, y_test,voting_type='hard'):
  print_model_title("Voting Classifier")
  estimators = [ ('lr',LogisticRegression(C=0.01, class_weight='balanced', penalty='l2')),('xgb',XGBClassifier(max_depth=16, n_estimators=1000)),('svm_linear',LinearSVC(C=0.01, class_weight='balanced', penalty='l2')),('rf', RandomForestClassifier(n_estimators=10, random_state=42))]
  from sklearn.ensemble import StackingClassifier
  clf = VotingClassifier(
      estimators=estimators , voting=voting_type)
  clf.fit(x_train, y_train)
  y_hat = clf .predict(x_test)
  print_statistics(y_test, y_hat)




from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import BaggingClassifier
def Bagging_Classifier_LR(x_train, y_train, x_test, y_test):
  print_model_title("Bagging Calssifier LR")
 
  clf =BaggingClassifier(base_estimator=LogisticRegression(C=0.01, class_weight='balanced', penalty='l2'),
                       n_estimators=10, random_state=42)
  clf.fit(x_train, y_train)
  y_hat = clf .predict(x_test)
  print_statistics(y_test, y_hat)



def Bagging_Classifier_SVM(x_train, y_train, x_test, y_test):
  print_model_title("Bagging Calssifier SVM")
 
  clf =BaggingClassifier(base_estimator=LinearSVC(C=0.01, class_weight='balanced', penalty='l2'),
                       n_estimators=10, random_state=42)
  clf.fit(x_train, y_train)
  y_hat = clf .predict(x_test)
  print_statistics(y_test, y_hat)



from sklearn.ensemble import GradientBoostingClassifier
def gradient_boosting(x_train, y_train, x_test, y_test):
  print_model_title("Gradient Boosting")
 
  clf =GradientBoostingClassifier(n_estimators=100, learning_rate=0.01,max_depth=30, random_state=42)
  clf.fit(x_train, y_train)
  y_hat = clf .predict(x_test)
  print_statistics(y_test, y_hat)

### Saving list of packages to requirements text file

In [None]:
#!pipreqs . --force

INFO: Successfully saved requirements file in ./requirements.txt


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df=pd.read_csv('/content/drive/MyDrive/Omdena/RadmolAI/Data/Lung_Cancer_NEW_original_dataset.csv')

In [6]:
# check for percentage distribution of target variable "Case_Control", which indicates class 0 as non-cancer and class 1 as cancer
df['Case_Control'].value_counts(normalize=True)

0    0.833333
1    0.166667
Name: Case_Control, dtype: float64

In [7]:
df.head()

Unnamed: 0,patientid,Case_Control,group_code,Sex,age_at_dendrite_date,edob,dendritedate,definite_check_exclude,differentiation,histology_1,...,smoking4,_est_q,score,age_group,_est_qu,chest_pain,time_chest_pain,_Ismoking4_2,_Ismoking4_3,_Ismoking4_4
0,1203,1,2,1,72.887062,-12234,24 May 99,Definite,2.0,2,...,2,1,0.94351,2,1,0,0,1,0,0
1,1771,0,2,1,72.599586,-12129,24 May 99,,,2,...,4,1,0.000216,2,1,0,0,0,0,1
2,1176,0,2,1,73.237511,-12362,24 May 99,,,2,...,4,1,0.055466,2,1,0,0,0,0,1
3,1239,0,2,1,72.651611,-12148,24 May 99,,,2,...,2,1,0.000375,2,1,0,0,1,0,0
4,1970,0,2,1,73.237511,-12362,24 May 99,,,2,...,4,1,0.000216,2,1,0,0,0,0,1


## Data pre-processing


*   Fill Nan with -1
*   label encoder for string columns
*   Drop PatientID column




In [8]:
# replace missing values with -1
df_filled=df.fillna(-1)

In [9]:
# check for columns other than numeric columns
column_names_string= df_filled.select_dtypes(exclude=[np.number]).columns

In [10]:
column_names_string

Index(['dendritedate', 'definite_check_exclude', 'cancer_staging'], dtype='object')

In [11]:
# encode the object columns
for col in column_names_string:
  le = preprocessing.LabelEncoder()
  df_filled[col] = le.fit_transform(df[col].values)

In [12]:
# drop primary keys column
df_filled.drop("patientid", axis=1, inplace=True)


In [13]:
len(df_filled)

1482

In [14]:
# check dataset shape
df_filled.shape

(1482, 1249)

### Train-Test-Split

In [15]:
column_df=list(df_filled.columns)

In [16]:
column_df.remove('Case_Control')

In [17]:
X=df_filled[column_df]
y=df_filled['Case_Control']

In [18]:
# using stratification on target variable y to balance the train-test split - Shweta C.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=20,stratify=y)


In [None]:
#df_test=X_test.copy()
#df_test['target']=y_test
# df_test.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/test.csv')

In [None]:
# df_train=X_train.copy()
# df_train['target']=y_train
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/train_unbalanced.csv')

In [19]:
len(y_test)

297

In [20]:
count_class_0, count_class_1 = y_test.value_counts(normalize=True)
print(count_class_0)
print(count_class_1)

0.8316498316498316
0.16835016835016836


In [21]:
import warnings
warnings.filterwarnings('ignore')

## Baseline Models

Models based on balanced class_weights for original data, after applying stratification on y variable

In [None]:
linear_svm_grid(X_train,y_train,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.859
Precision: 0.844
Recall: 0.859
F_score: 0.847

Balanced Accuracy:  0.6836842105263158
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       247
           1       0.62      0.42      0.50        50

    accuracy                           0.86       297
   macro avg       0.75      0.68      0.71       297
weighted avg       0.84      0.86      0.85       297



In [None]:
nonlinear_svm_grid(X_train,y_train,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
logistic_regression_grid(X_train,y_train,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.852
Precision: 0.850
Recall: 0.852
F_score: 0.851

Balanced Accuracy:  0.7274898785425101
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.56      0.54      0.55        50

    accuracy                           0.85       297
   macro avg       0.74      0.73      0.73       297
weighted avg       0.85      0.85      0.85       297



In [None]:
linear_svm(X_train,y_train,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.865
Precision: 0.858
Recall: 0.865
F_score: 0.861

Balanced Accuracy:  0.7276113360323887
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       247
           1       0.62      0.52      0.57        50

    accuracy                           0.87       297
   macro avg       0.76      0.73      0.74       297
weighted avg       0.86      0.87      0.86       297



In [None]:
logistic_regression(X_train,y_train,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.848
Precision: 0.847
Recall: 0.848
F_score: 0.848

Balanced Accuracy:  0.7254655870445345
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.55      0.54      0.55        50

    accuracy                           0.85       297
   macro avg       0.73      0.73      0.73       297
weighted avg       0.85      0.85      0.85       297



In [None]:
random_forest(X_train,y_train,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.987
Precision: 0.987
Recall: 0.987
F_score: 0.986

Balanced Accuracy:  0.96
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       247
           1       1.00      0.92      0.96        50

    accuracy                           0.99       297
   macro avg       0.99      0.96      0.98       297
weighted avg       0.99      0.99      0.99       297



In [None]:
xg_boost(X_train,y_train,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_train,y_train,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_train,y_train,X_test,y_test)



   Voting Classifier

Accuracy: 0.933
Precision: 0.938
Recall: 0.933
F_score: 0.926

Balanced Accuracy:  0.8
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       247
           1       1.00      0.60      0.75        50

    accuracy                           0.93       297
   macro avg       0.96      0.80      0.86       297
weighted avg       0.94      0.93      0.93       297



In [None]:
Bagging_Classifier_LR(X_train,y_train,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.862
Precision: 0.847
Recall: 0.862
F_score: 0.850

Balanced Accuracy:  0.6857085020242915
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       247
           1       0.64      0.42      0.51        50

    accuracy                           0.86       297
   macro avg       0.76      0.69      0.71       297
weighted avg       0.85      0.86      0.85       297



In [None]:
Bagging_Classifier_SVM(X_train,y_train,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.865
Precision: 0.854
Recall: 0.865
F_score: 0.858

Balanced Accuracy:  0.7116599190283401
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       247
           1       0.63      0.48      0.55        50

    accuracy                           0.87       297
   macro avg       0.77      0.71      0.73       297
weighted avg       0.85      0.87      0.86       297



In [None]:
gradient_boosting(X_train,y_train,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



## Sampling of dataset for balancing the classes

## Random UnderSampler

In [None]:
df_train=X_train.copy()
df_train['target']=y_train

In [None]:
# Class count
count_class_0, count_class_1 = df_train.target.value_counts()

# Divide by class
df_class_0 = df_train[df_train['target'] == 0]
df_class_1 = df_train[df_train['target'] == 1]

In [None]:
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

In [None]:
# df_test_under.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/Random_under_sampled_train.csv')

In [None]:
non_nan_cols=column_df

In [None]:
X_train_new_under_sampled=df_test_under[non_nan_cols]
y_train_new_under_sampler=df_test_under['target']

In [None]:
linear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.781
Precision: 0.835
Recall: 0.781
F_score: 0.800

Balanced Accuracy:  0.732834008097166
              precision    recall  f1-score   support

           0       0.92      0.81      0.86       247
           1       0.41      0.66      0.50        50

    accuracy                           0.78       297
   macro avg       0.66      0.73      0.68       297
weighted avg       0.83      0.78      0.80       297



In [None]:
linear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.737
Precision: 0.825
Recall: 0.737
F_score: 0.765

Balanced Accuracy:  0.7144939271255061
              precision    recall  f1-score   support

           0       0.92      0.75      0.83       247
           1       0.35      0.68      0.47        50

    accuracy                           0.74       297
   macro avg       0.64      0.71      0.65       297
weighted avg       0.83      0.74      0.77       297



In [None]:
nonlinear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
nonlinear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
logistic_regression_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.734
Precision: 0.824
Recall: 0.734
F_score: 0.763

Balanced Accuracy:  0.7124696356275304
              precision    recall  f1-score   support

           0       0.92      0.74      0.82       247
           1       0.35      0.68      0.46        50

    accuracy                           0.73       297
   macro avg       0.64      0.71      0.64       297
weighted avg       0.82      0.73      0.76       297



In [None]:
logistic_regression_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.734
Precision: 0.824
Recall: 0.734
F_score: 0.763

Balanced Accuracy:  0.7124696356275304
              precision    recall  f1-score   support

           0       0.92      0.74      0.82       247
           1       0.35      0.68      0.46        50

    accuracy                           0.73       297
   macro avg       0.64      0.71      0.64       297
weighted avg       0.82      0.73      0.76       297



In [None]:
linear_svm(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.791
Precision: 0.828
Recall: 0.791
F_score: 0.805

Balanced Accuracy:  0.7149797570850203
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       247
           1       0.42      0.60      0.49        50

    accuracy                           0.79       297
   macro avg       0.66      0.71      0.68       297
weighted avg       0.83      0.79      0.81       297



In [None]:
linear_svm(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.737
Precision: 0.829
Recall: 0.737
F_score: 0.766

Balanced Accuracy:  0.7224696356275304
              precision    recall  f1-score   support

           0       0.92      0.74      0.83       247
           1       0.36      0.70      0.47        50

    accuracy                           0.74       297
   macro avg       0.64      0.72      0.65       297
weighted avg       0.83      0.74      0.77       297



In [None]:
logistic_regression(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.717
Precision: 0.815
Recall: 0.717
F_score: 0.748

Balanced Accuracy:  0.6943724696356275
              precision    recall  f1-score   support

           0       0.91      0.73      0.81       247
           1       0.33      0.66      0.44        50

    accuracy                           0.72       297
   macro avg       0.62      0.69      0.63       297
weighted avg       0.82      0.72      0.75       297



In [None]:
logistic_regression(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.717
Precision: 0.815
Recall: 0.717
F_score: 0.748

Balanced Accuracy:  0.6943724696356275
              precision    recall  f1-score   support

           0       0.91      0.73      0.81       247
           1       0.33      0.66      0.44        50

    accuracy                           0.72       297
   macro avg       0.62      0.69      0.63       297
weighted avg       0.82      0.72      0.75       297



In [None]:
random_forest(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.966
Precision: 0.969
Recall: 0.966
F_score: 0.967

Balanced Accuracy:  0.9638056680161944
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       247
           1       0.86      0.96      0.91        50

    accuracy                           0.97       297
   macro avg       0.92      0.96      0.94       297
weighted avg       0.97      0.97      0.97       297



In [None]:
xg_boost(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



   Voting Classifier

Accuracy: 0.929
Precision: 0.927
Recall: 0.929
F_score: 0.927

Balanced Accuracy:  0.8458299595141701
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       247
           1       0.84      0.72      0.77        50

    accuracy                           0.93       297
   macro avg       0.89      0.85      0.87       297
weighted avg       0.93      0.93      0.93       297



In [None]:
Bagging_Classifier_LR(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.697
Precision: 0.802
Recall: 0.697
F_score: 0.731

Balanced Accuracy:  0.6662753036437247
              precision    recall  f1-score   support

           0       0.90      0.71      0.80       247
           1       0.30      0.62      0.41        50

    accuracy                           0.70       297
   macro avg       0.60      0.67      0.60       297
weighted avg       0.80      0.70      0.73       297



In [None]:
Bagging_Classifier_SVM(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.761
Precision: 0.828
Recall: 0.761
F_score: 0.784

Balanced Accuracy:  0.7206882591093118
              precision    recall  f1-score   support

           0       0.92      0.78      0.84       247
           1       0.38      0.66      0.48        50

    accuracy                           0.76       297
   macro avg       0.65      0.72      0.66       297
weighted avg       0.83      0.76      0.78       297



In [None]:
gradient_boosting(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



## Random Oversampler

In [None]:
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

In [None]:
# df_test_over.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/Random_Over_sampled_train.csv')


In [None]:
X_train_new_over_sampled=df_test_over[non_nan_cols]
y_train_new_over_sampler=df_test_over['target']

In [None]:
linear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.875
Precision: 0.873
Recall: 0.875
F_score: 0.874

Balanced Accuracy:  0.765587044534413
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       247
           1       0.64      0.60      0.62        50

    accuracy                           0.88       297
   macro avg       0.78      0.77      0.77       297
weighted avg       0.87      0.88      0.87       297



In [None]:
linear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.875
Precision: 0.873
Recall: 0.875
F_score: 0.874

Balanced Accuracy:  0.765587044534413
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       247
           1       0.64      0.60      0.62        50

    accuracy                           0.88       297
   macro avg       0.78      0.77      0.77       297
weighted avg       0.87      0.88      0.87       297



In [None]:
nonlinear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
nonlinear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
logistic_regression_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.845
Precision: 0.845
Recall: 0.845
F_score: 0.845

Balanced Accuracy:  0.7234412955465588
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.54      0.54      0.54        50

    accuracy                           0.85       297
   macro avg       0.72      0.72      0.72       297
weighted avg       0.85      0.85      0.85       297



In [None]:
logistic_regression_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.852
Precision: 0.857
Recall: 0.852
F_score: 0.854

Balanced Accuracy:  0.751417004048583
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       247
           1       0.56      0.60      0.58        50

    accuracy                           0.85       297
   macro avg       0.74      0.75      0.74       297
weighted avg       0.86      0.85      0.85       297



In [None]:
linear_svm(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.862
Precision: 0.851
Recall: 0.862
F_score: 0.855

Balanced Accuracy:  0.7096356275303644
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       247
           1       0.62      0.48      0.54        50

    accuracy                           0.86       297
   macro avg       0.76      0.71      0.73       297
weighted avg       0.85      0.86      0.85       297



In [None]:
linear_svm(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.862
Precision: 0.863
Recall: 0.862
F_score: 0.862

Balanced Accuracy:  0.7574898785425102
              precision    recall  f1-score   support

           0       0.92      0.91      0.92       247
           1       0.59      0.60      0.59        50

    accuracy                           0.86       297
   macro avg       0.75      0.76      0.76       297
weighted avg       0.86      0.86      0.86       297



In [None]:
logistic_regression(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.848
Precision: 0.855
Recall: 0.848
F_score: 0.851

Balanced Accuracy:  0.7493927125506072
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       247
           1       0.55      0.60      0.57        50

    accuracy                           0.85       297
   macro avg       0.73      0.75      0.74       297
weighted avg       0.85      0.85      0.85       297



In [None]:
logistic_regression(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.848
Precision: 0.855
Recall: 0.848
F_score: 0.851

Balanced Accuracy:  0.7493927125506072
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       247
           1       0.55      0.60      0.57        50

    accuracy                           0.85       297
   macro avg       0.73      0.75      0.74       297
weighted avg       0.85      0.85      0.85       297



In [None]:
random_forest(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.990
Precision: 0.990
Recall: 0.990
F_score: 0.990

Balanced Accuracy:  0.9779757085020242
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       247
           1       0.98      0.96      0.97        50

    accuracy                           0.99       297
   macro avg       0.99      0.98      0.98       297
weighted avg       0.99      0.99      0.99       297



In [None]:
xg_boost(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



   Voting Classifier

Accuracy: 0.936
Precision: 0.938
Recall: 0.936
F_score: 0.931

Balanced Accuracy:  0.8179757085020243
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       247
           1       0.97      0.64      0.77        50

    accuracy                           0.94       297
   macro avg       0.95      0.82      0.87       297
weighted avg       0.94      0.94      0.93       297



In [None]:
Bagging_Classifier_LR(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.828
Precision: 0.830
Recall: 0.828
F_score: 0.829

Balanced Accuracy:  0.6973684210526316
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       247
           1       0.49      0.50      0.50        50

    accuracy                           0.83       297
   macro avg       0.69      0.70      0.70       297
weighted avg       0.83      0.83      0.83       297



In [None]:
Bagging_Classifier_SVM(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.859
Precision: 0.859
Recall: 0.859
F_score: 0.859

Balanced Accuracy:  0.7474898785425101
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.58      0.58      0.58        50

    accuracy                           0.86       297
   macro avg       0.75      0.75      0.75       297
weighted avg       0.86      0.86      0.86       297



In [None]:
gradient_boosting(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



## Python imbalanced-learn module


In [None]:
import imblearn


### Random under-sampling and over-sampling with imbalanced-learn


#### Random under-sampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train,y_train)

In [None]:
# df_train=X_rus.copy()
# df_train['target']=y_rus
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_Random_under_sampled_train.csv')



In [None]:
linear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.751
Precision: 0.821
Recall: 0.751
F_score: 0.775

Balanced Accuracy:  0.7066396761133603
              precision    recall  f1-score   support

           0       0.91      0.77      0.84       247
           1       0.36      0.64      0.46        50

    accuracy                           0.75       297
   macro avg       0.64      0.71      0.65       297
weighted avg       0.82      0.75      0.77       297



In [None]:
linear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.737
Precision: 0.817
Recall: 0.737
F_score: 0.764

Balanced Accuracy:  0.6985425101214575
              precision    recall  f1-score   support

           0       0.91      0.76      0.83       247
           1       0.35      0.64      0.45        50

    accuracy                           0.74       297
   macro avg       0.63      0.70      0.64       297
weighted avg       0.82      0.74      0.76       297



In [None]:
nonlinear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
nonlinear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
logistic_regression_grid(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.741
Precision: 0.826
Recall: 0.741
F_score: 0.768

Balanced Accuracy:  0.7165182186234818
              precision    recall  f1-score   support

           0       0.92      0.75      0.83       247
           1       0.36      0.68      0.47        50

    accuracy                           0.74       297
   macro avg       0.64      0.72      0.65       297
weighted avg       0.83      0.74      0.77       297



In [None]:
logistic_regression_grid(X_rus,y_rus,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.741
Precision: 0.826
Recall: 0.741
F_score: 0.768

Balanced Accuracy:  0.7165182186234818
              precision    recall  f1-score   support

           0       0.92      0.75      0.83       247
           1       0.36      0.68      0.47        50

    accuracy                           0.74       297
   macro avg       0.64      0.72      0.65       297
weighted avg       0.83      0.74      0.77       297



In [None]:
linear_svm(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.751
Precision: 0.821
Recall: 0.751
F_score: 0.775

Balanced Accuracy:  0.7066396761133603
              precision    recall  f1-score   support

           0       0.91      0.77      0.84       247
           1       0.36      0.64      0.46        50

    accuracy                           0.75       297
   macro avg       0.64      0.71      0.65       297
weighted avg       0.82      0.75      0.77       297



In [None]:
linear_svm(X_rus,y_rus,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.754
Precision: 0.819
Recall: 0.754
F_score: 0.777

Balanced Accuracy:  0.7006882591093118
              precision    recall  f1-score   support

           0       0.91      0.78      0.84       247
           1       0.36      0.62      0.46        50

    accuracy                           0.75       297
   macro avg       0.64      0.70      0.65       297
weighted avg       0.82      0.75      0.78       297



In [None]:
logistic_regression(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.714
Precision: 0.819
Recall: 0.714
F_score: 0.746

Balanced Accuracy:  0.7003238866396762
              precision    recall  f1-score   support

           0       0.92      0.72      0.81       247
           1       0.33      0.68      0.44        50

    accuracy                           0.71       297
   macro avg       0.62      0.70      0.63       297
weighted avg       0.82      0.71      0.75       297



In [None]:
logistic_regression(X_rus,y_rus,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.714
Precision: 0.819
Recall: 0.714
F_score: 0.746

Balanced Accuracy:  0.7003238866396762
              precision    recall  f1-score   support

           0       0.92      0.72      0.81       247
           1       0.33      0.68      0.44        50

    accuracy                           0.71       297
   macro avg       0.62      0.70      0.63       297
weighted avg       0.82      0.71      0.75       297



In [None]:
random_forest(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.960
Precision: 0.964
Recall: 0.960
F_score: 0.961

Balanced Accuracy:  0.9597570850202428
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       247
           1       0.83      0.96      0.89        50

    accuracy                           0.96       297
   macro avg       0.91      0.96      0.93       297
weighted avg       0.96      0.96      0.96       297



In [None]:
xg_boost(X_rus,y_rus,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_rus,y_rus,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_rus,y_rus,X_test,y_test)



   Voting Classifier

Accuracy: 0.912
Precision: 0.911
Recall: 0.912
F_score: 0.912

Balanced Accuracy:  0.8357085020242915
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       247
           1       0.75      0.72      0.73        50

    accuracy                           0.91       297
   macro avg       0.85      0.84      0.84       297
weighted avg       0.91      0.91      0.91       297



In [None]:
Bagging_Classifier_LR(X_rus,y_rus,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.747
Precision: 0.816
Recall: 0.747
F_score: 0.771

Balanced Accuracy:  0.6966396761133603
              precision    recall  f1-score   support

           0       0.91      0.77      0.84       247
           1       0.36      0.62      0.45        50

    accuracy                           0.75       297
   macro avg       0.63      0.70      0.64       297
weighted avg       0.82      0.75      0.77       297



In [None]:
Bagging_Classifier_SVM(X_rus,y_rus,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.774
Precision: 0.848
Recall: 0.774
F_score: 0.797

Balanced Accuracy:  0.7606882591093118
              precision    recall  f1-score   support

           0       0.94      0.78      0.85       247
           1       0.41      0.74      0.52        50

    accuracy                           0.77       297
   macro avg       0.67      0.76      0.69       297
weighted avg       0.85      0.77      0.80       297



In [None]:
gradient_boosting(X_rus,y_rus,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



#### Random over-sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X_train,y_train)

In [None]:
# df_train=X_ros.copy()
# df_train['target']=y_ros
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_Random_over_sampled_train.csv')

In [None]:
linear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.859
Precision: 0.854
Recall: 0.859
F_score: 0.856

Balanced Accuracy:  0.7315384615384616
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       247
           1       0.59      0.54      0.56        50

    accuracy                           0.86       297
   macro avg       0.75      0.73      0.74       297
weighted avg       0.85      0.86      0.86       297



In [None]:
linear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.859
Precision: 0.859
Recall: 0.859
F_score: 0.859

Balanced Accuracy:  0.7474898785425101
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.58      0.58      0.58        50

    accuracy                           0.86       297
   macro avg       0.75      0.75      0.75       297
weighted avg       0.86      0.86      0.86       297



In [None]:
nonlinear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
nonlinear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
logistic_regression_grid(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.845
Precision: 0.843
Recall: 0.845
F_score: 0.844

Balanced Accuracy:  0.7154655870445344
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       247
           1       0.54      0.52      0.53        50

    accuracy                           0.85       297
   macro avg       0.72      0.72      0.72       297
weighted avg       0.84      0.85      0.84       297



In [None]:
logistic_regression_grid(X_ros,y_ros,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.842
Precision: 0.846
Recall: 0.842
F_score: 0.844

Balanced Accuracy:  0.7293927125506073
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.53      0.56      0.54        50

    accuracy                           0.84       297
   macro avg       0.72      0.73      0.72       297
weighted avg       0.85      0.84      0.84       297



In [None]:
linear_svm(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.852
Precision: 0.852
Recall: 0.852
F_score: 0.852

Balanced Accuracy:  0.7354655870445345
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.56      0.56      0.56        50

    accuracy                           0.85       297
   macro avg       0.74      0.74      0.74       297
weighted avg       0.85      0.85      0.85       297



In [None]:
linear_svm(X_ros,y_ros,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.855
Precision: 0.859
Recall: 0.855
F_score: 0.857

Balanced Accuracy:  0.7534412955465587
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       247
           1       0.57      0.60      0.58        50

    accuracy                           0.86       297
   macro avg       0.74      0.75      0.75       297
weighted avg       0.86      0.86      0.86       297



In [None]:
logistic_regression(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.838
Precision: 0.844
Recall: 0.838
F_score: 0.841

Balanced Accuracy:  0.7273684210526317
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       247
           1       0.52      0.56      0.54        50

    accuracy                           0.84       297
   macro avg       0.71      0.73      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
logistic_regression(X_ros,y_ros,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.838
Precision: 0.844
Recall: 0.838
F_score: 0.841

Balanced Accuracy:  0.7273684210526317
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       247
           1       0.52      0.56      0.54        50

    accuracy                           0.84       297
   macro avg       0.71      0.73      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
random_forest(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.990
Precision: 0.990
Recall: 0.990
F_score: 0.990

Balanced Accuracy:  0.9779757085020242
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       247
           1       0.98      0.96      0.97        50

    accuracy                           0.99       297
   macro avg       0.99      0.98      0.98       297
weighted avg       0.99      0.99      0.99       297



In [None]:
xg_boost(X_ros,y_ros,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_ros,y_ros,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_ros,y_ros,X_test,y_test)



   Voting Classifier

Accuracy: 0.946
Precision: 0.949
Recall: 0.946
F_score: 0.942

Balanced Accuracy:  0.8400000000000001
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       247
           1       1.00      0.68      0.81        50

    accuracy                           0.95       297
   macro avg       0.97      0.84      0.89       297
weighted avg       0.95      0.95      0.94       297



In [None]:
Bagging_Classifier_LR(X_ros,y_ros,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.818
Precision: 0.830
Recall: 0.818
F_score: 0.823

Balanced Accuracy:  0.7072469635627531
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       247
           1       0.47      0.54      0.50        50

    accuracy                           0.82       297
   macro avg       0.68      0.71      0.69       297
weighted avg       0.83      0.82      0.82       297



In [None]:
Bagging_Classifier_SVM(X_ros,y_ros,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.852
Precision: 0.852
Recall: 0.852
F_score: 0.852

Balanced Accuracy:  0.7354655870445345
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.56      0.56      0.56        50

    accuracy                           0.85       297
   macro avg       0.74      0.74      0.74       297
weighted avg       0.85      0.85      0.85       297



In [None]:
gradient_boosting(X_ros,y_ros,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



#### Under-sampling: Tomek links


In [None]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks( sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X_train,y_train)

In [None]:
# df_train=X_tl.copy()
# df_train['target']=y_tl
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_TomekLinks_Random_over_sampled_train.csv')

In [None]:
linear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.855
Precision: 0.841
Recall: 0.855
F_score: 0.845

Balanced Accuracy:  0.6816599190283401
              precision    recall  f1-score   support

           0       0.89      0.94      0.92       247
           1       0.60      0.42      0.49        50

    accuracy                           0.86       297
   macro avg       0.74      0.68      0.70       297
weighted avg       0.84      0.86      0.84       297



In [None]:
linear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.862
Precision: 0.850
Recall: 0.862
F_score: 0.853

Balanced Accuracy:  0.7016599190283401
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       247
           1       0.62      0.46      0.53        50

    accuracy                           0.86       297
   macro avg       0.76      0.70      0.72       297
weighted avg       0.85      0.86      0.85       297



In [None]:
nonlinear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
nonlinear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
logistic_regression_grid(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.859
Precision: 0.863
Recall: 0.859
F_score: 0.861

Balanced Accuracy:  0.7634412955465587
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       247
           1       0.57      0.62      0.60        50

    accuracy                           0.86       297
   macro avg       0.75      0.76      0.76       297
weighted avg       0.86      0.86      0.86       297



In [None]:
logistic_regression_grid(X_tl,y_tl,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.825
Precision: 0.817
Recall: 0.825
F_score: 0.820

Balanced Accuracy:  0.6634412955465587
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       247
           1       0.48      0.42      0.45        50

    accuracy                           0.82       297
   macro avg       0.68      0.66      0.67       297
weighted avg       0.82      0.82      0.82       297



In [None]:
linear_svm(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.855
Precision: 0.850
Recall: 0.855
F_score: 0.852

Balanced Accuracy:  0.7215384615384616
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       247
           1       0.58      0.52      0.55        50

    accuracy                           0.86       297
   macro avg       0.74      0.72      0.73       297
weighted avg       0.85      0.86      0.85       297



In [None]:
linear_svm(X_tl,y_tl,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.872
Precision: 0.863
Recall: 0.872
F_score: 0.866

Balanced Accuracy:  0.7316599190283402
              precision    recall  f1-score   support

           0       0.91      0.94      0.92       247
           1       0.65      0.52      0.58        50

    accuracy                           0.87       297
   macro avg       0.78      0.73      0.75       297
weighted avg       0.86      0.87      0.87       297



In [None]:
logistic_regression(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.842
Precision: 0.843
Recall: 0.842
F_score: 0.842

Balanced Accuracy:  0.721417004048583
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.53      0.54      0.53        50

    accuracy                           0.84       297
   macro avg       0.72      0.72      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
logistic_regression(X_tl,y_tl,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.825
Precision: 0.817
Recall: 0.825
F_score: 0.820

Balanced Accuracy:  0.6634412955465587
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       247
           1       0.48      0.42      0.45        50

    accuracy                           0.82       297
   macro avg       0.68      0.66      0.67       297
weighted avg       0.82      0.82      0.82       297



In [None]:
random_forest(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.987
Precision: 0.987
Recall: 0.987
F_score: 0.986

Balanced Accuracy:  0.96
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       247
           1       1.00      0.92      0.96        50

    accuracy                           0.99       297
   macro avg       0.99      0.96      0.98       297
weighted avg       0.99      0.99      0.99       297



In [None]:
xg_boost(X_tl,y_tl,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_tl,y_tl,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_tl,y_tl,X_test,y_test)



   Voting Classifier

Accuracy: 0.929
Precision: 0.935
Recall: 0.929
F_score: 0.921

Balanced Accuracy:  0.79
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       247
           1       1.00      0.58      0.73        50

    accuracy                           0.93       297
   macro avg       0.96      0.79      0.85       297
weighted avg       0.93      0.93      0.92       297



In [None]:
Bagging_Classifier_LR(X_tl,y_tl,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.832
Precision: 0.816
Recall: 0.832
F_score: 0.822

Balanced Accuracy:  0.6515384615384616
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       247
           1       0.50      0.38      0.43        50

    accuracy                           0.83       297
   macro avg       0.69      0.65      0.67       297
weighted avg       0.82      0.83      0.82       297



In [None]:
Bagging_Classifier_SVM(X_tl,y_tl,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.865
Precision: 0.850
Recall: 0.865
F_score: 0.851

Balanced Accuracy:  0.6797570850202429
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       247
           1       0.67      0.40      0.50        50

    accuracy                           0.87       297
   macro avg       0.78      0.68      0.71       297
weighted avg       0.85      0.87      0.85       297



In [None]:
gradient_boosting(X_tl,y_tl,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



#### Under-sampling: Cluster Centroids


In [None]:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids()
X_cc, y_cc = cc.fit_resample(X_train,y_train)

In [None]:
# df_train=X_cc.copy()
# df_train['target']=y_cc
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_ClusterCentroids_Random_under_sampled_train.csv')

In [None]:
linear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.684
Precision: 0.802
Recall: 0.684
F_score: 0.721

Balanced Accuracy:  0.6661538461538461
              precision    recall  f1-score   support

           0       0.90      0.69      0.78       247
           1       0.30      0.64      0.41        50

    accuracy                           0.68       297
   macro avg       0.60      0.67      0.59       297
weighted avg       0.80      0.68      0.72       297



In [None]:
linear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.680
Precision: 0.806
Recall: 0.680
F_score: 0.718

Balanced Accuracy:  0.6721052631578948
              precision    recall  f1-score   support

           0       0.91      0.68      0.78       247
           1       0.30      0.66      0.41        50

    accuracy                           0.68       297
   macro avg       0.60      0.67      0.60       297
weighted avg       0.81      0.68      0.72       297



In [None]:
nonlinear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
nonlinear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.168
Precision: 0.028
Recall: 0.168
F_score: 0.049

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       247
           1       0.17      1.00      0.29        50

    accuracy                           0.17       297
   macro avg       0.08      0.50      0.14       297
weighted avg       0.03      0.17      0.05       297



In [None]:
logistic_regression_grid(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.660
Precision: 0.784
Recall: 0.660
F_score: 0.700

Balanced Accuracy:  0.6280566801619434
              precision    recall  f1-score   support

           0       0.89      0.68      0.77       247
           1       0.27      0.58      0.36        50

    accuracy                           0.66       297
   macro avg       0.58      0.63      0.57       297
weighted avg       0.78      0.66      0.70       297



In [None]:
logistic_regression_grid(X_cc,y_cc,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.636
Precision: 0.761
Recall: 0.636
F_score: 0.679

Balanced Accuracy:  0.5819838056680162
              precision    recall  f1-score   support

           0       0.87      0.66      0.75       247
           1       0.23      0.50      0.32        50

    accuracy                           0.64       297
   macro avg       0.55      0.58      0.53       297
weighted avg       0.76      0.64      0.68       297



In [None]:
linear_svm(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.721
Precision: 0.804
Recall: 0.721
F_score: 0.749

Balanced Accuracy:  0.6724696356275304
              precision    recall  f1-score   support

           0       0.90      0.74      0.82       247
           1       0.32      0.60      0.42        50

    accuracy                           0.72       297
   macro avg       0.61      0.67      0.62       297
weighted avg       0.80      0.72      0.75       297



In [None]:
linear_svm(X_cc,y_cc,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.687
Precision: 0.803
Recall: 0.687
F_score: 0.723

Balanced Accuracy:  0.6681781376518219
              precision    recall  f1-score   support

           0       0.91      0.70      0.79       247
           1       0.30      0.64      0.41        50

    accuracy                           0.69       297
   macro avg       0.60      0.67      0.60       297
weighted avg       0.80      0.69      0.72       297



In [None]:
logistic_regression(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.646
Precision: 0.772
Recall: 0.646
F_score: 0.688

Balanced Accuracy:  0.6040080971659919
              precision    recall  f1-score   support

           0       0.88      0.67      0.76       247
           1       0.25      0.54      0.34        50

    accuracy                           0.65       297
   macro avg       0.56      0.60      0.55       297
weighted avg       0.77      0.65      0.69       297



In [None]:
logistic_regression(X_cc,y_cc,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.646
Precision: 0.772
Recall: 0.646
F_score: 0.688

Balanced Accuracy:  0.6040080971659919
              precision    recall  f1-score   support

           0       0.88      0.67      0.76       247
           1       0.25      0.54      0.34        50

    accuracy                           0.65       297
   macro avg       0.56      0.60      0.55       297
weighted avg       0.77      0.65      0.69       297



In [None]:
random_forest(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.805
Precision: 0.905
Recall: 0.805
F_score: 0.827

Balanced Accuracy:  0.8746153846153846
              precision    recall  f1-score   support

           0       0.99      0.77      0.87       247
           1       0.46      0.98      0.63        50

    accuracy                           0.80       297
   macro avg       0.73      0.87      0.75       297
weighted avg       0.91      0.80      0.83       297



In [None]:
xg_boost(X_cc,y_cc,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_cc,y_cc,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_cc,y_cc,X_test,y_test)



   Voting Classifier

Accuracy: 0.909
Precision: 0.904
Recall: 0.909
F_score: 0.903

Balanced Accuracy:  0.78582995951417
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       247
           1       0.81      0.60      0.69        50

    accuracy                           0.91       297
   macro avg       0.87      0.79      0.82       297
weighted avg       0.90      0.91      0.90       297



In [None]:
Bagging_Classifier_LR(X_cc,y_cc,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.684
Precision: 0.778
Recall: 0.684
F_score: 0.717

Balanced Accuracy:  0.6182995951417004
              precision    recall  f1-score   support

           0       0.88      0.72      0.79       247
           1       0.27      0.52      0.36        50

    accuracy                           0.68       297
   macro avg       0.58      0.62      0.57       297
weighted avg       0.78      0.68      0.72       297



In [None]:
Bagging_Classifier_SVM(X_cc,y_cc,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.737
Precision: 0.806
Recall: 0.737
F_score: 0.762

Balanced Accuracy:  0.6746153846153846
              precision    recall  f1-score   support

           0       0.90      0.77      0.83       247
           1       0.34      0.58      0.43        50

    accuracy                           0.74       297
   macro avg       0.62      0.67      0.63       297
weighted avg       0.81      0.74      0.76       297



In [None]:
gradient_boosting(X_cc,y_cc,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



#### Over-sampling: SMOTE


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_train,y_train)

In [None]:
# df_train=X_sm.copy()
# df_train['target']=y_sm
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_SMOTE_Random_over_sampled_train.csv')

In [None]:
linear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.859
Precision: 0.856
Recall: 0.859
F_score: 0.857

Balanced Accuracy:  0.7395141700404859
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       247
           1       0.58      0.56      0.57        50

    accuracy                           0.86       297
   macro avg       0.75      0.74      0.74       297
weighted avg       0.86      0.86      0.86       297



In [None]:
linear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.859
Precision: 0.852
Recall: 0.859
F_score: 0.855

Balanced Accuracy:  0.7235627530364372
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       247
           1       0.59      0.52      0.55        50

    accuracy                           0.86       297
   macro avg       0.75      0.72      0.73       297
weighted avg       0.85      0.86      0.85       297



In [None]:
nonlinear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
nonlinear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
logistic_regression_grid(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.852
Precision: 0.852
Recall: 0.852
F_score: 0.852

Balanced Accuracy:  0.7354655870445345
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       247
           1       0.56      0.56      0.56        50

    accuracy                           0.85       297
   macro avg       0.74      0.74      0.74       297
weighted avg       0.85      0.85      0.85       297



In [None]:
logistic_regression_grid(X_sm,y_sm,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.838
Precision: 0.841
Recall: 0.838
F_score: 0.840

Balanced Accuracy:  0.7193927125506073
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.52      0.54      0.53        50

    accuracy                           0.84       297
   macro avg       0.71      0.72      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
linear_svm(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.855
Precision: 0.848
Recall: 0.855
F_score: 0.851

Balanced Accuracy:  0.7135627530364372
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       247
           1       0.58      0.50      0.54        50

    accuracy                           0.86       297
   macro avg       0.74      0.71      0.73       297
weighted avg       0.85      0.86      0.85       297



In [None]:
linear_svm(X_sm,y_sm,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.862
Precision: 0.861
Recall: 0.862
F_score: 0.861

Balanced Accuracy:  0.7495141700404858
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       247
           1       0.59      0.58      0.59        50

    accuracy                           0.86       297
   macro avg       0.75      0.75      0.75       297
weighted avg       0.86      0.86      0.86       297



In [None]:
logistic_regression(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.835
Precision: 0.836
Recall: 0.835
F_score: 0.836

Balanced Accuracy:  0.7093927125506073
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       247
           1       0.51      0.52      0.51        50

    accuracy                           0.84       297
   macro avg       0.71      0.71      0.71       297
weighted avg       0.84      0.84      0.84       297



In [None]:
logistic_regression(X_sm,y_sm,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.835
Precision: 0.836
Recall: 0.835
F_score: 0.836

Balanced Accuracy:  0.7093927125506073
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       247
           1       0.51      0.52      0.51        50

    accuracy                           0.84       297
   macro avg       0.71      0.71      0.71       297
weighted avg       0.84      0.84      0.84       297



In [None]:
random_forest(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.990
Precision: 0.990
Recall: 0.990
F_score: 0.990

Balanced Accuracy:  0.97
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       247
           1       1.00      0.94      0.97        50

    accuracy                           0.99       297
   macro avg       0.99      0.97      0.98       297
weighted avg       0.99      0.99      0.99       297



In [None]:
xg_boost(X_sm,y_sm,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_sm,y_sm,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 0.997
Precision: 0.997
Recall: 0.997
F_score: 0.997

Balanced Accuracy:  0.99
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      0.98      0.99        50

    accuracy                           1.00       297
   macro avg       1.00      0.99      0.99       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_sm,y_sm,X_test,y_test)



   Voting Classifier

Accuracy: 0.926
Precision: 0.932
Recall: 0.926
F_score: 0.917

Balanced Accuracy:  0.78
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       247
           1       1.00      0.56      0.72        50

    accuracy                           0.93       297
   macro avg       0.96      0.78      0.84       297
weighted avg       0.93      0.93      0.92       297



In [None]:
Bagging_Classifier_LR(X_sm,y_sm,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.842
Precision: 0.846
Recall: 0.842
F_score: 0.844

Balanced Accuracy:  0.7293927125506073
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.53      0.56      0.54        50

    accuracy                           0.84       297
   macro avg       0.72      0.73      0.72       297
weighted avg       0.85      0.84      0.84       297



In [None]:
Bagging_Classifier_SVM(X_sm,y_sm,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.862
Precision: 0.859
Recall: 0.862
F_score: 0.860

Balanced Accuracy:  0.7415384615384616
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       247
           1       0.60      0.56      0.58        50

    accuracy                           0.86       297
   macro avg       0.75      0.74      0.75       297
weighted avg       0.86      0.86      0.86       297



In [None]:
gradient_boosting(X_sm,y_sm,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



#### Over-sampling followed by under-sampling


In [None]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(sampling_strategy='auto')
X_smt, y_smt = smt.fit_resample(X_train,y_train)

In [None]:
# df_train=X_smt.copy()
# df_train['target']=y_smt
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_SMOTETomek_Random_over_under_sampled_train.csv')

In [None]:
linear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.855
Precision: 0.848
Recall: 0.855
F_score: 0.851

Balanced Accuracy:  0.7135627530364372
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       247
           1       0.58      0.50      0.54        50

    accuracy                           0.86       297
   macro avg       0.74      0.71      0.73       297
weighted avg       0.85      0.86      0.85       297



In [None]:
linear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.842
Precision: 0.840
Recall: 0.842
F_score: 0.841

Balanced Accuracy:  0.7134412955465588
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       247
           1       0.53      0.52      0.53        50

    accuracy                           0.84       297
   macro avg       0.72      0.71      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
nonlinear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
nonlinear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.832
Precision: 0.692
Recall: 0.832
F_score: 0.755

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       247
           1       0.00      0.00      0.00        50

    accuracy                           0.83       297
   macro avg       0.42      0.50      0.45       297
weighted avg       0.69      0.83      0.76       297



In [None]:
logistic_regression_grid(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.838
Precision: 0.841
Recall: 0.838
F_score: 0.840

Balanced Accuracy:  0.7193927125506073
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.52      0.54      0.53        50

    accuracy                           0.84       297
   macro avg       0.71      0.72      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
logistic_regression_grid(X_smt,y_smt,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.838
Precision: 0.841
Recall: 0.838
F_score: 0.840

Balanced Accuracy:  0.7193927125506073
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.52      0.54      0.53        50

    accuracy                           0.84       297
   macro avg       0.71      0.72      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
linear_svm(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.838
Precision: 0.841
Recall: 0.838
F_score: 0.840

Balanced Accuracy:  0.7193927125506073
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       247
           1       0.52      0.54      0.53        50

    accuracy                           0.84       297
   macro avg       0.71      0.72      0.72       297
weighted avg       0.84      0.84      0.84       297



In [None]:
linear_svm(X_smt,y_smt,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.855
Precision: 0.848
Recall: 0.855
F_score: 0.851

Balanced Accuracy:  0.7135627530364372
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       247
           1       0.58      0.50      0.54        50

    accuracy                           0.86       297
   macro avg       0.74      0.71      0.73       297
weighted avg       0.85      0.86      0.85       297



In [None]:
logistic_regression(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.835
Precision: 0.836
Recall: 0.835
F_score: 0.836

Balanced Accuracy:  0.7093927125506073
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       247
           1       0.51      0.52      0.51        50

    accuracy                           0.84       297
   macro avg       0.71      0.71      0.71       297
weighted avg       0.84      0.84      0.84       297



In [None]:
logistic_regression(X_smt,y_smt,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.835
Precision: 0.836
Recall: 0.835
F_score: 0.836

Balanced Accuracy:  0.7093927125506073
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       247
           1       0.51      0.52      0.51        50

    accuracy                           0.84       297
   macro avg       0.71      0.71      0.71       297
weighted avg       0.84      0.84      0.84       297



In [None]:
random_forest(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.987
Precision: 0.986
Recall: 0.987
F_score: 0.986

Balanced Accuracy:  0.9679757085020242
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       247
           1       0.98      0.94      0.96        50

    accuracy                           0.99       297
   macro avg       0.98      0.97      0.98       297
weighted avg       0.99      0.99      0.99       297



In [None]:
xg_boost(X_smt,y_smt,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
ensemble_stacked(X_smt,y_smt,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_smt,y_smt,X_test,y_test)



   Voting Classifier

Accuracy: 0.936
Precision: 0.941
Recall: 0.936
F_score: 0.930

Balanced Accuracy:  0.81
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       247
           1       1.00      0.62      0.77        50

    accuracy                           0.94       297
   macro avg       0.96      0.81      0.86       297
weighted avg       0.94      0.94      0.93       297



In [None]:
Bagging_Classifier_LR(X_smt,y_smt,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.832
Precision: 0.834
Recall: 0.832
F_score: 0.833

Balanced Accuracy:  0.7073684210526316
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       247
           1       0.50      0.52      0.51        50

    accuracy                           0.83       297
   macro avg       0.70      0.71      0.70       297
weighted avg       0.83      0.83      0.83       297



In [None]:
Bagging_Classifier_SVM(X_smt,y_smt,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.845
Precision: 0.840
Recall: 0.845
F_score: 0.843

Balanced Accuracy:  0.7074898785425101
              precision    recall  f1-score   support

           0       0.90      0.91      0.91       247
           1       0.54      0.50      0.52        50

    accuracy                           0.85       297
   macro avg       0.72      0.71      0.71       297
weighted avg       0.84      0.85      0.84       297



In [None]:
gradient_boosting(X_smt,y_smt,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



## Recommended Models

In [None]:
random_forest(X_rus,y_rus,X_test,y_test,class_ratio='balanced')
# Recommended model no.1, since it has best class 1 recall, class 0 precision, weighted f1-score and BACC score amongst all models - Shweta C. 06/10


       Random Forest

Accuracy: 0.960
Precision: 0.964
Recall: 0.960
F_score: 0.961

Balanced Accuracy:  0.9597570850202428
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       247
           1       0.83      0.96      0.89        50

    accuracy                           0.96       297
   macro avg       0.91      0.96      0.93       297
weighted avg       0.96      0.96      0.96       297



In [None]:
voting_classifiers(X_rus,y_rus,X_test,y_test)
# Recommended model no.2


   Voting Classifier

Accuracy: 0.943
Precision: 0.949
Recall: 0.943
F_score: 0.945

Balanced Accuracy:  0.9336842105263158
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       247
           1       0.78      0.92      0.84        50

    accuracy                           0.94       297
   macro avg       0.88      0.93      0.90       297
weighted avg       0.95      0.94      0.94       297



In [None]:
voting_classifiers(X_smt,y_smt,X_test,y_test)



   Voting Classifier

Accuracy: 0.936
Precision: 0.941
Recall: 0.936
F_score: 0.930

Balanced Accuracy:  0.81
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       247
           1       1.00      0.62      0.77        50

    accuracy                           0.94       297
   macro avg       0.96      0.81      0.86       297
weighted avg       0.94      0.94      0.93       297



In [None]:
voting_classifiers(X_sm,y_sm,X_test,y_test)



   Voting Classifier

Accuracy: 0.926
Precision: 0.932
Recall: 0.926
F_score: 0.917

Balanced Accuracy:  0.78
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       247
           1       1.00      0.56      0.72        50

    accuracy                           0.93       297
   macro avg       0.96      0.78      0.84       297
weighted avg       0.93      0.93      0.92       297



In [None]:
random_forest(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 0.805
Precision: 0.905
Recall: 0.805
F_score: 0.827

Balanced Accuracy:  0.8746153846153846
              precision    recall  f1-score   support

           0       0.99      0.77      0.87       247
           1       0.46      0.98      0.63        50

    accuracy                           0.80       297
   macro avg       0.73      0.87      0.75       297
weighted avg       0.91      0.80      0.83       297



In [None]:
ensemble_stacked(X_smt,y_smt,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297



In [None]:
voting_classifiers(X_train,y_train,X_test,y_test)



   Voting Classifier

Accuracy: 0.929
Precision: 0.935
Recall: 0.929
F_score: 0.921

Balanced Accuracy:  0.79
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       247
           1       1.00      0.58      0.73        50

    accuracy                           0.93       297
   macro avg       0.96      0.79      0.85       297
weighted avg       0.93      0.93      0.92       297



In [None]:
xg_boost(X_train,y_train,X_test,y_test)



             XGBoost

Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       247
           1       1.00      1.00      1.00        50

    accuracy                           1.00       297
   macro avg       1.00      1.00      1.00       297
weighted avg       1.00      1.00      1.00       297

