# Colorectal cancer full features models with balancing
**Solution notebook by Reem Abdel-Salam, Reviewed and Updated by Shweta Chandole**

Date: 04-June-2022

## Environment preparation

### Import Libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np
import sys
import argparse
import csv
import regex 
import sklearn
import imblearn

from sklearn.svm import LinearSVC
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from scipy.sparse import *
from sklearn.base import TransformerMixin
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.util import ngrams
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn import utils
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier


### Define model and metrics creation functions

In [None]:
# define custom functions to run classification modules, build classifier models and calculate metrics

def print_statistics(y, y_pred):
    """Print a metrics summary for a set of predictions and return the headline numbers.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y``.

    Returns
    -------
    tuple
        ``(accuracy, weighted_precision, weighted_recall, weighted_f1_score,
        balanced_accuracy)``. Precision, recall and F1 use ``average='weighted'``.
    """
    acc = metrics.accuracy_score(y, y_pred)
    # The three class-weighted scores share one call shape, so compute them in one pass.
    w_precision, w_recall, w_f1 = [
        scorer(y, y_pred, average='weighted')
        for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    ]
    bal_acc = balanced_accuracy_score(y, y_pred)

    summary = 'Accuracy: %.3f\nWeighted_Precision: %.3f\nWeighted_Recall: %.3f\nWeighted_F1_score: %.3f\n'
    print(summary % (acc, w_precision, w_recall, w_f1))
    print('Balanced Accuracy: ', bal_acc)
    # Per-class breakdown, so minority-class recall is visible next to the averages.
    print(metrics.classification_report(y, y_pred))

    return acc, w_precision, w_recall, w_f1, bal_acc

# updated print_statistics function to include 'weighted_' label to the metrics using weighted avg scores - Shweta C. 06/04

def plot_coefficients(classifier, feature_names, top_features=20, plot_name="/bow_models/bow_binary_"):
    """Plot the most negative and most positive coefficients of a fitted model and save the figure.

    Parameters
    ----------
    classifier : fitted estimator exposing ``coef_``
        The coefficient array is flattened before ranking.
    feature_names : sequence
        Indexed by coefficient position to label the bars.
    top_features : int
        How many coefficients to show at each end of the ranking. Values below
        zero or above the number of coefficients are clamped, so ``0`` draws an
        empty chart instead of failing on a shape mismatch.
    plot_name : str
        Prefix inserted into the saved file name.

    Notes
    -----
    NOTE(review): this function uses the notebook-level names ``plt`` and
    ``path``; neither is defined in the visible part of the notebook. Confirm
    both exist (``import matplotlib.pyplot as plt`` and an output directory)
    before calling with feature analysis enabled.
    """
    # Get the top most positive/negative coefficients
    coef = classifier.coef_.ravel()
    # `order[-0:]` would return the whole array, so slice with explicit bounds.
    shown = max(0, min(int(top_features), coef.size))
    order = np.argsort(coef)
    top_negative_coefficients = order[:shown]
    top_positive_coefficients = order[coef.size - shown:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    x_names = [feature_names[feature] for feature in top_coefficients]

    # Plot the coefficients
    positions = np.arange(top_coefficients.size)
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    plt.xticks(positions, x_names, rotation=30, ha='right')
    plt.ylabel("Coefficient Value")
    plt.title("Visualising the top %d features taken up by an SVM model" % shown)
    # The file name keeps the requested count so callers can predict the path.
    to_save_filename = path + "/plots/" + plot_name + "top%d_coefficients.png" % top_features
    plt.savefig(to_save_filename)
    print("Coefficients' visualisation saved to %s\n" % to_save_filename)

def get_regularization_params(a=-1, b=1, c=3, d=1, e=5):
    """Build a flat grid of regularisation values.

    ``c`` log-spaced values between ``10**a`` and ``10**b`` are each multiplied
    by ``d`` and by ``e``; the products are returned row by row as a 1-D array.
    With the defaults this gives 0.1, 0.5, 1, 5, 10, 50.
    """
    decades = np.logspace(a, b, c)
    multipliers = np.array([d, e])
    # Column of decades times row of multipliers: the same table as an outer product.
    table = decades[:, np.newaxis] * multipliers[np.newaxis, :]
    return table.flatten()


def grid_classifier(x_train, y_train, x_test, y_test, model, parameters,
                    make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff",
                    scoring=None, cv=None):
    """Grid-search ``model`` over ``parameters``, then evaluate the best estimator on the test data.

    Parameters
    ----------
    x_train, y_train : training features and labels used for the grid search.
    x_test, y_test : held-out features and labels scored with ``print_statistics``.
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid for ``GridSearchCV``.
    make_feature_analysis : bool
        When true, ``plot_coefficients`` is called on the best estimator.
    feature_names, top_features, plot_name
        Forwarded unchanged to ``plot_coefficients``.
    scoring, cv
        Optional ``GridSearchCV`` settings. The defaults (``None``) keep the
        library defaults; pass e.g. ``scoring='balanced_accuracy'`` to select
        hyper-parameters with an imbalance-aware metric.

    Returns
    -------
    tuple
        ``(best_estimator, metrics)`` where ``metrics`` is the tuple returned
        by ``print_statistics``.
    """
    grid = GridSearchCV(estimator=model, param_grid=parameters,
                        scoring=scoring, cv=cv, verbose=0)
    grid.fit(x_train, y_train)
    classifier = grid.best_estimator_
    if make_feature_analysis:
        plot_coefficients(classifier, feature_names, top_features, plot_name)
    y_hat = classifier.predict(x_test)
    scores = print_statistics(y_test, y_hat)
    return classifier, scores

def print_model_title(name):
    """Print ``name`` right-aligned to 20 columns between two horizontal rules."""
    rule = "=================================================================="
    # One print call: blank line, rule, title, rule, blank line.
    print("\n" + rule, f"{name:>20}", rule + "\n", sep="\n")


def linear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
               make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    """Grid-search a linear SVM over ``C`` and report test metrics.

    The ``C`` candidates are the default output of ``get_regularization_params``.
    ``class_ratio`` is passed to ``LinearSVC`` as ``class_weight``; the
    remaining arguments are forwarded to ``grid_classifier``.
    """
    print_model_title("Linear SVM")
    search_space = {'C': get_regularization_params()}
    estimator = LinearSVC(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(
        x_train, y_train, x_test, y_test,
        model=estimator,
        parameters=search_space,
        make_feature_analysis=make_feature_analysis,
        feature_names=feature_names,
        top_features=top_features,
        plot_name=plot_name,
    )


def nonlinear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
                  make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    """Grid-search an RBF-kernel SVM over ``C`` and ``gamma`` and report test metrics.

    Both grids come from ``get_regularization_params``: ``C`` spans the decades
    10**-1..10**0 and ``gamma`` spans 10**-2..10**-1, each multiplied by 1 and 5.
    ``class_ratio`` is passed to ``SVC`` as ``class_weight``.
    """
    print_model_title("Nonlinear SVM")
    search_space = {
        'kernel': ['rbf'],
        'C': get_regularization_params(a=-1, b=0, c=2, d=1, e=5),
        'gamma': get_regularization_params(a=-2, b=-1, c=2, d=1, e=5),
    }
    estimator = SVC(class_weight=class_ratio)
    grid_classifier(
        x_train, y_train, x_test, y_test,
        model=estimator,
        parameters=search_space,
        make_feature_analysis=make_feature_analysis,
        feature_names=feature_names,
        top_features=top_features,
        plot_name=plot_name,
    )


def logistic_regression_grid(x_train, y_train, x_test, y_test, class_ratio,
                        make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
    """Grid-search an L2-penalised logistic regression over ``C`` and report test metrics.

    ``C`` is searched over six decades from 0.001 to 100. ``class_ratio`` is
    passed to ``LogisticRegression`` as ``class_weight``; the remaining
    arguments are forwarded to ``grid_classifier``.
    """
    print_model_title("Logistic Regression")
    search_space = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    estimator = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
    grid_classifier(
        x_train, y_train, x_test, y_test,
        model=estimator,
        parameters=search_space,
        make_feature_analysis=make_feature_analysis,
        feature_names=feature_names,
        top_features=top_features,
        plot_name=plot_name,
    )


def linear_svm(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    """Fit a fixed-parameter linear SVM (C=0.01, L2 penalty) and print test metrics.

    ``class_ratio`` is passed to ``LinearSVC`` as ``class_weight``.
    """
    print_model_title("Linear SVM")
    fitted = LinearSVC(C=0.01, class_weight=class_ratio, penalty='l2').fit(x_train, y_train)
    print_statistics(y_test, fitted.predict(x_test))


def logistic_regression(x_train, y_train, x_test, y_test, class_ratio='balanced'):
    """Fit a fixed-parameter logistic regression (C=0.01, L2 penalty) and print test metrics.

    ``class_ratio`` is passed to ``LogisticRegression`` as ``class_weight``.
    """
    print_model_title("Logistic Regression")
    fitted = LogisticRegression(C=0.01, class_weight=class_ratio, penalty='l2').fit(x_train, y_train)
    print_statistics(y_test, fitted.predict(x_test))


def random_forest(x_train, y_train, x_test, y_test, class_ratio='balanced'):
  """Fit a 400-tree random forest (seed 11) and print test metrics.

  ``class_ratio`` is passed to ``RandomForestClassifier`` as ``class_weight``.
  The parameter was previously accepted but never used, so calls asking for
  ``'balanced'`` weights silently trained an unweighted forest. Pass ``None``
  to train without class weights.
  """
  print_model_title("Random Forest")
  rf = RandomForestClassifier(n_estimators=400, random_state=11, class_weight=class_ratio)
  rf.fit(x_train, y_train)
  y_hat = rf.predict(x_test)
  print_statistics(y_test, y_hat)


def random_forest_grid_search(x_train, y_train, x_test, y_test, class_ratio='balanced',make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
  """Grid-search a random forest and print test metrics for the best estimator.

  ``class_ratio`` is passed to ``RandomForestClassifier`` as ``class_weight``;
  it was previously accepted but never used. The grid covers ``max_features``,
  ``max_depth``, ``min_samples_leaf``, ``n_estimators`` and ``max_samples``;
  the base estimator's ``n_estimators=400`` is overridden by the grid values.

  NOTE(review): ``make_feature_analysis=True`` routes the best estimator to
  ``plot_coefficients``, which reads ``coef_``. Random forests do not appear to
  expose that attribute, so confirm before enabling it here.
  """
  print_model_title("Random Forest Grid search")
  parameters1 = {
      "max_features": [0.8, 0.9],
      "max_depth": [6, 7, 8, 9],
      "min_samples_leaf": [50, 75, 100],
      "n_estimators": [10, 30],
      "max_samples": [0.8, 0.9],
  }
  rf = RandomForestClassifier(n_estimators=400, random_state=11, class_weight=class_ratio)
  grid_classifier(x_train, y_train, x_test, y_test, rf, parameters1,
                  make_feature_analysis, feature_names, top_features, plot_name)


def xg_boost(x_train, y_train, x_test, y_test):
  """Fit an XGBoost classifier (max_depth=50, 1000 estimators) and print test metrics."""
  print_model_title("XGBoost")
  booster = XGBClassifier(max_depth=50, n_estimators=1000)
  booster.fit(x_train, y_train)
  print_statistics(y_test, booster.predict(x_test))


def xg_boost_focal_loss(x_train, y_train, x_test, y_test):
  """Tune and evaluate an imbalance-aware XGBoost model with the focal objective.

  A grid search over ``focal_gamma`` in {1.0, 1.5, 2.0, 2.5, 3.0} is fitted on
  the training data. The best estimator's ``predict_determine`` output for
  ``x_test`` is then scored with ``print_statistics``.

  NOTE(review): ``imb_xgb`` is not imported in the visible import cell. It is
  presumably the imbalance-xgboost wrapper class; confirm it is in scope
  before calling.
  """
  print_model_title("XGBoost Focal")
  search = GridSearchCV(imb_xgb(special_objective='focal'),
                        {"focal_gamma": [1.0, 1.5, 2.0, 2.5, 3.0]})
  search.fit(x_train, y_train)
  y_hat = search.best_estimator_.predict_determine(x_test)
  print_statistics(y_test, y_hat)


def xg_boost_weighted_loss(x_train, y_train, x_test, y_test):
  """Tune and evaluate an imbalance-aware XGBoost model with the weighted objective.

  A grid search over ``imbalance_alpha`` in {1.5, 2.0, 2.5, 3.0, 4.0} is fitted
  on the training data. The best estimator's ``predict_determine`` output for
  ``x_test`` is then scored with ``print_statistics``.

  NOTE(review): ``imb_xgb`` is not imported in the visible import cell. It is
  presumably the imbalance-xgboost wrapper class; confirm it is in scope
  before calling.
  """
  print_model_title("XGBoost Weighted")
  weighted_booster = imb_xgb(special_objective='weighted')
  alpha_search = GridSearchCV(weighted_booster, {"imbalance_alpha": [1.5, 2.0, 2.5, 3.0, 4.0]})
  alpha_search.fit(x_train, y_train)
  best_weighted_booster = alpha_search.best_estimator_
  y_hat = best_weighted_booster.predict_determine(x_test)
  print_statistics(y_test, y_hat)


def feature_selection(x_train, y_train, x_test, y_test):
    """Select five features by recursive feature elimination and print test metrics.

    A ``LinearSVC`` (C=0.1, L2 penalty) is wrapped in ``RFE``. The fitted
    selector predicts on ``x_test`` using only the retained features, and the
    predictions are scored with ``print_statistics``.
    """
    # RFE is not among the names imported in the notebook's import cell, so
    # import it locally to keep this function self-contained.
    from sklearn.feature_selection import RFE

    print("Feature selection with LinearSVC")
    model = LinearSVC(C=0.1, penalty='l2')
    # n_features_to_select must be given by keyword on current scikit-learn releases.
    rfe = RFE(model, n_features_to_select=5)
    best_features_model = rfe.fit(x_train, y_train)
    y_hat = best_features_model.predict(x_test)
    print_statistics(y_test, y_hat)


def ensemble_stacked(x_train, y_train, x_test, y_test):
  """Fit a stacking ensemble of four base models and print test metrics.

  Base estimators, in order: balanced logistic regression, XGBoost, balanced
  linear SVM, and a 10-tree random forest. ``StackingClassifier`` is used with
  its default final estimator.
  """
  from sklearn.ensemble import StackingClassifier

  print_model_title("Ensemble Stacked Classifiers")
  base_models = [
      ('lr', LogisticRegression(C=0.01, class_weight='balanced', penalty='l2')),
      ('xgb', XGBClassifier(max_depth=16, n_estimators=1000)),
      ('svm_linear', LinearSVC(C=0.01, class_weight='balanced', penalty='l2')),
      ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
  ]
  stack = StackingClassifier(estimators=base_models)
  stack.fit(x_train, y_train)
  print_statistics(y_test, stack.predict(x_test))

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
def voting_classifiers(x_train, y_train, x_test, y_test,voting_type='hard'):
  """Fit a voting ensemble of four base models and print test metrics.

  Base estimators, in order: balanced logistic regression, XGBoost, balanced
  linear SVM, and a 10-tree random forest.

  Parameters
  ----------
  voting_type : {'hard', 'soft'}
      Passed to ``VotingClassifier`` as ``voting``. Soft voting averages
      ``predict_proba`` outputs, which ``LinearSVC`` does not provide, so for
      ``'soft'`` the SVM is wrapped in ``CalibratedClassifierCV``. Hard voting
      is unchanged.
  """
  print_model_title("Voting Classifier")
  svm_member = LinearSVC(C=0.01, class_weight='balanced', penalty='l2')
  if voting_type == 'soft':
    # Local import: only needed on the soft-voting path.
    from sklearn.calibration import CalibratedClassifierCV
    svm_member = CalibratedClassifierCV(svm_member)
  estimators = [
      ('lr', LogisticRegression(C=0.01, class_weight='balanced', penalty='l2')),
      ('xgb', XGBClassifier(max_depth=16, n_estimators=1000)),
      ('svm_linear', svm_member),
      ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
  ]
  clf = VotingClassifier(estimators=estimators, voting=voting_type)
  clf.fit(x_train, y_train)
  y_hat = clf.predict(x_test)
  print_statistics(y_test, y_hat)


from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import BaggingClassifier
def Bagging_Classifier_LR(x_train, y_train, x_test, y_test):
  """Fit a bagging ensemble of 10 balanced logistic regressions and print test metrics."""
  print_model_title("Bagging Calssifier LR")

  # The estimator is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in newer scikit-learn releases, while the
  # first positional slot is the estimator on both.
  clf = BaggingClassifier(LogisticRegression(C=0.01, class_weight='balanced', penalty='l2'),
                          n_estimators=10, random_state=42)
  clf.fit(x_train, y_train)
  y_hat = clf.predict(x_test)
  print_statistics(y_test, y_hat)


def Bagging_Classifier_SVM(x_train, y_train, x_test, y_test):
  """Fit a bagging ensemble of 10 balanced linear SVMs and print test metrics."""
  print_model_title("Bagging Calssifier SVM")

  # The estimator is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in newer scikit-learn releases, while the
  # first positional slot is the estimator on both.
  clf = BaggingClassifier(LinearSVC(C=0.01, class_weight='balanced', penalty='l2'),
                          n_estimators=10, random_state=42)
  clf.fit(x_train, y_train)
  y_hat = clf.predict(x_test)
  print_statistics(y_test, y_hat)

from sklearn.ensemble import GradientBoostingClassifier
def gradient_boosting(x_train, y_train, x_test, y_test):
  """Fit a gradient-boosting classifier and print test metrics.

  Uses 100 estimators, learning rate 0.01, max depth 30 and seed 42.
  """
  print_model_title("Gradient Boosting")
  booster = GradientBoostingClassifier(
      n_estimators=100,
      learning_rate=0.01,
      max_depth=30,
      random_state=42,
  )
  booster.fit(x_train, y_train)
  print_statistics(y_test, booster.predict(x_test))

# NOTE(review): `xg_boost` is already defined earlier in this cell with the same
# model settings; this later definition is the one in effect once the cell has
# run. One of the two can be removed.
def xg_boost(x_train, y_train, x_test, y_test):
  """Fit an XGBoost classifier (max_depth=50, 1000 estimators) and print test metrics."""
  print_model_title("XGBoost")
  xgb_model =XGBClassifier(max_depth=50, n_estimators=1000)
  xgb_model .fit(x_train, y_train)
  y_hat = xgb_model .predict(x_test)
  print_statistics(y_test, y_hat)



### Saving list of packages to text file

In [None]:
# Record the names of every module currently loaded in this kernel.
# The keys view is written as its repr on a single line.
with open("crc_modules_colab.txt", "w") as modules_file:
  modules_file.write(f"{sys.modules.keys()}\n")

In [None]:
# Snapshot the packages reported by `pip freeze` into crc_requirements.txt.
!pip freeze > crc_requirements.txt

In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# release recorded in this cell's output so re-runs install the same tool.
%pip install -q pipreqs==0.4.11

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pipreqs
  Downloading pipreqs-0.4.11-py2.py3-none-any.whl (32 kB)
Collecting yarg
  Downloading yarg-0.1.9-py2.py3-none-any.whl (19 kB)
Installing collected packages: yarg, pipreqs
Successfully installed pipreqs-0.4.11 yarg-0.1.9


In [None]:
# Generate ./requirements.txt for the current directory (see the recorded output);
# --force overwrites an existing file.
!pipreqs . --force

INFO: Successfully saved requirements file in ./requirements.txt


In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset path used
# by the next cell resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw CRC dataset from the mounted Drive (absolute, Colab-specific path).
# NOTE(review): the recorded output ends with an `exec(code_obj, ...)` line, which
# looks like the tail of a pandas warning, possibly a mixed-dtype notice. Confirm,
# and consider passing explicit dtypes.
df=pd.read_csv('/content/drive/MyDrive/Omdena/RadmolAI/Data/CRC_original_dataset.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# check for percentage distribution of target variable "Case_Control", which indicates class 0 as non-cancer and class 1 as cancer
df['Case_Control'].value_counts(normalize=True)

0    0.833254
1    0.166746
Name: Case_Control, dtype: float64

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,abd_tend_score,PR_score,age_group,Case_Control,code_D11_1st,code_D11_2nd,code_D12_1st,code_D12_2nd,code_T03_1st,...,diarrhoea_last_year,earliest_rectal_bleeding,rectal_bleeding_last_year,earliest_mild_anaemia,mild_anaemia_last_year,earliest_severe_anaemia,severe_anaemia_last_year,earliest_FOB,FOB_last_year,diarrhoea_duration
0,0,0.0,0.0,1.0,0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,,0.0,,0.0,22.0
1,1,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,,0.0,,0.0,
2,2,0.0,0.0,2.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,,0.0,,0.0,
3,3,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,,0.0,,0.0,
4,4,0.0,0.0,2.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,,0.0,,0.0,


## Data pre-processing


*   Fill NaN values with -1
*   Label-encode the string (object) columns
*   Drop the unnamed index column




In [None]:
# replace missing values with -1
df_filled=df.fillna(-1)


In [None]:
# check for columns other than numeric columns
column_names_string= df_filled.select_dtypes(exclude=[np.number]).columns

In [None]:
column_names_string

Index(['definite_check_exclude', 'cancer_staging'], dtype='object')

In [None]:
# Label-encode each non-numeric column with its own fresh encoder.
# Values are read from the unfilled `df` and cast to str, so missing entries
# are encoded as the string 'nan' rather than as the -1 fill value.
for col in column_names_string:
  raw_values = df[col].astype(str).values
  df_filled[col] = preprocessing.LabelEncoder().fit_transform(raw_values)

In [None]:
# Drop the row-identifier columns so they are not used as features.
# `df.head()` shows both "Unnamed: 0.1" and "Unnamed: 0"; previously only the
# latter was removed. Reassigning with errors="ignore" (instead of an in-place
# drop) lets this cell be re-run without raising KeyError.
df_filled = df_filled.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], errors="ignore")

In [None]:
len(df_filled)

2093

In [None]:
# check dataset shape 
df_filled.shape

(2093, 1258)

### Train-Test-Split

In [None]:
# Start from every column of the pre-processed frame; the target columns are
# removed from this list in the following cell.
column_df=list(df_filled.columns)

In [None]:
# Keep only predictor columns: remove the target 'Case_Control' and the
# 'Control_Case' column (presumably a mirrored copy of the target; verify).
# Filtering instead of list.remove() lets this cell be re-run without ValueError.
# NOTE(review): the tree-based models below report perfect test scores, which
# suggests a remaining column (e.g. 'cancer_staging') may encode the outcome.
# Confirm before trusting those results.
column_df = [col for col in column_df if col not in ('Case_Control', 'Control_Case')]

In [None]:
'Case_Control'  in column_df

False

In [None]:
'Control_Case'  in column_df

False

In [None]:
# Feature matrix (all predictor columns) and target
# (Case_Control: 0 = non-cancer, 1 = cancer).
X=df_filled[column_df]
y=df_filled['Case_Control']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the target y keeps the
# class proportions the same in the train and test splits; it does not balance
# the classes themselves - Shweta C.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=20,stratify=y)


In [None]:
# Test features with the label attached as 'target' (a copy, so X_test is untouched).
df_test=X_test.copy()
df_test['target']=y_test
# Optional export of the test split (disabled):
#df_test.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/test.csv')

In [None]:
# Training features with the label attached as 'target' (a copy, so X_train is untouched).
df_train=X_train.copy()
df_train['target']=y_train
# Optional export of the unbalanced training split (disabled):
#df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/train_unbalanced.csv')

In [None]:
len(y_test)

419

In [None]:
# Class shares in the test split, looked up by label. value_counts() orders its
# result by frequency, so unpacking it positionally only matches the labels
# while class 0 happens to be the majority.
class_share = y_test.value_counts(normalize=True)
count_class_0 = class_share.get(0, 0.0)
count_class_1 = class_share.get(1, 0.0)
print(count_class_0)
print(count_class_1)

0.8329355608591885
0.16706443914081145


In [None]:
# Suppress all Python warnings from this point on, including any raised by
# the model-fitting cells below.
import warnings
warnings.filterwarnings('ignore')

## Baseline Models

Models trained on the original (unsampled) data after a train/test split stratified on the target `y`, with and without balanced class weights.

In [None]:
linear_svm_grid(X_train,y_train,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.878
Weighted_Precision: 0.867
Weighted_Recall: 0.878
Weighted_F1_score: 0.866

Balanced Accuracy:  0.7042365943512074
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       349
           1       0.72      0.44      0.55        70

    accuracy                           0.88       419
   macro avg       0.81      0.70      0.74       419
weighted avg       0.87      0.88      0.87       419



In [None]:
linear_svm_grid(X_train,y_train,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.888
Weighted_Precision: 0.884
Weighted_Recall: 0.888
Weighted_F1_score: 0.885

Balanced Accuracy:  0.7784895620139174
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       349
           1       0.68      0.61      0.65        70

    accuracy                           0.89       419
   macro avg       0.80      0.78      0.79       419
weighted avg       0.88      0.89      0.89       419



In [None]:
nonlinear_svm_grid(X_train,y_train,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
logistic_regression_grid(X_train,y_train,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.833
Weighted_Precision: 0.849
Weighted_Recall: 0.833
Weighted_F1_score: 0.840

Balanced Accuracy:  0.7455382726156365
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       349
           1       0.50      0.61      0.55        70

    accuracy                           0.83       419
   macro avg       0.71      0.75      0.72       419
weighted avg       0.85      0.83      0.84       419



In [None]:
linear_svm(X_train,y_train,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.881
Weighted_Precision: 0.870
Weighted_Recall: 0.881
Weighted_F1_score: 0.869

Balanced Accuracy:  0.7113794514940647
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       349
           1       0.73      0.46      0.56        70

    accuracy                           0.88       419
   macro avg       0.81      0.71      0.75       419
weighted avg       0.87      0.88      0.87       419



In [None]:
logistic_regression(X_train,y_train,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.842
Weighted_Precision: 0.854
Weighted_Recall: 0.842
Weighted_F1_score: 0.847

Balanced Accuracy:  0.7512689316414245
              precision    recall  f1-score   support

           0       0.92      0.89      0.90       349
           1       0.52      0.61      0.57        70

    accuracy                           0.84       419
   macro avg       0.72      0.75      0.73       419
weighted avg       0.85      0.84      0.85       419



In [None]:
random_forest_grid_search(X_train,y_train,X_test,y_test,class_ratio='balanced')



Random Forest Grid search

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
random_forest(X_train,y_train,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
# Training split passed as both fit and evaluation data: these are training-set scores.
xg_boost(X_train,y_train,X_train,y_train)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1395
           1       1.00      1.00      1.00       279

    accuracy                           1.00      1674
   macro avg       1.00      1.00      1.00      1674
weighted avg       1.00      1.00      1.00      1674



In [None]:
xg_boost(X_train,y_train,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_train,y_train,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_train,y_train,X_test,y_test)



   Voting Classifier

Accuracy: 0.943
Weighted_Precision: 0.946
Weighted_Recall: 0.943
Weighted_F1_score: 0.938

Balanced Accuracy:  0.8285714285714285
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       349
           1       1.00      0.66      0.79        70

    accuracy                           0.94       419
   macro avg       0.97      0.83      0.88       419
weighted avg       0.95      0.94      0.94       419



In [None]:
Bagging_Classifier_LR(X_train,y_train,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.852
Weighted_Precision: 0.849
Weighted_Recall: 0.852
Weighted_F1_score: 0.850

Balanced Accuracy:  0.7227384363487516
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       349
           1       0.56      0.53      0.54        70

    accuracy                           0.85       419
   macro avg       0.73      0.72      0.73       419
weighted avg       0.85      0.85      0.85       419



In [None]:
Bagging_Classifier_SVM(X_train,y_train,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.883
Weighted_Precision: 0.873
Weighted_Recall: 0.883
Weighted_F1_score: 0.872

Balanced Accuracy:  0.7185223086369218
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       349
           1       0.73      0.47      0.57        70

    accuracy                           0.88       419
   macro avg       0.82      0.72      0.75       419
weighted avg       0.87      0.88      0.87       419



In [None]:
gradient_boosting(X_train,y_train,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
# Training split passed as both fit and evaluation data: these are training-set scores.
gradient_boosting(X_train,y_train,X_train,y_train)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1395
           1       1.00      1.00      1.00       279

    accuracy                           1.00      1674
   macro avg       1.00      1.00      1.00      1674
weighted avg       1.00      1.00      1.00      1674



## Sampling of dataset for balancing the classes

### Random UnderSampler

In [None]:
# Rebuild the training frame with the label attached as 'target' so rows can
# be separated by class for undersampling.
df_train=X_train.copy()
df_train['target']=y_train

In [None]:
# Class count, looked up by label. value_counts() orders its result by
# frequency, so unpacking it positionally only matches the labels while
# class 0 happens to be the majority.
class_counts = df_train['target'].value_counts()
count_class_0 = class_counts.get(0, 0)
count_class_1 = class_counts.get(1, 0)

# Divide by class
df_class_0 = df_train[df_train['target'] == 0]
df_class_1 = df_train[df_train['target'] == 1]

In [None]:
# Randomly keep as many class-0 rows as there are class-1 rows, then stack the
# two classes into a balanced training frame. The seed (same value as the
# train/test split) makes the undersampled set reproducible across runs.
# Despite its name, df_test_under holds training rows.
df_class_0_under = df_class_0.sample(n=count_class_1, random_state=20)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

In [None]:
#df_test_under.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/Random_under_sampled_train.csv')


In [None]:
# Alias of the predictor column list (the same list object, not a copy);
# despite the name, no NaN-based filtering happens here.
non_nan_cols=column_df

In [None]:
# Features and labels of the undersampled training set.
X_train_new_under_sampled=df_test_under[non_nan_cols]
y_train_new_under_sampler=df_test_under['target']

In [None]:
linear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.718
Weighted_Precision: 0.846
Weighted_Recall: 0.718
Weighted_F1_score: 0.753

Balanced Accuracy:  0.7510028653295129
              precision    recall  f1-score   support

           0       0.95      0.70      0.81       349
           1       0.35      0.80      0.49        70

    accuracy                           0.72       419
   macro avg       0.65      0.75      0.65       419
weighted avg       0.85      0.72      0.75       419



In [None]:
linear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.547
Weighted_Precision: 0.862
Weighted_Recall: 0.547
Weighted_F1_score: 0.594

Balanced Accuracy:  0.7106631191158412
              precision    recall  f1-score   support

           0       0.98      0.46      0.63       349
           1       0.26      0.96      0.41        70

    accuracy                           0.55       419
   macro avg       0.62      0.71      0.52       419
weighted avg       0.86      0.55      0.59       419



In [None]:
nonlinear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.167
Weighted_Precision: 0.028
Weighted_Recall: 0.167
Weighted_F1_score: 0.048

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       349
           1       0.17      1.00      0.29        70

    accuracy                           0.17       419
   macro avg       0.08      0.50      0.14       419
weighted avg       0.03      0.17      0.05       419



In [None]:
nonlinear_svm_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.167
Weighted_Precision: 0.028
Weighted_Recall: 0.167
Weighted_F1_score: 0.048

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       349
           1       0.17      1.00      0.29        70

    accuracy                           0.17       419
   macro avg       0.08      0.50      0.14       419
weighted avg       0.03      0.17      0.05       419



In [None]:
logistic_regression_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.726
Weighted_Precision: 0.830
Weighted_Recall: 0.726
Weighted_F1_score: 0.757

Balanced Accuracy:  0.721039705280393
              precision    recall  f1-score   support

           0       0.93      0.73      0.82       349
           1       0.34      0.71      0.47        70

    accuracy                           0.73       419
   macro avg       0.64      0.72      0.64       419
weighted avg       0.83      0.73      0.76       419



In [None]:
logistic_regression_grid(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.726
Weighted_Precision: 0.830
Weighted_Recall: 0.726
Weighted_F1_score: 0.757

Balanced Accuracy:  0.721039705280393
              precision    recall  f1-score   support

           0       0.93      0.73      0.82       349
           1       0.34      0.71      0.47        70

    accuracy                           0.73       419
   macro avg       0.64      0.72      0.64       419
weighted avg       0.83      0.73      0.76       419



In [None]:
linear_svm(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.654
Weighted_Precision: 0.862
Weighted_Recall: 0.654
Weighted_F1_score: 0.698

Balanced Accuracy:  0.7580024559967253
              precision    recall  f1-score   support

           0       0.97      0.60      0.74       349
           1       0.32      0.91      0.47        70

    accuracy                           0.65       419
   macro avg       0.64      0.76      0.61       419
weighted avg       0.86      0.65      0.70       419



In [None]:
linear_svm(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.804
Weighted_Precision: 0.839
Weighted_Recall: 0.804
Weighted_F1_score: 0.817

Balanced Accuracy:  0.7340564879246827
              precision    recall  f1-score   support

           0       0.92      0.84      0.88       349
           1       0.44      0.63      0.52        70

    accuracy                           0.80       419
   macro avg       0.68      0.73      0.70       419
weighted avg       0.84      0.80      0.82       419



In [None]:
logistic_regression(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.757
Weighted_Precision: 0.850
Weighted_Recall: 0.757
Weighted_F1_score: 0.784

Balanced Accuracy:  0.7625051166598444
              precision    recall  f1-score   support

           0       0.94      0.75      0.84       349
           1       0.39      0.77      0.51        70

    accuracy                           0.76       419
   macro avg       0.66      0.76      0.68       419
weighted avg       0.85      0.76      0.78       419



In [None]:
logistic_regression(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.757
Weighted_Precision: 0.850
Weighted_Recall: 0.757
Weighted_F1_score: 0.784

Balanced Accuracy:  0.7625051166598444
              precision    recall  f1-score   support

           0       0.94      0.75      0.84       349
           1       0.39      0.77      0.51        70

    accuracy                           0.76       419
   macro avg       0.66      0.76      0.68       419
weighted avg       0.85      0.76      0.78       419



In [None]:
# NOTE(review): the saved output of this cell reports 1.000 for every test-set
# metric, as do the xg_boost, ensemble_stacked and gradient_boosting cells in
# this section, while linear_svm and logistic_regression score far lower.
# Perfect held-out scores are a common symptom of target leakage or train/test
# overlap — TODO confirm how X_test / y_test were built before trusting this.
random_forest(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



   Voting Classifier

Accuracy: 0.969
Weighted_Precision: 0.970
Weighted_Recall: 0.969
Weighted_F1_score: 0.968

Balanced Accuracy:  0.9071428571428571
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       349
           1       1.00      0.81      0.90        70

    accuracy                           0.97       419
   macro avg       0.98      0.91      0.94       419
weighted avg       0.97      0.97      0.97       419



In [None]:
Bagging_Classifier_LR(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.730
Weighted_Precision: 0.837
Weighted_Recall: 0.730
Weighted_F1_score: 0.761

Balanced Accuracy:  0.7353254195661072
              precision    recall  f1-score   support

           0       0.93      0.73      0.82       349
           1       0.35      0.74      0.48        70

    accuracy                           0.73       419
   macro avg       0.64      0.74      0.65       419
weighted avg       0.84      0.73      0.76       419



In [None]:
Bagging_Classifier_SVM(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.742
Weighted_Precision: 0.843
Weighted_Recall: 0.742
Weighted_F1_score: 0.772

Balanced Accuracy:  0.7481989357347523
              precision    recall  f1-score   support

           0       0.94      0.74      0.83       349
           1       0.37      0.76      0.50        70

    accuracy                           0.74       419
   macro avg       0.65      0.75      0.66       419
weighted avg       0.84      0.74      0.77       419



In [None]:
gradient_boosting(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



### Random Oversampler

In [None]:
# Manual random over-sampling with pandas: draw count_class_0 rows from
# df_class_1 with replacement, then stack them under the df_class_0 rows.
# (presumably df_class_1 is the minority class and count_class_0 is
# len(df_class_0), which would make the result 1:1 — verify against the cell
# that defines them.)
# random_state pins the draw so Restart & Run All reproduces the metrics
# reported by the evaluation cells below.
# NOTE(review): despite its name, df_test_over is the *training* frame — the
# next cell derives X_train_new_over_sampled / y_train_new_over_sampler from it.
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=42)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

In [None]:
#df_test_over.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/Random_Over_sampled_train.csv')


In [None]:
# Split the manually over-sampled frame into the model inputs (the columns
# selected by non_nan_cols) and the 'target' label column.
X_train_new_over_sampled, y_train_new_over_sampler = (
    df_test_over[non_nan_cols],
    df_test_over['target'],
)

In [None]:
linear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.866
Weighted_Precision: 0.876
Weighted_Recall: 0.866
Weighted_F1_score: 0.870

Balanced Accuracy:  0.7941465411379451
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       349
           1       0.59      0.69      0.63        70

    accuracy                           0.87       419
   macro avg       0.76      0.79      0.77       419
weighted avg       0.88      0.87      0.87       419



In [None]:
linear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.845
Weighted_Precision: 0.876
Weighted_Recall: 0.845
Weighted_F1_score: 0.855

Balanced Accuracy:  0.809803520261973
              precision    recall  f1-score   support

           0       0.95      0.86      0.90       349
           1       0.52      0.76      0.62        70

    accuracy                           0.84       419
   macro avg       0.74      0.81      0.76       419
weighted avg       0.88      0.84      0.86       419



In [None]:
# NOTE(review): in the saved output this model assigns class 0 to all 419 test
# rows (class-1 recall 0.00, balanced accuracy 0.5); the 0.833 accuracy is just
# the class-0 share of the test set. The class_ratio=None run below is
# identical. Possibly a feature-scaling / kernel-parameter problem inside
# nonlinear_svm_grid — TODO confirm.
nonlinear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
nonlinear_svm_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
logistic_regression_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.821
Weighted_Precision: 0.837
Weighted_Recall: 0.821
Weighted_F1_score: 0.828

Balanced Accuracy:  0.7212443716741711
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       349
           1       0.47      0.57      0.52        70

    accuracy                           0.82       419
   macro avg       0.69      0.72      0.70       419
weighted avg       0.84      0.82      0.83       419



In [None]:
logistic_regression_grid(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.821
Weighted_Precision: 0.837
Weighted_Recall: 0.821
Weighted_F1_score: 0.828

Balanced Accuracy:  0.7212443716741711
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       349
           1       0.47      0.57      0.52        70

    accuracy                           0.82       419
   macro avg       0.69      0.72      0.70       419
weighted avg       0.84      0.82      0.83       419



In [None]:
linear_svm(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.842
Weighted_Precision: 0.868
Weighted_Recall: 0.842
Weighted_F1_score: 0.852

Balanced Accuracy:  0.7912402783462955
              precision    recall  f1-score   support

           0       0.94      0.87      0.90       349
           1       0.52      0.71      0.60        70

    accuracy                           0.84       419
   macro avg       0.73      0.79      0.75       419
weighted avg       0.87      0.84      0.85       419



In [None]:
linear_svm(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.854
Weighted_Precision: 0.866
Weighted_Recall: 0.854
Weighted_F1_score: 0.859

Balanced Accuracy:  0.7755628325828898
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       349
           1       0.55      0.66      0.60        70

    accuracy                           0.85       419
   macro avg       0.74      0.78      0.76       419
weighted avg       0.87      0.85      0.86       419



In [None]:
logistic_regression(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.821
Weighted_Precision: 0.837
Weighted_Recall: 0.821
Weighted_F1_score: 0.828

Balanced Accuracy:  0.7212443716741711
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       349
           1       0.47      0.57      0.52        70

    accuracy                           0.82       419
   macro avg       0.69      0.72      0.70       419
weighted avg       0.84      0.82      0.83       419



In [None]:
# NOTE(review): the saved metrics of this cell are identical to the
# class_ratio='balanced' run above and to both logistic_regression_grid runs on
# the same data. If class_ratio is forwarded as a class weight, equal results
# would be expected when the over-sampled training set is 1:1 — TODO confirm
# inside logistic_regression, and check that the grid search changes anything.
logistic_regression(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.821
Weighted_Precision: 0.837
Weighted_Recall: 0.821
Weighted_F1_score: 0.828

Balanced Accuracy:  0.7212443716741711
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       349
           1       0.47      0.57      0.52        70

    accuracy                           0.82       419
   macro avg       0.69      0.72      0.70       419
weighted avg       0.84      0.82      0.83       419



In [None]:
# NOTE(review): the saved output of this cell reports 1.000 for every test-set
# metric, as do the xg_boost, ensemble_stacked and gradient_boosting cells in
# this section, while linear_svm and logistic_regression score far lower.
# Perfect held-out scores are a common symptom of target leakage or train/test
# overlap — TODO confirm how X_test / y_test were built before trusting this.
random_forest(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



   Voting Classifier

Accuracy: 0.959
Weighted_Precision: 0.961
Weighted_Recall: 0.959
Weighted_F1_score: 0.957

Balanced Accuracy:  0.8785714285714286
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       349
           1       1.00      0.76      0.86        70

    accuracy                           0.96       419
   macro avg       0.98      0.88      0.92       419
weighted avg       0.96      0.96      0.96       419



In [None]:
Bagging_Classifier_LR(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.823
Weighted_Precision: 0.844
Weighted_Recall: 0.823
Weighted_F1_score: 0.832

Balanced Accuracy:  0.7398076135898486
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       349
           1       0.48      0.61      0.54        70

    accuracy                           0.82       419
   macro avg       0.70      0.74      0.71       419
weighted avg       0.84      0.82      0.83       419



In [None]:
Bagging_Classifier_SVM(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.857
Weighted_Precision: 0.855
Weighted_Recall: 0.857
Weighted_F1_score: 0.856

Balanced Accuracy:  0.7370241506344659
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       349
           1       0.57      0.56      0.57        70

    accuracy                           0.86       419
   macro avg       0.74      0.74      0.74       419
weighted avg       0.86      0.86      0.86       419



In [None]:
gradient_boosting(X_train_new_over_sampled,y_train_new_over_sampler,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



### Python imbalanced-learn module


In [None]:
import imblearn


### Random under-sampling and over-sampling with imbalanced-learn


#### Random under-sampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the training split with imbalanced-learn's default
# sampling strategy. The sampler is stochastic, so random_state is fixed to
# make the resampled set — and every metric reported below for X_rus / y_rus —
# reproducible on Restart & Run All. Only the training data is resampled;
# X_test / y_test are left as they are.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [None]:
# Bundle the under-sampled features and labels into a single frame for export;
# assign() works on a copy, so X_rus itself is not modified.
df_train = X_rus.assign(target=y_rus)
# Export is disabled; uncomment to write the resampled training set to the path below.
#df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_Random_under_sampled_train.csv')



In [None]:
linear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.702
Weighted_Precision: 0.849
Weighted_Recall: 0.702
Weighted_F1_score: 0.739

Balanced Accuracy:  0.7523945968072043
              precision    recall  f1-score   support

           0       0.95      0.68      0.79       349
           1       0.34      0.83      0.48        70

    accuracy                           0.70       419
   macro avg       0.65      0.75      0.64       419
weighted avg       0.85      0.70      0.74       419



In [None]:
linear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.535
Weighted_Precision: 0.821
Weighted_Recall: 0.535
Weighted_F1_score: 0.587

Balanced Accuracy:  0.657818256242325
              precision    recall  f1-score   support

           0       0.94      0.47      0.63       349
           1       0.24      0.84      0.38        70

    accuracy                           0.53       419
   macro avg       0.59      0.66      0.50       419
weighted avg       0.82      0.53      0.59       419



In [None]:
# NOTE(review): in the saved output this model assigns class 1 to all 419 test
# rows (class-0 recall 0.00, balanced accuracy 0.5); the 0.167 accuracy is just
# the class-1 share of the test set. The class_ratio=None run below is
# identical. Possibly a feature-scaling / kernel-parameter problem inside
# nonlinear_svm_grid — TODO confirm.
nonlinear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.167
Weighted_Precision: 0.028
Weighted_Recall: 0.167
Weighted_F1_score: 0.048

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       349
           1       0.17      1.00      0.29        70

    accuracy                           0.17       419
   macro avg       0.08      0.50      0.14       419
weighted avg       0.03      0.17      0.05       419



In [None]:
nonlinear_svm_grid(X_rus,y_rus,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.167
Weighted_Precision: 0.028
Weighted_Recall: 0.167
Weighted_F1_score: 0.048

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       349
           1       0.17      1.00      0.29        70

    accuracy                           0.17       419
   macro avg       0.08      0.50      0.14       419
weighted avg       0.03      0.17      0.05       419



In [None]:
logistic_regression_grid(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.711
Weighted_Precision: 0.832
Weighted_Recall: 0.711
Weighted_F1_score: 0.746

Balanced Accuracy:  0.7238641015145313
              precision    recall  f1-score   support

           0       0.93      0.70      0.80       349
           1       0.34      0.74      0.46        70

    accuracy                           0.71       419
   macro avg       0.63      0.72      0.63       419
weighted avg       0.83      0.71      0.75       419



In [None]:
logistic_regression_grid(X_rus,y_rus,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.711
Weighted_Precision: 0.832
Weighted_Recall: 0.711
Weighted_F1_score: 0.746

Balanced Accuracy:  0.7238641015145313
              precision    recall  f1-score   support

           0       0.93      0.70      0.80       349
           1       0.34      0.74      0.46        70

    accuracy                           0.71       419
   macro avg       0.63      0.72      0.63       419
weighted avg       0.83      0.71      0.75       419



In [None]:
linear_svm(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.811
Weighted_Precision: 0.821
Weighted_Recall: 0.811
Weighted_F1_score: 0.816

Balanced Accuracy:  0.6869627507163324
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       349
           1       0.44      0.50      0.47        70

    accuracy                           0.81       419
   macro avg       0.67      0.69      0.68       419
weighted avg       0.82      0.81      0.82       419



In [None]:
linear_svm(X_rus,y_rus,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.783
Weighted_Precision: 0.846
Weighted_Recall: 0.783
Weighted_F1_score: 0.803

Balanced Accuracy:  0.7554236594351207
              precision    recall  f1-score   support

           0       0.93      0.80      0.86       349
           1       0.41      0.71      0.52        70

    accuracy                           0.78       419
   macro avg       0.67      0.76      0.69       419
weighted avg       0.85      0.78      0.80       419



In [None]:
logistic_regression(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.711
Weighted_Precision: 0.832
Weighted_Recall: 0.711
Weighted_F1_score: 0.746

Balanced Accuracy:  0.7238641015145313
              precision    recall  f1-score   support

           0       0.93      0.70      0.80       349
           1       0.34      0.74      0.46        70

    accuracy                           0.71       419
   macro avg       0.63      0.72      0.63       419
weighted avg       0.83      0.71      0.75       419



In [None]:
# NOTE(review): the saved metrics of this cell are identical to the
# class_ratio='balanced' run above and to both logistic_regression_grid runs on
# the same data. If class_ratio is forwarded as a class weight, equal results
# would be expected when the under-sampled training set is 1:1 — TODO confirm
# inside logistic_regression, and check that the grid search changes anything.
logistic_regression(X_rus,y_rus,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.711
Weighted_Precision: 0.832
Weighted_Recall: 0.711
Weighted_F1_score: 0.746

Balanced Accuracy:  0.7238641015145313
              precision    recall  f1-score   support

           0       0.93      0.70      0.80       349
           1       0.34      0.74      0.46        70

    accuracy                           0.71       419
   macro avg       0.63      0.72      0.63       419
weighted avg       0.83      0.71      0.75       419



In [None]:
# NOTE(review): the saved output of this cell reports 1.000 for every test-set
# metric, as do the xg_boost, ensemble_stacked and gradient_boosting cells in
# this section, while linear_svm and logistic_regression score far lower.
# Perfect held-out scores are a common symptom of target leakage or train/test
# overlap — TODO confirm how X_test / y_test were built before trusting this.
random_forest(X_rus,y_rus,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_rus,y_rus,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_rus,y_rus,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_rus,y_rus,X_test,y_test)



   Voting Classifier

Accuracy: 0.986
Weighted_Precision: 0.986
Weighted_Recall: 0.986
Weighted_F1_score: 0.986

Balanced Accuracy:  0.9628530495292673
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       349
           1       0.98      0.93      0.96        70

    accuracy                           0.99       419
   macro avg       0.99      0.96      0.97       419
weighted avg       0.99      0.99      0.99       419



In [None]:
Bagging_Classifier_LR(X_rus,y_rus,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.728
Weighted_Precision: 0.839
Weighted_Recall: 0.728
Weighted_F1_score: 0.760

Balanced Accuracy:  0.7396029471960703
              precision    recall  f1-score   support

           0       0.94      0.72      0.82       349
           1       0.35      0.76      0.48        70

    accuracy                           0.73       419
   macro avg       0.65      0.74      0.65       419
weighted avg       0.84      0.73      0.76       419



In [None]:
Bagging_Classifier_SVM(X_rus,y_rus,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.780
Weighted_Precision: 0.837
Weighted_Recall: 0.780
Weighted_F1_score: 0.800

Balanced Accuracy:  0.7368604175194433
              precision    recall  f1-score   support

           0       0.92      0.80      0.86       349
           1       0.41      0.67      0.51        70

    accuracy                           0.78       419
   macro avg       0.66      0.74      0.68       419
weighted avg       0.84      0.78      0.80       419



In [None]:
gradient_boosting(X_rus,y_rus,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



#### Random over-sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly over-sample the training split with imbalanced-learn's default
# sampling strategy. The sampler is stochastic, so random_state is fixed to
# make the resampled set — and every metric reported below for X_ros / y_ros —
# reproducible on Restart & Run All. Only the training data is resampled;
# X_test / y_test are left as they are.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Bundle the over-sampled features and labels into a single frame for export;
# assign() works on a copy, so X_ros itself is not modified.
# NOTE: this rebinds df_train, replacing the frame built in the under-sampling section.
df_train = X_ros.assign(target=y_ros)
# Export is disabled; uncomment to write the resampled training set to the path below.
#df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_Random_over_sampled_train.csv')

In [None]:
linear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.876
Weighted_Precision: 0.873
Weighted_Recall: 0.876
Weighted_F1_score: 0.874

Balanced Accuracy:  0.7656160458452722
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       349
           1       0.64      0.60      0.62        70

    accuracy                           0.88       419
   macro avg       0.78      0.77      0.77       419
weighted avg       0.87      0.88      0.87       419



In [None]:
linear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.876
Weighted_Precision: 0.870
Weighted_Recall: 0.876
Weighted_F1_score: 0.872

Balanced Accuracy:  0.7484854686860418
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       349
           1       0.65      0.56      0.60        70

    accuracy                           0.88       419
   macro avg       0.78      0.75      0.76       419
weighted avg       0.87      0.88      0.87       419



In [None]:
# NOTE(review): in the saved output this model assigns class 0 to all 419 test
# rows (class-1 recall 0.00, balanced accuracy 0.5); the 0.833 accuracy is just
# the class-0 share of the test set. The class_ratio=None run below is
# identical. Possibly a feature-scaling / kernel-parameter problem inside
# nonlinear_svm_grid — TODO confirm.
nonlinear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
nonlinear_svm_grid(X_ros,y_ros,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
logistic_regression_grid(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.852
Weighted_Precision: 0.857
Weighted_Recall: 0.852
Weighted_F1_score: 0.854

Balanced Accuracy:  0.7512893982808022
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       349
           1       0.55      0.60      0.58        70

    accuracy                           0.85       419
   macro avg       0.74      0.75      0.74       419
weighted avg       0.86      0.85      0.85       419



In [None]:
logistic_regression_grid(X_ros,y_ros,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.852
Weighted_Precision: 0.857
Weighted_Recall: 0.852
Weighted_F1_score: 0.854

Balanced Accuracy:  0.7512893982808022
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       349
           1       0.55      0.60      0.58        70

    accuracy                           0.85       419
   macro avg       0.74      0.75      0.74       419
weighted avg       0.86      0.85      0.85       419



In [None]:
linear_svm(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.866
Weighted_Precision: 0.873
Weighted_Recall: 0.866
Weighted_F1_score: 0.869

Balanced Accuracy:  0.7827261563651249
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       349
           1       0.59      0.66      0.62        70

    accuracy                           0.87       419
   macro avg       0.76      0.78      0.77       419
weighted avg       0.87      0.87      0.87       419



In [None]:
linear_svm(X_ros,y_ros,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.697
Weighted_Precision: 0.866
Weighted_Recall: 0.697
Weighted_F1_score: 0.735

Balanced Accuracy:  0.7780802292263611
              precision    recall  f1-score   support

           0       0.97      0.66      0.78       349
           1       0.34      0.90      0.50        70

    accuracy                           0.70       419
   macro avg       0.66      0.78      0.64       419
weighted avg       0.87      0.70      0.74       419



In [None]:
logistic_regression(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.852
Weighted_Precision: 0.857
Weighted_Recall: 0.852
Weighted_F1_score: 0.854

Balanced Accuracy:  0.7512893982808022
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       349
           1       0.55      0.60      0.58        70

    accuracy                           0.85       419
   macro avg       0.74      0.75      0.74       419
weighted avg       0.86      0.85      0.85       419



In [None]:
# NOTE(review): the saved metrics of this cell are identical to the
# class_ratio='balanced' run above and to both logistic_regression_grid runs on
# the same data. If class_ratio is forwarded as a class weight, equal results
# would be expected when the over-sampled training set is 1:1 — TODO confirm
# inside logistic_regression, and check that the grid search changes anything.
logistic_regression(X_ros,y_ros,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.852
Weighted_Precision: 0.857
Weighted_Recall: 0.852
Weighted_F1_score: 0.854

Balanced Accuracy:  0.7512893982808022
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       349
           1       0.55      0.60      0.58        70

    accuracy                           0.85       419
   macro avg       0.74      0.75      0.74       419
weighted avg       0.86      0.85      0.85       419



In [None]:
# NOTE(review): the saved output of this cell reports 1.000 for every test-set
# metric, as do the xg_boost, ensemble_stacked and gradient_boosting cells in
# this section, while linear_svm and logistic_regression score far lower.
# Perfect held-out scores are a common symptom of target leakage or train/test
# overlap — TODO confirm how X_test / y_test were built before trusting this.
random_forest(X_ros,y_ros,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_ros,y_ros,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_ros,y_ros,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_ros,y_ros,X_test,y_test)



   Voting Classifier

Accuracy: 0.950
Weighted_Precision: 0.953
Weighted_Recall: 0.950
Weighted_F1_score: 0.946

Balanced Accuracy:  0.85
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       349
           1       1.00      0.70      0.82        70

    accuracy                           0.95       419
   macro avg       0.97      0.85      0.90       419
weighted avg       0.95      0.95      0.95       419



In [None]:
Bagging_Classifier_LR(X_ros,y_ros,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.833
Weighted_Precision: 0.853
Weighted_Recall: 0.833
Weighted_F1_score: 0.841

Balanced Accuracy:  0.7569586573884568
              precision    recall  f1-score   support

           0       0.92      0.87      0.90       349
           1       0.50      0.64      0.56        70

    accuracy                           0.83       419
   macro avg       0.71      0.76      0.73       419
weighted avg       0.85      0.83      0.84       419



In [None]:
Bagging_Classifier_SVM(X_ros,y_ros,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.881
Weighted_Precision: 0.882
Weighted_Recall: 0.881
Weighted_F1_score: 0.881

Balanced Accuracy:  0.7913221449038068
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       349
           1       0.64      0.66      0.65        70

    accuracy                           0.88       419
   macro avg       0.78      0.79      0.79       419
weighted avg       0.88      0.88      0.88       419



In [None]:
gradient_boosting(X_ros,y_ros,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



#### Under-sampling: Tomek links


In [None]:
from imblearn.under_sampling import TomekLinks

# Tomek-links under-sampling, applied to the training split only.
# sampling_strategy="majority" restricts the clean-up to the majority class.
tl = TomekLinks(sampling_strategy="majority")
X_tl, y_tl = tl.fit_resample(X_train, y_train)

In [None]:
# Bundle the Tomek-links-resampled features and labels into a single frame
# (labels stored in a "target" column) so the training set can be exported.
df_train = X_tl.assign(target=y_tl)
# Export is disabled. NOTE(review): the file name below says "over_sampled",
# but Tomek links is an under-sampling method — rename before re-enabling.
#df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_TomekLinks_Random_over_sampled_train.csv')

In [None]:
linear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.883
Weighted_Precision: 0.873
Weighted_Recall: 0.883
Weighted_F1_score: 0.870

Balanced Accuracy:  0.7071019238641014
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       349
           1       0.76      0.44      0.56        70

    accuracy                           0.88       419
   macro avg       0.83      0.71      0.75       419
weighted avg       0.87      0.88      0.87       419



In [None]:
linear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.881
Weighted_Precision: 0.887
Weighted_Recall: 0.881
Weighted_F1_score: 0.883

Balanced Accuracy:  0.8084527220630373
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       349
           1       0.63      0.70      0.66        70

    accuracy                           0.88       419
   macro avg       0.78      0.81      0.79       419
weighted avg       0.89      0.88      0.88       419



In [None]:
# Grid-searched non-linear SVM on the Tomek-links training set.
# NOTE(review): the saved output shows recall 0.00 for class 1 (balanced
# accuracy 0.5), i.e. every test sample is predicted as class 0. Presumably the
# kernel SVM needs scaled features or a wider parameter grid — confirm.
nonlinear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
nonlinear_svm_grid(X_tl,y_tl,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
logistic_regression_grid(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.842
Weighted_Precision: 0.850
Weighted_Recall: 0.842
Weighted_F1_score: 0.846

Balanced Accuracy:  0.7398485468686042
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       349
           1       0.53      0.59      0.55        70

    accuracy                           0.84       419
   macro avg       0.72      0.74      0.73       419
weighted avg       0.85      0.84      0.85       419



In [None]:
logistic_regression_grid(X_tl,y_tl,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.850
Weighted_Precision: 0.838
Weighted_Recall: 0.850
Weighted_F1_score: 0.842

Balanced Accuracy:  0.6870446172738436
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       349
           1       0.56      0.44      0.50        70

    accuracy                           0.85       419
   macro avg       0.73      0.69      0.70       419
weighted avg       0.84      0.85      0.84       419



In [None]:
linear_svm(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.888
Weighted_Precision: 0.888
Weighted_Recall: 0.888
Weighted_F1_score: 0.888

Balanced Accuracy:  0.8013303315595579
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       349
           1       0.66      0.67      0.67        70

    accuracy                           0.89       419
   macro avg       0.80      0.80      0.80       419
weighted avg       0.89      0.89      0.89       419



In [None]:
linear_svm(X_tl,y_tl,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.885
Weighted_Precision: 0.877
Weighted_Recall: 0.885
Weighted_F1_score: 0.879

Balanced Accuracy:  0.7485059353254195
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       349
           1       0.70      0.54      0.61        70

    accuracy                           0.89       419
   macro avg       0.81      0.75      0.77       419
weighted avg       0.88      0.89      0.88       419



In [None]:
logistic_regression(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.842
Weighted_Precision: 0.856
Weighted_Recall: 0.842
Weighted_F1_score: 0.848

Balanced Accuracy:  0.7569791240278346
              precision    recall  f1-score   support

           0       0.92      0.89      0.90       349
           1       0.52      0.63      0.57        70

    accuracy                           0.84       419
   macro avg       0.72      0.76      0.74       419
weighted avg       0.86      0.84      0.85       419



In [None]:
logistic_regression(X_tl,y_tl,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.845
Weighted_Precision: 0.831
Weighted_Recall: 0.845
Weighted_F1_score: 0.836

Balanced Accuracy:  0.6727589029881293
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       349
           1       0.55      0.41      0.47        70

    accuracy                           0.84       419
   macro avg       0.72      0.67      0.69       419
weighted avg       0.83      0.84      0.84       419



In [None]:
# Random forest trained on the Tomek-links training set.
# NOTE(review): the saved output is a perfect 1.000 on the test split, as it is
# for the other tree-based models in this section. Check for leakage (e.g. a
# feature derived from the label) before trusting these numbers.
random_forest(X_tl,y_tl,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_tl,y_tl,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_tl,y_tl,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_tl,y_tl,X_test,y_test)



   Voting Classifier

Accuracy: 0.938
Weighted_Precision: 0.942
Weighted_Recall: 0.938
Weighted_F1_score: 0.932

Balanced Accuracy:  0.8142857142857143
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       349
           1       1.00      0.63      0.77        70

    accuracy                           0.94       419
   macro avg       0.97      0.81      0.87       419
weighted avg       0.94      0.94      0.93       419



In [None]:
Bagging_Classifier_LR(X_tl,y_tl,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.850
Weighted_Precision: 0.844
Weighted_Recall: 0.850
Weighted_F1_score: 0.846

Balanced Accuracy:  0.7098853868194843
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       349
           1       0.56      0.50      0.53        70

    accuracy                           0.85       419
   macro avg       0.73      0.71      0.72       419
weighted avg       0.84      0.85      0.85       419



In [None]:
Bagging_Classifier_SVM(X_tl,y_tl,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.885
Weighted_Precision: 0.876
Weighted_Recall: 0.885
Weighted_F1_score: 0.877

Balanced Accuracy:  0.731375358166189
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       349
           1       0.73      0.50      0.59        70

    accuracy                           0.89       419
   macro avg       0.82      0.73      0.76       419
weighted avg       0.88      0.89      0.88       419



In [None]:
gradient_boosting(X_tl,y_tl,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



#### Under-sampling: Cluster Centroids


In [None]:
from imblearn.under_sampling import ClusterCentroids

# Cluster-centroid under-sampling of the training split only.
# The centroids come from a k-means fit, which is randomly initialised; without
# a fixed seed the resampled set (and every score computed from it below)
# changes from run to run. Pin random_state so Restart & Run All reproduces.
cc = ClusterCentroids(random_state=42)
X_cc, y_cc = cc.fit_resample(X_train,y_train)

In [None]:
# Bundle the cluster-centroid-resampled features and labels into a single frame
# (labels stored in a "target" column) so the training set can be exported.
df_train = X_cc.assign(target=y_cc)
# Export is disabled; uncomment to persist the resampled training set.
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_ClusterCentroids_Random_under_sampled_train.csv')

In [None]:
linear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.754
Weighted_Precision: 0.824
Weighted_Recall: 0.754
Weighted_F1_score: 0.778

Balanced Accuracy:  0.7096807204257061
              precision    recall  f1-score   support

           0       0.92      0.78      0.84       349
           1       0.37      0.64      0.47        70

    accuracy                           0.75       419
   macro avg       0.64      0.71      0.65       419
weighted avg       0.82      0.75      0.78       419



In [None]:
linear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.802
Weighted_Precision: 0.830
Weighted_Recall: 0.802
Weighted_F1_score: 0.813

Balanced Accuracy:  0.7154932460090053
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       349
           1       0.43      0.59      0.50        70

    accuracy                           0.80       419
   macro avg       0.67      0.72      0.69       419
weighted avg       0.83      0.80      0.81       419



In [None]:
# Grid-searched non-linear SVM on the cluster-centroid training set.
# NOTE(review): the saved output shows recall 0.00 for class 0 and 1.00 for
# class 1 (accuracy 0.167), i.e. every test sample is predicted as class 1.
# Presumably the kernel SVM needs scaled features or a wider grid — confirm.
nonlinear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.167
Weighted_Precision: 0.028
Weighted_Recall: 0.167
Weighted_F1_score: 0.048

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       349
           1       0.17      1.00      0.29        70

    accuracy                           0.17       419
   macro avg       0.08      0.50      0.14       419
weighted avg       0.03      0.17      0.05       419



In [None]:
nonlinear_svm_grid(X_cc,y_cc,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.167
Weighted_Precision: 0.028
Weighted_Recall: 0.167
Weighted_F1_score: 0.048

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       349
           1       0.17      1.00      0.29        70

    accuracy                           0.17       419
   macro avg       0.08      0.50      0.14       419
weighted avg       0.03      0.17      0.05       419



In [None]:
logistic_regression_grid(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.699
Weighted_Precision: 0.808
Weighted_Recall: 0.699
Weighted_F1_score: 0.734

Balanced Accuracy:  0.6767294310274253
              precision    recall  f1-score   support

           0       0.91      0.71      0.80       349
           1       0.31      0.64      0.42        70

    accuracy                           0.70       419
   macro avg       0.61      0.68      0.61       419
weighted avg       0.81      0.70      0.73       419



In [None]:
logistic_regression_grid(X_cc,y_cc,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.704
Weighted_Precision: 0.809
Weighted_Recall: 0.704
Weighted_F1_score: 0.738

Balanced Accuracy:  0.6795947605403193
              precision    recall  f1-score   support

           0       0.91      0.72      0.80       349
           1       0.31      0.64      0.42        70

    accuracy                           0.70       419
   macro avg       0.61      0.68      0.61       419
weighted avg       0.81      0.70      0.74       419



In [None]:
linear_svm(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.745
Weighted_Precision: 0.824
Weighted_Recall: 0.745
Weighted_F1_score: 0.771

Balanced Accuracy:  0.7096602537863284
              precision    recall  f1-score   support

           0       0.92      0.76      0.83       349
           1       0.36      0.66      0.46        70

    accuracy                           0.74       419
   macro avg       0.64      0.71      0.65       419
weighted avg       0.82      0.74      0.77       419



In [None]:
linear_svm(X_cc,y_cc,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.788
Weighted_Precision: 0.827
Weighted_Recall: 0.788
Weighted_F1_score: 0.803

Balanced Accuracy:  0.7126074498567335
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       349
           1       0.41      0.60      0.49        70

    accuracy                           0.79       419
   macro avg       0.66      0.71      0.68       419
weighted avg       0.83      0.79      0.80       419



In [None]:
logistic_regression(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.695
Weighted_Precision: 0.807
Weighted_Recall: 0.695
Weighted_F1_score: 0.730

Balanced Accuracy:  0.6738641015145314
              precision    recall  f1-score   support

           0       0.91      0.70      0.79       349
           1       0.30      0.64      0.41        70

    accuracy                           0.69       419
   macro avg       0.61      0.67      0.60       419
weighted avg       0.81      0.69      0.73       419



In [None]:
logistic_regression(X_cc,y_cc,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.695
Weighted_Precision: 0.807
Weighted_Recall: 0.695
Weighted_F1_score: 0.730

Balanced Accuracy:  0.6738641015145314
              precision    recall  f1-score   support

           0       0.91      0.70      0.79       349
           1       0.30      0.64      0.41        70

    accuracy                           0.69       419
   macro avg       0.61      0.67      0.60       419
weighted avg       0.81      0.69      0.73       419



In [None]:
# Random forest trained on the cluster-centroid training set.
# NOTE(review): the saved output is a perfect 1.000 on the test split even
# though the linear models in this section score far lower. Check for leakage
# (e.g. a feature derived from the label) before trusting these numbers.
random_forest(X_cc,y_cc,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_cc,y_cc,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_cc,y_cc,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
voting_classifiers(X_cc,y_cc,X_test,y_test)



   Voting Classifier

Accuracy: 0.959
Weighted_Precision: 0.961
Weighted_Recall: 0.959
Weighted_F1_score: 0.957

Balanced Accuracy:  0.8785714285714286
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       349
           1       1.00      0.76      0.86        70

    accuracy                           0.96       419
   macro avg       0.98      0.88      0.92       419
weighted avg       0.96      0.96      0.96       419



In [None]:
Bagging_Classifier_LR(X_cc,y_cc,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.730
Weighted_Precision: 0.819
Weighted_Recall: 0.730
Weighted_F1_score: 0.759

Balanced Accuracy:  0.7010642652476463
              precision    recall  f1-score   support

           0       0.92      0.74      0.82       349
           1       0.34      0.66      0.45        70

    accuracy                           0.73       419
   macro avg       0.63      0.70      0.64       419
weighted avg       0.82      0.73      0.76       419



In [None]:
Bagging_Classifier_SVM(X_cc,y_cc,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.790
Weighted_Precision: 0.816
Weighted_Recall: 0.790
Weighted_F1_score: 0.801

Balanced Accuracy:  0.6854891526811298
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       349
           1       0.40      0.53      0.46        70

    accuracy                           0.79       419
   macro avg       0.65      0.69      0.66       419
weighted avg       0.82      0.79      0.80       419



In [None]:
gradient_boosting(X_cc,y_cc,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



#### Over-sampling: SMOTE


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE over-sampling of the minority class, applied to the training split only.
# SMOTE draws its synthetic samples at random; pin random_state so the
# resampled set and the scores reported below are reproducible across runs.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_sm, y_sm = smote.fit_resample(X_train,y_train)

In [None]:
# Bundle the SMOTE-resampled features and labels into a single frame
# (labels stored in a "target" column) so the training set can be exported.
df_train = X_sm.assign(target=y_sm)
# Export is disabled; uncomment to persist the resampled training set.
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_SMOTE_Random_over_sampled_train.csv')

In [None]:
linear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.838
Weighted_Precision: 0.875
Weighted_Recall: 0.838
Weighted_F1_score: 0.850

Balanced Accuracy:  0.8112157183790422
              precision    recall  f1-score   support

           0       0.95      0.85      0.90       349
           1       0.51      0.77      0.61        70

    accuracy                           0.84       419
   macro avg       0.73      0.81      0.76       419
weighted avg       0.88      0.84      0.85       419



In [None]:
linear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.876
Weighted_Precision: 0.880
Weighted_Recall: 0.876
Weighted_F1_score: 0.878

Balanced Accuracy:  0.794167007777323
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       349
           1       0.62      0.67      0.64        70

    accuracy                           0.88       419
   macro avg       0.78      0.79      0.78       419
weighted avg       0.88      0.88      0.88       419



In [None]:
# Grid-searched non-linear SVM on the SMOTE training set.
# NOTE(review): the saved output shows recall 0.00 for class 1 (balanced
# accuracy 0.5), i.e. every test sample is predicted as class 0. Presumably the
# kernel SVM needs scaled features or a wider parameter grid — confirm.
nonlinear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
nonlinear_svm_grid(X_sm,y_sm,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
logistic_regression_grid(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.840
Weighted_Precision: 0.847
Weighted_Recall: 0.840
Weighted_F1_score: 0.843

Balanced Accuracy:  0.732705689725747
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       349
           1       0.52      0.57      0.54        70

    accuracy                           0.84       419
   macro avg       0.72      0.73      0.72       419
weighted avg       0.85      0.84      0.84       419



In [None]:
logistic_regression_grid(X_sm,y_sm,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.840
Weighted_Precision: 0.847
Weighted_Recall: 0.840
Weighted_F1_score: 0.843

Balanced Accuracy:  0.732705689725747
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       349
           1       0.52      0.57      0.54        70

    accuracy                           0.84       419
   macro avg       0.72      0.73      0.72       419
weighted avg       0.85      0.84      0.84       419



In [None]:
linear_svm(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.869
Weighted_Precision: 0.881
Weighted_Recall: 0.869
Weighted_F1_score: 0.874

Balanced Accuracy:  0.8069995906672125
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       349
           1       0.59      0.71      0.65        70

    accuracy                           0.87       419
   macro avg       0.76      0.81      0.78       419
weighted avg       0.88      0.87      0.87       419



In [None]:
linear_svm(X_sm,y_sm,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.857
Weighted_Precision: 0.875
Weighted_Recall: 0.857
Weighted_F1_score: 0.864

Balanced Accuracy:  0.7998362668849774
              precision    recall  f1-score   support

           0       0.94      0.89      0.91       349
           1       0.56      0.71      0.63        70

    accuracy                           0.86       419
   macro avg       0.75      0.80      0.77       419
weighted avg       0.88      0.86      0.86       419



In [None]:
logistic_regression(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.842
Weighted_Precision: 0.848
Weighted_Recall: 0.842
Weighted_F1_score: 0.845

Balanced Accuracy:  0.734138354482194
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       349
           1       0.53      0.57      0.55        70

    accuracy                           0.84       419
   macro avg       0.72      0.73      0.73       419
weighted avg       0.85      0.84      0.85       419



In [None]:
logistic_regression(X_sm,y_sm,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.842
Weighted_Precision: 0.848
Weighted_Recall: 0.842
Weighted_F1_score: 0.845

Balanced Accuracy:  0.734138354482194
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       349
           1       0.53      0.57      0.55        70

    accuracy                           0.84       419
   macro avg       0.72      0.73      0.73       419
weighted avg       0.85      0.84      0.85       419



In [None]:
# Random forest trained on the SMOTE training set.
# NOTE(review): the saved output is a perfect 1.000 on the test split, as it is
# for the other tree-based models in this section. Check for leakage (e.g. a
# feature derived from the label) before trusting these numbers.
random_forest(X_sm,y_sm,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
xg_boost(X_sm,y_sm,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
ensemble_stacked(X_sm,y_sm,X_test,y_test)



Ensemble Stacked Classifiers

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
# Voting ensemble trained on the SMOTE training set.
# NOTE(review): the saved notebook shows only the "Voting Classifier" banner for
# this cell and no metrics — re-run it and confirm it completes on this data.
voting_classifiers(X_sm,y_sm,X_test,y_test)



   Voting Classifier



In [None]:
Bagging_Classifier_LR(X_sm,y_sm,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.845
Weighted_Precision: 0.855
Weighted_Recall: 0.845
Weighted_F1_score: 0.849

Balanced Accuracy:  0.7527015963978715
              precision    recall  f1-score   support

           0       0.92      0.89      0.91       349
           1       0.53      0.61      0.57        70

    accuracy                           0.84       419
   macro avg       0.73      0.75      0.74       419
weighted avg       0.86      0.84      0.85       419



In [None]:
Bagging_Classifier_SVM(X_sm,y_sm,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.866
Weighted_Precision: 0.866
Weighted_Recall: 0.866
Weighted_F1_score: 0.866

Balanced Accuracy:  0.7598853868194843
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       349
           1       0.60      0.60      0.60        70

    accuracy                           0.87       419
   macro avg       0.76      0.76      0.76       419
weighted avg       0.87      0.87      0.87       419



In [None]:
gradient_boosting(X_sm,y_sm,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



#### Over-sampling followed by under-sampling: SMOTETomek


In [None]:
from imblearn.combine import SMOTETomek

# Combined resampling of the training split only: SMOTE over-sampling followed by
# Tomek-link cleaning. The test split is left untouched.
# A fixed random_state makes X_smt/y_smt — and therefore every metric computed from
# them in the cells below — reproducible across "Restart Kernel -> Run All".
smt = SMOTETomek(sampling_strategy='auto', random_state=42)
X_smt, y_smt = smt.fit_resample(X_train, y_train)

In [None]:
# Assemble features and label into one frame: a copy of the resampled features
# with the resampled labels attached as a 'target' column (X_smt itself is not modified).
df_train = X_smt.assign(target=y_smt)
# Optional export of the resampled training set (left disabled):
# df_train.to_csv('/content/drive/MyDrive/Omdena_ RadmolAI/Sampled_datasets/imblearn_SMOTETomek_Random_over_under_sampled_train.csv')

In [None]:
# Linear SVM via the "_grid" helper (presumably the grid-searched variant — confirm in its definition)
# on the SMOTETomek-resampled X_smt/y_smt, with class_ratio='balanced'.
linear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.850
Weighted_Precision: 0.876
Weighted_Recall: 0.850
Weighted_F1_score: 0.859

Balanced Accuracy:  0.8069586573884568
              precision    recall  f1-score   support

           0       0.94      0.87      0.91       349
           1       0.54      0.74      0.62        70

    accuracy                           0.85       419
   macro avg       0.74      0.81      0.76       419
weighted avg       0.88      0.85      0.86       419



In [None]:
# Same helper and data as the previous cell, but with class_ratio=None instead of 'balanced'.
linear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.876
Weighted_Precision: 0.876
Weighted_Recall: 0.876
Weighted_F1_score: 0.876

Balanced Accuracy:  0.7770364306180926
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       349
           1       0.63      0.63      0.63        70

    accuracy                           0.88       419
   macro avg       0.78      0.78      0.78       419
weighted avg       0.88      0.88      0.88       419



In [None]:
# Nonlinear SVM helper on the SMOTETomek-resampled data, class_ratio='balanced'.
# NOTE(review): the saved report shows recall 1.00 for class 0 and 0.00 for class 1 (balanced accuracy 0.5),
# i.e. every test row is predicted as class 0. Check feature scaling / kernel parameters inside
# nonlinear_svm_grid before drawing conclusions from this run.
nonlinear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
# Nonlinear SVM helper again, now with class_ratio=None.
# NOTE(review): the saved report is identical to the class_ratio='balanced' run and is equally degenerate
# (class-1 precision and recall both 0.00): the model never predicts class 1 under either setting.
nonlinear_svm_grid(X_smt,y_smt,X_test,y_test,class_ratio=None)



       Nonlinear SVM

Accuracy: 0.833
Weighted_Precision: 0.694
Weighted_Recall: 0.833
Weighted_F1_score: 0.757

Balanced Accuracy:  0.5
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       349
           1       0.00      0.00      0.00        70

    accuracy                           0.83       419
   macro avg       0.42      0.50      0.45       419
weighted avg       0.69      0.83      0.76       419



In [None]:
# Logistic regression via the "_grid" helper (presumably grid-searched — confirm in its definition)
# on the SMOTETomek-resampled data, class_ratio='balanced'.
logistic_regression_grid(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.826
Weighted_Precision: 0.837
Weighted_Recall: 0.826
Weighted_F1_score: 0.831

Balanced Accuracy:  0.718399508800655
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       349
           1       0.48      0.56      0.52        70

    accuracy                           0.83       419
   macro avg       0.69      0.72      0.71       419
weighted avg       0.84      0.83      0.83       419



In [None]:
# Same helper and data with class_ratio=None.
# NOTE(review): the saved output matches the class_ratio='balanced' run to the last digit
# (balanced accuracy 0.718399508800655). That is plausible if the resampled classes are equal in size,
# since 'balanced' weights would then all be 1 — TODO confirm class_ratio is actually forwarded to the estimator.
logistic_regression_grid(X_smt,y_smt,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.826
Weighted_Precision: 0.837
Weighted_Recall: 0.826
Weighted_F1_score: 0.831

Balanced Accuracy:  0.718399508800655
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       349
           1       0.48      0.56      0.52        70

    accuracy                           0.83       419
   macro avg       0.69      0.72      0.71       419
weighted avg       0.84      0.83      0.83       419



In [None]:
# Linear SVM via the plain (non-"_grid") helper on the SMOTETomek-resampled data, class_ratio='balanced'.
linear_svm(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



          Linear SVM

Accuracy: 0.881
Weighted_Precision: 0.877
Weighted_Recall: 0.881
Weighted_F1_score: 0.879

Balanced Accuracy:  0.7684813753581662
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       349
           1       0.66      0.60      0.63        70

    accuracy                           0.88       419
   macro avg       0.79      0.77      0.78       419
weighted avg       0.88      0.88      0.88       419



In [None]:
# Same linear SVM helper with class_ratio=None.
# In the saved reports, class-1 recall is 0.76 here versus 0.60 for the 'balanced' run above.
linear_svm(X_smt,y_smt,X_test,y_test,class_ratio=None)



          Linear SVM

Accuracy: 0.854
Weighted_Precision: 0.880
Weighted_Recall: 0.854
Weighted_F1_score: 0.863

Balanced Accuracy:  0.815534179287761
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       349
           1       0.55      0.76      0.63        70

    accuracy                           0.85       419
   macro avg       0.75      0.82      0.77       419
weighted avg       0.88      0.85      0.86       419



In [None]:
# Logistic regression via the plain (non-"_grid") helper on the SMOTETomek-resampled data, class_ratio='balanced'.
logistic_regression(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



 Logistic Regression

Accuracy: 0.835
Weighted_Precision: 0.846
Weighted_Recall: 0.835
Weighted_F1_score: 0.840

Balanced Accuracy:  0.7355505525992632
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       349
           1       0.51      0.59      0.54        70

    accuracy                           0.84       419
   macro avg       0.71      0.74      0.72       419
weighted avg       0.85      0.84      0.84       419



In [None]:
# Same logistic regression helper with class_ratio=None.
# NOTE(review): the saved output is identical to the 'balanced' run (balanced accuracy 0.7355505525992632),
# so the class_ratio setting made no observable difference on this resampled data.
logistic_regression(X_smt,y_smt,X_test,y_test,class_ratio=None)



 Logistic Regression

Accuracy: 0.835
Weighted_Precision: 0.846
Weighted_Recall: 0.835
Weighted_F1_score: 0.840

Balanced Accuracy:  0.7355505525992632
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       349
           1       0.51      0.59      0.54        70

    accuracy                           0.84       419
   macro avg       0.71      0.74      0.72       419
weighted avg       0.85      0.84      0.84       419



In [None]:
# Random forest helper on the SMOTETomek-resampled data, class_ratio='balanced'.
# NOTE(review): the saved report is a perfect 1.000 on every metric for both classes, while the linear
# models on the same data score ~0.83-0.88. Rule out target leakage or train/test overlap
# before treating this as a real result.
random_forest(X_smt,y_smt,X_test,y_test,class_ratio='balanced')



       Random Forest

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
# XGBoost helper on the SMOTETomek-resampled data.
# NOTE(review): the saved report is again a perfect 1.000 across the board — the same
# leakage / train-test-overlap check applies as for the random forest run above.
xg_boost(X_smt,y_smt,X_test,y_test)



             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



In [None]:
# Stacked-ensemble helper on the SMOTETomek-resampled data.
# NOTE(review): the output saved under this cell is only the title banner — no accuracy or
# classification report was recorded. Re-run and confirm the helper finishes and prints its metrics.
ensemble_stacked(X_smt,y_smt,X_test,y_test)



Ensemble Stacked Classifiers



In [None]:
# Voting ensemble on the SMOTETomek-resampled data.
# NOTE(review): this exact call is repeated under "Recommended Models" with different saved metrics
# (accuracy 0.950 here vs 0.962 there), so results are not reproducible run to run — seed the
# resampler and the estimators before ranking models.
voting_classifiers(X_smt,y_smt,X_test,y_test)



   Voting Classifier

Accuracy: 0.950
Weighted_Precision: 0.953
Weighted_Recall: 0.950
Weighted_F1_score: 0.946

Balanced Accuracy:  0.85
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       349
           1       1.00      0.70      0.82        70

    accuracy                           0.95       419
   macro avg       0.97      0.85      0.90       419
weighted avg       0.95      0.95      0.95       419



In [None]:
# Bagging helper (LR variant, going by its name and printed banner) on the SMOTETomek-resampled data.
Bagging_Classifier_LR(X_smt,y_smt,X_test,y_test)



Bagging Calssifier LR

Accuracy: 0.828
Weighted_Precision: 0.847
Weighted_Recall: 0.828
Weighted_F1_score: 0.836

Balanced Accuracy:  0.7426729431027426
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       349
           1       0.49      0.61      0.54        70

    accuracy                           0.83       419
   macro avg       0.70      0.74      0.72       419
weighted avg       0.85      0.83      0.84       419



In [None]:
# Bagging helper (SVM variant, going by its name and printed banner) on the SMOTETomek-resampled data.
Bagging_Classifier_SVM(X_smt,y_smt,X_test,y_test)



Bagging Calssifier SVM

Accuracy: 0.871
Weighted_Precision: 0.881
Weighted_Recall: 0.871
Weighted_F1_score: 0.875

Balanced Accuracy:  0.8027220630372492
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       349
           1       0.60      0.70      0.64        70

    accuracy                           0.87       419
   macro avg       0.77      0.80      0.78       419
weighted avg       0.88      0.87      0.88       419



In [None]:
# Gradient boosting helper on the SMOTETomek-resampled data.
# NOTE(review): perfect 1.000 on every metric in the saved report, like the random forest and XGBoost
# runs on this data — verify there is no leakage before relying on it.
gradient_boosting(X_smt,y_smt,X_test,y_test)



   Gradient Boosting

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419



## Recommended Models


In [None]:
voting_classifiers(X_train_new_under_sampled,y_train_new_under_sampler,X_test,y_test)
# Recommended model no.1: class-1 recall, balanced accuracy and weighted-average F1-score are the best for this model
# (0.86 / 0.929 / 0.975 in the output saved below; the other two recommended runs score lower on all three).
# NOTE(review): the two training arguments use different suffixes ("_under_sampled" vs "_under_sampler");
# both names are defined outside this section — confirm they refer to the same resampling step.


   Voting Classifier

Accuracy: 0.976
Weighted_Precision: 0.977
Weighted_Recall: 0.976
Weighted_F1_score: 0.975

Balanced Accuracy:  0.9285714285714286
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       349
           1       1.00      0.86      0.92        70

    accuracy                           0.98       419
   macro avg       0.99      0.93      0.95       419
weighted avg       0.98      0.98      0.98       419



In [None]:
voting_classifiers(X_smt,y_smt,X_test,y_test)
# Recommended model no.2: voting ensemble on the SMOTETomek-resampled training data.
# NOTE(review): the same call earlier in the notebook recorded accuracy 0.950 / class-1 recall 0.70,
# versus 0.962 / 0.77 here, so the ranking depends on the run — fix random seeds before finalising it.


   Voting Classifier

Accuracy: 0.962
Weighted_Precision: 0.963
Weighted_Recall: 0.962
Weighted_F1_score: 0.960

Balanced Accuracy:  0.8857142857142857
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       349
           1       1.00      0.77      0.87        70

    accuracy                           0.96       419
   macro avg       0.98      0.89      0.92       419
weighted avg       0.96      0.96      0.96       419



In [None]:
voting_classifiers(X_sm,y_sm,X_test,y_test)
# Recommended model no.3: voting ensemble on X_sm/y_sm (presumably the SMOTE-resampled set — TODO confirm).
# It has the lowest class-1 recall (0.73) and balanced accuracy (0.864) of the three recommended runs.


   Voting Classifier

Accuracy: 0.955
Weighted_Precision: 0.957
Weighted_Recall: 0.955
Weighted_F1_score: 0.952

Balanced Accuracy:  0.8642857142857143
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       349
           1       1.00      0.73      0.84        70

    accuracy                           0.95       419
   macro avg       0.97      0.86      0.91       419
weighted avg       0.96      0.95      0.95       419



In [None]:
xg_boost(X_train,y_train,X_test,y_test)
# Cannot recommend this: it appears overfitted, with 100% scores on all metrics for both classes - Shweta C. 06/04
# NOTE(review): X_train/y_train are the inputs that were fed to the SMOTETomek resampler, i.e. this run uses
# the training data before that resampling. A perfect score on the held-out test rows is more typical of
# leakage (a target-derived feature, or test rows also present in training) than of overfitting — verify.


             XGBoost

Accuracy: 1.000
Weighted_Precision: 1.000
Weighted_Recall: 1.000
Weighted_F1_score: 1.000

Balanced Accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       349
           1       1.00      1.00      1.00        70

    accuracy                           1.00       419
   macro avg       1.00      1.00      1.00       419
weighted avg       1.00      1.00      1.00       419

