imports

In [146]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate

In [147]:

def preprocess(file_path,split_value,target_column_index,has_header,delete_list,extraction_list):
    """this function is used for taking the dataset from .csv file making a test train split
    and x, y split (has header is either True or False)"""

    #reading the .csv
    if has_header:
        df = pd.read_csv(file_path, header=0, delimiter=",")  # First row as header
        print(f"Dataset shape: {df.shape}")
        print(f"Column names: {list(df.columns)}")
    else:
        df = pd.read_csv(file_path, header=None, delimiter=",")  # No header row
        print(f"Dataset shape: {df.shape}")

    
    #making target variable split
    x_ds = df.drop(df.columns[target_column_index], axis=1)  # All except target
    y_ds = df.iloc[:, target_column_index]  # Target column only

    x_ds = x_ds.drop(columns = delete_list + extraction_list)
    #making test train split
    x_train, x_test, y_train, y_test = train_test_split(x_ds, y_ds, test_size=split_value, random_state=10, shuffle= True)
    print(f'shape of x training: {x_train.shape}')
    print(f'shape of x testing: {x_test.shape}')

    return x_train, x_test, y_train, y_test, x_ds, y_ds

In [148]:
def implement(model):
    """this function contains implementation (modeling and fitting) on the dataset with 1 
    initiation and shows the results of the implementations.
    Note: This function doesn't have compile so it is not appoprate to use this function for perceptron learning algs.
    because it doesn't have training vs test accuracy comparison and doesn't have epochs and .argmax(axis=1) """
    #initiante ml
    model.fit(x_train, y_train)
    #model evaluation
    y_pred = model.predict(x_test)

    print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')
    print(f'Classification report: {classification_report(y_test, y_pred)}')
    print(f"Accuracy score is: {accuracy_score(y_test,y_pred):.3f}")
    print(f"Balanced accuracy score is: {balanced_accuracy_score(y_test,y_pred):.3f}")
    print(f"Precision score is: {precision_score(y_test,y_pred, average='weighted'):.3f}")
    print(f"Recall score is: {recall_score(y_test,y_pred, average='weighted'):.3f}")
    
    # StratifiedKFold evaluations
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    scoring = ['balanced_accuracy', 'accuracy', 'precision_weighted', 'recall_weighted']
    cv_results = cross_validate(model, x_ds, y_ds, cv=skf, scoring=scoring)


    print(f"StratifiedKFold CV Balanced Accuracy: {cv_results['test_balanced_accuracy'].mean():.3f}\scriptsize(±{cv_results['test_balanced_accuracy'].std() * 2:.2f})")
    print(f"StratifiedKFold CV Accuracy: {cv_results['test_accuracy'].mean():.3f}\scriptsize(±{cv_results['test_accuracy'].std() * 2:.2f})")
    print(f"StratifiedKFold CV Precision: {cv_results['test_precision_weighted'].mean():.3f}\scriptsize(±{cv_results['test_precision_weighted'].std() * 2:.2f})")
    print(f"StratifiedKFold CV Recall: {cv_results['test_recall_weighted'].mean():.3f}\scriptsize(±{cv_results['test_recall_weighted'].std() * 2:.2f})")
        

  print(f"StratifiedKFold CV Balanced Accuracy: {cv_results['test_balanced_accuracy'].mean():.3f}\scriptsize(±{cv_results['test_balanced_accuracy'].std() * 2:.2f})")
  print(f"StratifiedKFold CV Accuracy: {cv_results['test_accuracy'].mean():.3f}\scriptsize(±{cv_results['test_accuracy'].std() * 2:.2f})")
  print(f"StratifiedKFold CV Precision: {cv_results['test_precision_weighted'].mean():.3f}\scriptsize(±{cv_results['test_precision_weighted'].std() * 2:.2f})")
  print(f"StratifiedKFold CV Recall: {cv_results['test_recall_weighted'].mean():.3f}\scriptsize(±{cv_results['test_recall_weighted'].std() * 2:.2f})")


Select the dataset!

Runs the preprocess for diffrent datasets.

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("beans_kmeans.csv", 0.2, 16, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("beans_kmeans.csv", 0.2, 16, True, [],["ShapeFactor2","Compactness","ShapeFactor3","roundness","MajorAxisLength","Eccentricity","AspectRation"] )

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("diabetes_kmeans.csv", 0.2, 8, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("diabetes_kmeans.csv", 0.2, 8, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("divorce.csv", 0.2, 54, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("divorce.csv", 0.2, 54, True, [], ['Atr32','Atr33','Atr34','Atr35','Atr36','Atr52','Atr54'])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("parkinsons_kmeans.csv", 0.2, 17, True, ['name'], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("parkinsons_kmeans.csv", 0.2, 17, True, ['name'], ['spread1','HNR','PPE','MDVP:APQ'])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("rice_binned_kmeans.csv", 0.2, 7, True, [], ['Eccentricity', 'Major_Axis_Length', 'Perimeter'])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("rice_binned_kmeans.csv", 0.2, 7, True, [], [])

In [149]:
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("wdbc_binned_kmeans.csv", 0.2, 1, True, ['ID'], [])

Dataset shape: (569, 32)
Column names: ['ID', 'Diagnosis', 'Radius_Mean', 'Texture_Mean', 'Perimeter_Mean', 'Area_Mean', 'Smoothness_Mean', 'Compactness_Mean', 'Concavity_Mean', 'Concave_Points_Mean', 'Symmetry_Mean', 'Fractal_Dimension_Mean', 'Radius_SE', 'Texture_SE', 'Perimeter_SE', 'Area_SE', 'Smoothness_SE', 'Compactness_SE', 'Concavity_SE', 'Concave_Points_SE', 'Symmetry_SE', 'Fractal_Dimension_SE', 'Radius_Worst', 'Texture_Worst', 'Perimeter_Worst', 'Area_Worst', 'Smoothness_Worst', 'Compactness_Worst', 'Concavity_Worst', 'Concave_Points_Worst', 'Symmetry_Worst', 'Fractal_Dimension_Worst']
shape of x training: (455, 30)
shape of x testing: (114, 30)


x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("wdbc_binned_kmeans.csv", 0.2, 1, True, ['ID'], [])

Run the implement function for diffrent models

In [150]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
implement(KNeighborsClassifier())

Confusion matrix: 
[[74  1]
 [ 1 38]]
Classification report:               precision    recall  f1-score   support

           B       0.99      0.99      0.99        75
           M       0.97      0.97      0.97        39

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Accuracy score is: 0.982
Balanced accuracy score is: 0.981
Precision score is: 0.982
Recall score is: 0.982
StratifiedKFold CV Balanced Accuracy: 0.965\scriptsize(±0.05)
StratifiedKFold CV Accuracy: 0.970\scriptsize(±0.04)
StratifiedKFold CV Precision: 0.971\scriptsize(±0.04)
StratifiedKFold CV Recall: 0.970\scriptsize(±0.04)


In [151]:
#Ada Boost
from sklearn.ensemble import AdaBoostClassifier 
implement(AdaBoostClassifier(n_estimators=100, random_state = 10))

Confusion matrix: 
[[73  2]
 [ 0 39]]
Classification report:               precision    recall  f1-score   support

           B       1.00      0.97      0.99        75
           M       0.95      1.00      0.97        39

    accuracy                           0.98       114
   macro avg       0.98      0.99      0.98       114
weighted avg       0.98      0.98      0.98       114

Accuracy score is: 0.982
Balanced accuracy score is: 0.987
Precision score is: 0.983
Recall score is: 0.982
StratifiedKFold CV Balanced Accuracy: 0.970\scriptsize(±0.05)
StratifiedKFold CV Accuracy: 0.974\scriptsize(±0.04)
StratifiedKFold CV Precision: 0.975\scriptsize(±0.04)
StratifiedKFold CV Recall: 0.974\scriptsize(±0.04)


In [152]:
#SVM
from sklearn.svm import SVC 
implement(SVC(max_iter = -1, random_state=10))

Confusion matrix: 
[[74  1]
 [ 0 39]]
Classification report:               precision    recall  f1-score   support

           B       1.00      0.99      0.99        75
           M       0.97      1.00      0.99        39

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Accuracy score is: 0.991
Balanced accuracy score is: 0.993
Precision score is: 0.991
Recall score is: 0.991
StratifiedKFold CV Balanced Accuracy: 0.973\scriptsize(±0.05)
StratifiedKFold CV Accuracy: 0.977\scriptsize(±0.04)
StratifiedKFold CV Precision: 0.978\scriptsize(±0.04)
StratifiedKFold CV Recall: 0.977\scriptsize(±0.04)


In [153]:
#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB 
implement(GaussianNB())

Confusion matrix: 
[[71  4]
 [ 1 38]]
Classification report:               precision    recall  f1-score   support

           B       0.99      0.95      0.97        75
           M       0.90      0.97      0.94        39

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114

Accuracy score is: 0.956
Balanced accuracy score is: 0.961
Precision score is: 0.958
Recall score is: 0.956
StratifiedKFold CV Balanced Accuracy: 0.934\scriptsize(±0.07)
StratifiedKFold CV Accuracy: 0.939\scriptsize(±0.07)
StratifiedKFold CV Precision: 0.939\scriptsize(±0.07)
StratifiedKFold CV Recall: 0.939\scriptsize(±0.07)


In [154]:
#Random Forests
from sklearn.ensemble import RandomForestClassifier
implement(RandomForestClassifier())


Confusion matrix: 
[[73  2]
 [ 1 38]]
Classification report:               precision    recall  f1-score   support

           B       0.99      0.97      0.98        75
           M       0.95      0.97      0.96        39

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Accuracy score is: 0.974
Balanced accuracy score is: 0.974
Precision score is: 0.974
Recall score is: 0.974
StratifiedKFold CV Balanced Accuracy: 0.948\scriptsize(±0.08)
StratifiedKFold CV Accuracy: 0.956\scriptsize(±0.07)
StratifiedKFold CV Precision: 0.958\scriptsize(±0.06)
StratifiedKFold CV Recall: 0.956\scriptsize(±0.07)


In [155]:
#Decesion Trees
from sklearn.tree import DecisionTreeClassifier
implement(DecisionTreeClassifier())

Confusion matrix: 
[[70  5]
 [ 6 33]]
Classification report:               precision    recall  f1-score   support

           B       0.92      0.93      0.93        75
           M       0.87      0.85      0.86        39

    accuracy                           0.90       114
   macro avg       0.89      0.89      0.89       114
weighted avg       0.90      0.90      0.90       114

Accuracy score is: 0.904
Balanced accuracy score is: 0.890
Precision score is: 0.903
Recall score is: 0.904
StratifiedKFold CV Balanced Accuracy: 0.925\scriptsize(±0.08)
StratifiedKFold CV Accuracy: 0.930\scriptsize(±0.06)
StratifiedKFold CV Precision: 0.931\scriptsize(±0.07)
StratifiedKFold CV Recall: 0.930\scriptsize(±0.06)
