imports

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_validate

In [32]:

def preprocess(file_path, split_value, target_column_index, has_header, delete_index_list=None, extraction_list=None):
    """Load a comma-delimited dataset and split it into features/target and train/test parts.

    Parameters
    ----------
    file_path : str or path-like
        Location of the .csv file.
    split_value : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    target_column_index : int
        Position of the target column, counted in the file as loaded
        (before any column is removed).
    has_header : bool
        True when the first row of the file holds the column names,
        False when the file has no header row.
    delete_index_list : list of int, optional
        Positions (same numbering as ``target_column_index``) of columns to
        discard right after loading. ``None`` or an empty list keeps every column.
    extraction_list : list, optional
        Labels of feature columns to remove from the feature matrix.
        ``None`` or an empty list removes nothing.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, x_ds, y_ds)`` where ``x_ds`` / ``y_ds``
        are the full feature matrix and target column before the train/test split.

    Raises
    ------
    ValueError
        If the target column is also listed in ``delete_index_list``.
    """
    # header=0 takes the column names from the first row; header=None numbers the columns.
    df = pd.read_csv(file_path, header=0 if has_header else None, delimiter=",")
    print(f"Dataset shape: {df.shape}")
    if has_header:
        print(f"Column names: {list(df.columns)}")

    # Turn positions into labels up front so that removing columns cannot
    # shift the position the caller gave for the target.
    target_label = df.columns[target_column_index]
    if delete_index_list is None:
        delete_index_list = []
    labels_to_delete = [df.columns[position] for position in delete_index_list]
    if target_label in labels_to_delete:
        raise ValueError(
            f"target column {target_label!r} (index {target_column_index}) "
            "must not appear in delete_index_list"
        )
    if labels_to_delete:
        # DataFrame.drop returns a new frame, so the result has to be kept.
        df = df.drop(columns=labels_to_delete)

    # making target variable split
    x_ds = df.drop(columns=[target_label])  # all remaining columns except the target
    y_ds = df[target_label]                 # target column only

    # making feature extraction: drop the named feature columns, if any were given
    if extraction_list:
        x_ds = x_ds.drop(extraction_list, axis=1)

    # making test train split (fixed seed so the split is repeatable)
    x_train, x_test, y_train, y_test = train_test_split(x_ds, y_ds, test_size=split_value, random_state=10, shuffle= True)
    print(f'shape of x training: {x_train.shape}')
    print(f'shape of x testing: {x_test.shape}')

    return x_train, x_test, y_train, y_test, x_ds, y_ds

In [33]:
def implement(model):
    """Fit ``model`` on the current training split and print hold-out and cross-validation metrics.

    The data is not passed in. The function reads the notebook-level variables
    ``x_train``, ``y_train``, ``x_test``, ``y_test``, ``x_ds`` and ``y_ds``, so a
    ``preprocess(...)`` cell whose results are assigned to those names must have
    been run first.

    Printed output:
      * classification report, accuracy, weighted precision, weighted recall and
        confusion matrix on the hold-out split (``x_test`` / ``y_test``);
      * mean accuracy / weighted precision / weighted recall over a shuffled
        10-fold ``StratifiedKFold`` run on the full ``x_ds`` / ``y_ds``, each
        followed by "+/-" two standard deviations across the folds.

    Note: there is no compile step, no epoch loop, no training-vs-test accuracy
    comparison and no ``.argmax(axis=1)`` decoding here, so this function is not
    appropriate for perceptron-style learning algorithms.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    None
    """
    # initiate ml: train on the training split
    model.fit(x_train, y_train)
    # model evaluation on the hold-out split
    y_pred = model.predict(x_test)

    # Start the report on its own line so the table header stays aligned with its columns.
    print(f'Classification report: \n{classification_report(y_test, y_pred)}')
    print(f"Accuracy score is: {accuracy_score(y_test,y_pred)}")
    print(f"Precision score is: {precision_score(y_test,y_pred, average='weighted')}")
    print(f"Recall score is: {recall_score(y_test,y_pred, average='weighted')}")
    print(f'Confusion matrix: \n{confusion_matrix(y_test, y_pred)}')

    # StratifiedKFold evaluations on the whole dataset (training and test rows together)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted']
    cv_results = cross_validate(model, x_ds, y_ds, cv=skf, scoring=scoring)
    for label, metric in zip(("Accuracy", "Precision", "Recall"), scoring):
        fold_scores = cv_results[f"test_{metric}"]
        print(f"StratifiedKFold CV {label}: {fold_scores.mean():.4f} (+/- {fold_scores.std() * 2:.4f})")
        

Select the dataset!

Run `preprocess` for one of the datasets below (each call overwrites the variables that `implement` reads).

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("beans_kmeans.csv", 0.2, 16, True, [], [])

In [34]:
# Load beans_kmeans.csv with a header row: 20% hold-out, target at column position 16,
# an empty delete-by-position list, and seven named feature columns removed from the feature matrix.
# The six names assigned here are the notebook-level variables that implement() reads.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("beans_kmeans.csv", 0.2, 16, True, [],["ShapeFactor2","Compactness","ShapeFactor3","roundness","MajorAxisLength","Eccentricity","AspectRation"] )

Dataset shape: (13611, 17)
Column names: ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']
shape of x training: (10888, 9)
shape of x testing: (2723, 9)


x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("diabetes_kmeans.csv", 0.2, 8, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("divorce.csv", 0.2, 54, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("parkinsons_kmeans.csv", 0.2, 17, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("rice_binned_kmeans.csv", 0.2, 7, True, [], [])

x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess("wdbc_binned_kmeans.csv", 0.2, 1, True, [], [])

Run the `implement` function for different models

In [35]:
# KNN: k-nearest-neighbours classifier built with its default constructor arguments,
# evaluated through implement() on the variables set by the preprocess cell.
from sklearn.neighbors import KNeighborsClassifier
implement(KNeighborsClassifier())

Classification report:               precision    recall  f1-score   support

    BARBUNYA       0.87      0.83      0.85       233
      BOMBAY       1.00      0.99      1.00       118
        CALI       0.87      0.94      0.91       330
    DERMASON       0.88      0.91      0.89       706
       HOROZ       0.95      0.88      0.92       396
       SEKER       0.92      0.92      0.92       411
        SIRA       0.81      0.79      0.80       529

    accuracy                           0.88      2723
   macro avg       0.90      0.90      0.90      2723
weighted avg       0.88      0.88      0.88      2723

Accuracy score is: 0.884318766066838
Precision score is: 0.88478534286447
Recall score is: 0.884318766066838
Confusion matrix: 
[[194   0  32   0   0   2   5]
 [  0 117   1   0   0   0   0]
 [ 13   0 311   0   3   2   1]
 [  0   0   0 640   0  13  53]
 [  3   0  13   8 350   0  22]
 [  7   0   0   8   0 379  17]
 [  5   0   0  74  15  18 417]]
StratifiedKFold CV Accuracy: 0.886

In [36]:
# AdaBoost: 100 estimators, random_state fixed to 10 (the same seed value used for the
# train/test split and the StratifiedKFold splitter), evaluated through implement().
from sklearn.ensemble import AdaBoostClassifier 
implement(AdaBoostClassifier(n_estimators=100, random_state = 10))

Classification report:               precision    recall  f1-score   support

    BARBUNYA       0.82      0.76      0.79       233
      BOMBAY       1.00      0.92      0.96       118
        CALI       0.78      0.89      0.83       330
    DERMASON       0.82      0.89      0.85       706
       HOROZ       0.94      0.72      0.81       396
       SEKER       0.94      0.73      0.82       411
        SIRA       0.71      0.85      0.77       529

    accuracy                           0.82      2723
   macro avg       0.86      0.82      0.83      2723
weighted avg       0.84      0.82      0.82      2723

Accuracy score is: 0.8229893499816379
Precision score is: 0.8364829381240463
Recall score is: 0.8229893499816379
Confusion matrix: 
[[176   0  47   0   0   0  10]
 [  0 109   9   0   0   0   0]
 [ 28   0 293   0   7   0   2]
 [  0   0   0 629   0   5  72]
 [  2   0  22  19 284   0  69]
 [  6   0   1  74   0 302  28]
 [  3   0   4  49  10  15 448]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


StratifiedKFold CV Accuracy: 0.8041 (+/- 0.0660)
StratifiedKFold CV Precision: 0.8194 (+/- 0.0724)
StratifiedKFold CV Recall: 0.8041 (+/- 0.0660)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [37]:
# SVM: support vector classifier with random_state fixed to 10, evaluated through implement().
# NOTE(review): max_iter=-1 is presumably scikit-learn's "no iteration limit" setting -
# confirm against the SVC documentation for the installed version.
from sklearn.svm import SVC 
implement(SVC(max_iter = -1, random_state=10))

Classification report:               precision    recall  f1-score   support

    BARBUNYA       0.91      0.85      0.88       233
      BOMBAY       1.00      0.99      1.00       118
        CALI       0.86      0.94      0.90       330
    DERMASON       0.91      0.88      0.90       706
       HOROZ       0.96      0.90      0.93       396
       SEKER       0.93      0.91      0.92       411
        SIRA       0.81      0.88      0.84       529

    accuracy                           0.90      2723
   macro avg       0.91      0.91      0.91      2723
weighted avg       0.90      0.90      0.90      2723

Accuracy score is: 0.8979067205288285
Precision score is: 0.9003923774992515
Recall score is: 0.8979067205288285
Confusion matrix: 
[[198   0  28   0   1   0   6]
 [  0 117   1   0   0   0   0]
 [ 11   0 311   0   6   2   0]
 [  0   0   0 622   1  12  71]
 [  0   0  15  10 357   0  14]
 [  8   0   0  12   0 376  15]
 [  1   0   5  37   7  15 464]]
StratifiedKFold CV Accuracy: 0

In [38]:
# Gaussian Naive Bayes: built with its default constructor arguments,
# evaluated through implement() on the variables set by the preprocess cell.
from sklearn.naive_bayes import GaussianNB 
implement(GaussianNB())

Classification report:               precision    recall  f1-score   support

    BARBUNYA       0.82      0.67      0.74       233
      BOMBAY       1.00      1.00      1.00       118
        CALI       0.81      0.89      0.85       330
    DERMASON       0.87      0.85      0.86       706
       HOROZ       0.85      0.80      0.82       396
       SEKER       0.77      0.82      0.79       411
        SIRA       0.77      0.80      0.78       529

    accuracy                           0.82      2723
   macro avg       0.84      0.83      0.83      2723
weighted avg       0.82      0.82      0.82      2723

Accuracy score is: 0.8233565919941241
Precision score is: 0.8248712339668623
Recall score is: 0.8233565919941241
Confusion matrix: 
[[155   0  57   0  13   0   8]
 [  0 118   0   0   0   0   0]
 [ 23   0 295   0  11   1   0]
 [  0   0   0 600   0  50  56]
 [  6   0  14  22 317   0  37]
 [  4   0   0  41   4 336  26]
 [  0   0   0  30  29  49 421]]
StratifiedKFold CV Accuracy: 0

In [39]:
# Random Forests: seeded with random_state=10 (the same seed value the other cells use)
# so that re-running the notebook reproduces the same forest and the same metrics.
from sklearn.ensemble import RandomForestClassifier
implement(RandomForestClassifier(random_state=10))


Classification report:               precision    recall  f1-score   support

    BARBUNYA       0.88      0.83      0.85       233
      BOMBAY       1.00      1.00      1.00       118
        CALI       0.87      0.91      0.89       330
    DERMASON       0.87      0.89      0.88       706
       HOROZ       0.93      0.90      0.92       396
       SEKER       0.91      0.91      0.91       411
        SIRA       0.80      0.78      0.79       529

    accuracy                           0.88      2723
   macro avg       0.89      0.89      0.89      2723
weighted avg       0.88      0.88      0.88      2723

Accuracy score is: 0.876239441792141
Precision score is: 0.8762276543821411
Recall score is: 0.876239441792141
Confusion matrix: 
[[193   0  30   0   1   2   7]
 [  0 118   0   0   0   0   0]
 [ 17   0 301   0   8   2   2]
 [  0   0   0 629   2  15  60]
 [  1   0  13   8 358   0  16]
 [  4   0   0  11   0 375  21]
 [  5   0   2  76  15  19 412]]
StratifiedKFold CV Accuracy: 0.8

In [40]:
# Decision Trees: seeded with random_state=10 (the same seed value the other cells use)
# so that re-running the notebook reproduces the same tree and the same metrics.
from sklearn.tree import DecisionTreeClassifier
implement(DecisionTreeClassifier(random_state=10))

Classification report:               precision    recall  f1-score   support

    BARBUNYA       0.82      0.83      0.82       233
      BOMBAY       1.00      0.99      1.00       118
        CALI       0.87      0.88      0.87       330
    DERMASON       0.83      0.89      0.86       706
       HOROZ       0.91      0.91      0.91       396
       SEKER       0.90      0.90      0.90       411
        SIRA       0.78      0.70      0.74       529

    accuracy                           0.85      2723
   macro avg       0.87      0.87      0.87      2723
weighted avg       0.85      0.85      0.85      2723

Accuracy score is: 0.8549394050679398
Precision score is: 0.8540822806205586
Recall score is: 0.8549394050679398
Confusion matrix: 
[[194   0  26   1   3   3   6]
 [  1 117   0   0   0   0   0]
 [ 26   0 289   0  10   2   3]
 [  0   0   0 627   3  16  60]
 [  1   0  13   7 359   0  16]
 [  6   0   0  17   1 370  17]
 [ 10   0   4 103  19  21 372]]
StratifiedKFold CV Accuracy: 0