imports

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_validate

In [8]:

def preprocess(file_path,split_value,target_column_index,has_header):
    """Load a comma-separated dataset and split it into features/target and train/test parts.

    Parameters
    ----------
    file_path : path of the .csv file to read.
    split_value : forwarded to ``train_test_split`` as ``test_size``.
    target_column_index : positional index of the target column.
    has_header : truthy when the first row of the file holds the column names,
        falsy when the file has no header row.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, x_ds, y_ds)``; ``x_ds`` and ``y_ds``
        are the full feature frame and target column before the train/test split.

    The dataset shape, the column names (only when a header is present) and the
    shapes of the two feature splits are printed along the way.
    """
    # Row 0 supplies the column names only when the file actually has a header.
    header_row = 0 if has_header else None
    df = pd.read_csv(file_path, header=header_row, delimiter=",")
    print(f"Dataset shape: {df.shape}")
    if has_header:
        print(f"Column names: {list(df.columns)}")

    # Separate the target column from the remaining (feature) columns.
    target_label = df.columns[target_column_index]
    x_ds = df.drop(columns=target_label)
    y_ds = df.iloc[:, target_column_index]

    # Shuffled hold-out split with a fixed seed so repeated runs give the same partition.
    x_train, x_test, y_train, y_test = train_test_split(
        x_ds,
        y_ds,
        test_size=split_value,
        random_state=10,
        shuffle=True,
    )
    print(f'shape of x training: {x_train.shape}')
    print(f'shape of x testing: {x_test.shape}')

    return x_train, x_test, y_train, y_test, x_ds, y_ds

Making the target variable numerical (optional)

In [9]:
def implement(model):
    """Fit ``model`` once and print hold-out and cross-validated classification scores.

    The data is not passed in: this function reads the notebook-level variables
    ``x_train``, ``y_train``, ``x_test``, ``y_test``, ``x_ds`` and ``y_ds``, so
    they must already be defined when it is called.

    What it does:
      1. fits ``model`` on the training split and predicts the test split;
      2. prints the classification report, accuracy, weighted precision,
         weighted recall and the confusion matrix for the test split;
      3. runs a shuffled 10-fold ``StratifiedKFold`` cross-validation on the
         full dataset and prints mean (+/- 2 * std) of accuracy, weighted
         precision and weighted recall.

    Note: there is no compile step, no epochs, no training-vs-test accuracy
    comparison and no ``.argmax(axis=1)`` on the predictions, so this helper is
    not appropriate for perceptron-style learning algorithms.

    Returns nothing; all results are printed.
    """
    # Train on the training split, then predict the held-out rows.
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # Each metric is computed immediately before it is printed.
    report = classification_report(y_test, predictions)
    print(f'Classification report: {report}')
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy score is: {accuracy}")
    precision = precision_score(y_test, predictions, average='weighted')
    print(f"Percision score is: {precision}")
    recall = recall_score(y_test, predictions, average='weighted')
    print(f"Recall score is: {recall}")
    matrix = confusion_matrix(y_test, predictions)
    print(f'Confusion matrix: {matrix}')

    # Stratified 10-fold cross-validation over the complete dataset.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    scores = cross_validate(
        model,
        x_ds,
        y_ds,
        cv=folds,
        scoring=['accuracy', 'precision_weighted', 'recall_weighted'],
    )
    cv_accuracy = scores['test_accuracy']
    cv_precision = scores['test_precision_weighted']
    cv_recall = scores['test_recall_weighted']
    print(f"StratifiedKFold CV Accuracy: {cv_accuracy.mean():.4f} (+/- {cv_accuracy.std() * 2:.4f})")
    print(f"StratifiedKFold CV Precision: {cv_precision.mean():.4f} (+/- {cv_precision.std() * 2:.4f})")
    print(f"StratifiedKFold CV Recall: {cv_recall.mean():.4f} (+/- {cv_recall.std() * 2:.4f})")
        

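Run preprocess for different datasets (run only one of these cells at a time: each call overwrites the same x_train/x_test/y_train/y_test/x_ds/y_ds variables that implement reads)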

In [None]:
# Beans dataset: target is the column at position 16, 20% of the rows are held out for testing.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess(
    file_path="beans_kmeans.csv",
    split_value=0.2,
    target_column_index=16,
    has_header=True,
)

In [11]:
# Diabetes dataset: target is the column at position 8, 20% of the rows are held out for testing.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess(
    file_path="diabetes_kmeans.csv",
    split_value=0.2,
    target_column_index=8,
    has_header=True,
)

Dataset shape: (768, 9)
Column names: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
shape of x training: (614, 8)
shape of x testing: (154, 8)


In [None]:
# Divorce dataset: target is the column at position 54, 20% of the rows are held out for testing.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess(
    file_path="divorce.csv",
    split_value=0.2,
    target_column_index=54,
    has_header=True,
)

In [None]:
# Parkinsons dataset: target is the column at position 17, 20% of the rows are held out for testing.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess(
    file_path="parkinsons_kmeans.csv",
    split_value=0.2,
    target_column_index=17,
    has_header=True,
)

In [None]:
# Rice dataset: target is the column at position 7, 20% of the rows are held out for testing.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess(
    file_path="rice_binned_kmeans.csv",
    split_value=0.2,
    target_column_index=7,
    has_header=True,
)

In [None]:
# WDBC dataset: target is the column at position 1, 20% of the rows are held out for testing.
x_train, x_test, y_train, y_test, x_ds, y_ds = preprocess(
    file_path="wdbc_binned_kmeans.csv",
    split_value=0.2,
    target_column_index=1,
    has_header=True,
)

Run the implement function for different models

In [12]:
# K-nearest neighbours, constructed with no arguments (library defaults).
from sklearn.neighbors import KNeighborsClassifier

implement(model=KNeighborsClassifier())

Classification report:               precision    recall  f1-score   support

           0       0.74      0.88      0.80        95
           1       0.72      0.49      0.59        59

    accuracy                           0.73       154
   macro avg       0.73      0.69      0.69       154
weighted avg       0.73      0.73      0.72       154

Accuracy score is: 0.7337662337662337
Percision score is: 0.7323051948051948
Recall score is: 0.7337662337662337
Confusion matrix: [[84 11]
 [30 29]]
StratifiedKFold CV Accuracy: 0.7370 (+/- 0.1250)
StratifiedKFold CV Precision: 0.7328 (+/- 0.1268)
StratifiedKFold CV Recall: 0.7370 (+/- 0.1250)


In [13]:
# AdaBoost with 100 estimators and a fixed random seed.
from sklearn.ensemble import AdaBoostClassifier

implement(model=AdaBoostClassifier(n_estimators=100, random_state=10))



Classification report:               precision    recall  f1-score   support

           0       0.74      0.82      0.78        95
           1       0.65      0.54      0.59        59

    accuracy                           0.71       154
   macro avg       0.70      0.68      0.69       154
weighted avg       0.71      0.71      0.71       154

Accuracy score is: 0.7142857142857143
Percision score is: 0.7084548104956269
Recall score is: 0.7142857142857143
Confusion matrix: [[78 17]
 [27 32]]




StratifiedKFold CV Accuracy: 0.7644 (+/- 0.0719)
StratifiedKFold CV Precision: 0.7651 (+/- 0.0680)
StratifiedKFold CV Recall: 0.7644 (+/- 0.0719)




In [14]:
# Support vector classifier (no kernel argument given) with a fixed random seed.
from sklearn.svm import SVC

implement(model=SVC(max_iter=-1, random_state=10))

Classification report:               precision    recall  f1-score   support

           0       0.75      0.93      0.83        95
           1       0.81      0.51      0.62        59

    accuracy                           0.77       154
   macro avg       0.78      0.72      0.73       154
weighted avg       0.77      0.77      0.75       154

Accuracy score is: 0.7662337662337663
Percision score is: 0.7746157746157747
Recall score is: 0.7662337662337663
Confusion matrix: [[88  7]
 [29 30]]
StratifiedKFold CV Accuracy: 0.7656 (+/- 0.0939)
StratifiedKFold CV Precision: 0.7605 (+/- 0.1050)
StratifiedKFold CV Recall: 0.7656 (+/- 0.0939)


In [15]:
# Gaussian naive Bayes, constructed with no arguments (library defaults).
from sklearn.naive_bayes import GaussianNB

implement(model=GaussianNB())

Classification report:               precision    recall  f1-score   support

           0       0.74      0.86      0.80        95
           1       0.70      0.51      0.59        59

    accuracy                           0.73       154
   macro avg       0.72      0.69      0.69       154
weighted avg       0.72      0.73      0.72       154

Accuracy score is: 0.7272727272727273
Percision score is: 0.7230063044016534
Recall score is: 0.7272727272727273
Confusion matrix: [[82 13]
 [29 30]]
StratifiedKFold CV Accuracy: 0.7539 (+/- 0.0996)
StratifiedKFold CV Precision: 0.7512 (+/- 0.1045)
StratifiedKFold CV Recall: 0.7539 (+/- 0.0996)


In [16]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# A random forest is a stochastic estimator: without a seed every run prints
# different scores. Use the same seed value (10) as the other seeded estimators
# and the data splits so the reported numbers are repeatable.
implement(RandomForestClassifier(random_state=10))


Classification report:               precision    recall  f1-score   support

           0       0.76      0.88      0.82        95
           1       0.74      0.54      0.63        59

    accuracy                           0.75       154
   macro avg       0.75      0.71      0.72       154
weighted avg       0.75      0.75      0.74       154

Accuracy score is: 0.7532467532467533
Percision score is: 0.7519407054290775
Recall score is: 0.7532467532467533
Confusion matrix: [[84 11]
 [27 32]]
StratifiedKFold CV Accuracy: 0.7565 (+/- 0.0935)
StratifiedKFold CV Precision: 0.7536 (+/- 0.0922)
StratifiedKFold CV Recall: 0.7565 (+/- 0.0935)


In [17]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves randomness, so an unseeded tree can give different
# scores from run to run. Use the same seed value (10) as the other seeded
# estimators and the data splits so the reported numbers are repeatable.
implement(DecisionTreeClassifier(random_state=10))

Classification report:               precision    recall  f1-score   support

           0       0.74      0.79      0.76        95
           1       0.62      0.54      0.58        59

    accuracy                           0.69       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.69      0.69      0.69       154

Accuracy score is: 0.6948051948051948
Percision score is: 0.6893547628841746
Recall score is: 0.6948051948051948
Confusion matrix: [[75 20]
 [27 32]]
StratifiedKFold CV Accuracy: 0.6706 (+/- 0.1020)
StratifiedKFold CV Precision: 0.6725 (+/- 0.0983)
StratifiedKFold CV Recall: 0.6706 (+/- 0.1020)
