# Preprocessing

In [25]:

import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt

In [26]:
# Load the two feature tables (raw and PCA variants) and the class targets.
# Every file carries an "Unnamed: 0" column (presumably the index written by
# a previous DataFrame.to_csv call -- TODO confirm), which is discarded here.
leftover_columns = ["Unnamed: 0"]

input_df = pd.read_csv("input_df.csv").drop(columns=leftover_columns)
input_df_pca = pd.read_csv("input_df_pca.csv").drop(columns=leftover_columns)
output_df_class = pd.read_csv("output_df_class.csv").drop(columns=leftover_columns)

In [27]:
# Feature sets to compare, keyed by the name reported in the results table.
input_dict = {"Not PCA": input_df.copy(), "PCA": input_df_pca.copy()}

target_names = ["bad_class", "good_class", "very_good_class"]

# Indicator columns in label order: the column at position i becomes class i.
class_columns = ["class_bad", "class_good", "class_very_good"]
class_indicators = output_df_class[class_columns]

# A row is treated as unlabeled when its "class_bad" indicator is missing.
is_unlabeled = class_indicators["class_bad"].isna().to_numpy()

# Collapse the indicators into a single label. Only the three indicator
# columns are summed, so any additional column present in the CSV cannot
# leak into the label (a plain row-wise sum over the whole frame would).
encoded = np.zeros(len(class_indicators), dtype="float64")
for class_index, column in enumerate(class_columns):
    encoded += class_index * (class_indicators[column] == 1).to_numpy()

# -1 marks unlabeled samples; this is the value delete_unknown() filters on
# and the marker scikit-learn's semi-supervised estimators use.
Y = np.where(is_unlabeled, -1.0, encoded)

# Results table; one row per (method, input) combination is added later.
df_results = pd.DataFrame(columns=["Method", "Input"])

In [28]:
def delete_unknown(y_test, y_pred):
    """Drop the samples whose true label is unknown (-1) from both arrays.

    The inputs are left untouched: the previous implementation reshaped them
    in place with ``ndarray.resize``, which mutates the caller's arrays and
    raises for arrays that do not own their data (e.g. slices/views).

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True labels; ``-1`` marks a sample without a known label.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    y_t, y_p : numpy.ndarray
        1-D ``float64`` copies of ``y_test`` and ``y_pred`` restricted to the
        positions where ``y_test`` is not ``-1``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_test).reshape(-1)
    y_hat = np.asarray(y_pred).reshape(-1)
    if y_true.shape != y_hat.shape:
        raise ValueError(
            "y_test and y_pred must have the same length, got {} and {}".format(
                y_true.shape[0], y_hat.shape[0]
            )
        )

    # Keep only the positions that carry a real label.
    is_labeled = y_true != -1
    y_t = y_true[is_labeled].astype("float64")
    y_p = y_hat[is_labeled].astype("float64")
    return y_t, y_p

# Self training

In [29]:
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    r2_score,
    f1_score,
    recall_score,
    precision_score,
)

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [30]:
# Number of stratified cross-validation folds.
n_splits = 10

# Per-fold score buffers, one slot per fold. They start as NaN rather than
# uninitialised memory (np.empty) so a fold that is never written cannot
# contribute garbage to the reported mean/std.
scores_f1_micro = np.full(n_splits, np.nan)
scores_f1_macro = np.full(n_splits, np.nan)
scores_precision_macro = np.full(n_splits, np.nan)
scores_recall_macro = np.full(n_splits, np.nan)

scores_accuracy = np.full(n_splits, np.nan)

scores_r2 = np.full(n_splits, np.nan)

# NOTE: an earlier experiment used a class-weighted SVC
# (C=10, gamma='scale', tol=1e-6, class weights {0: 0.4, 1: 0.2, 2: 0.4});
# it is not part of the comparison below.

# Base estimators wrapped by the self-training classifier, keyed by the name
# reported in the results table. Every estimator that accepts a seed is given
# random_state=42 so a full re-run reproduces the same numbers (SVC's
# probability calibration and the decision tree were previously unseeded).
models = {
    "Bagging Classifier": BaggingClassifier(
        random_state=42, max_features=0.8, max_samples=0.8, n_estimators=150
    ),
    "K-Nearest Neighbors Classifier (distance)": KNeighborsClassifier(
        n_neighbors=4, p=1, weights="distance"
    ),
    "K-Nearest Neighbors Classifier (uniform)": KNeighborsClassifier(
        n_neighbors=3, weights="uniform"
    ),
    "Support Vector Classifier (rbf)": SVC(
        C=100.0, probability=True, gamma=0.01, kernel="rbf", random_state=42
    ),
    "Support Vector Classifier (poly)": SVC(
        C=10.0,
        probability=True,
        degree=3,
        gamma=0.5,
        kernel="poly",
        random_state=42,
    ),
    "Support Vector Classifier (linear)": SVC(
        C=10.0, probability=True, kernel="linear", random_state=42
    ),
    "Decision Tree Classifier": DecisionTreeClassifier(
        max_depth=15, min_samples_split=2, random_state=42
    ),
    "Random Forest Classifier": RandomForestClassifier(
        random_state=42, max_depth=10, min_samples_split=5, n_estimators=10
    ),
}


def _mean_std(fold_scores):
    """Format per-fold scores as "mean +- std", both rounded to 3 decimals."""
    return "{} +- {}".format(round(fold_scores.mean(), 3), round(fold_scores.std(), 3))


# Integer class labels in encoding order. Passing them to confusion_matrix
# keeps every fold's matrix 3x3 even when a class is absent from that fold,
# so the fold matrices can always be accumulated.
class_labels = [0, 1, 2]

# The splitter has no per-model state, so it is built once for all runs.
skfolds = StratifiedKFold(n_splits=n_splits)

# Rows are collected here and added to df_results in one step at the end,
# instead of growing the frame row by row through the private
# DataFrame._append API.
result_rows = []

for model_name, base_classifier in models.items():
    self_training_model = SelfTrainingClassifier(
        base_classifier, threshold=0.7,
    )

    for key, X in input_dict.items():

        # Confusion matrix summed over all folds of this (model, input) run.
        conf_mat = np.zeros((len(class_labels), len(class_labels)))
        for fold, (train_index, test_index) in enumerate(skfolds.split(X, Y)):
            X_train = X.iloc[train_index, :]
            y_train = Y[train_index]
            X_test = X.iloc[test_index, :]
            y_test = Y[test_index]

            # y_train still contains the -1 entries: they are the unlabeled
            # samples the self-training wrapper pseudo-labels during fit.
            self_training_model.fit(X_train, y_train)
            y_pred = self_training_model.predict(X_test)

            # Score only on test samples that have a known label.
            y_test, y_pred = delete_unknown(y_test, y_pred)

            scores_r2[fold] = r2_score(y_test, y_pred)
            scores_accuracy[fold] = accuracy_score(y_test, y_pred)
            scores_f1_micro[fold] = f1_score(y_test, y_pred, average="micro")
            scores_f1_macro[fold] = f1_score(y_test, y_pred, average="macro")
            scores_precision_macro[fold] = precision_score(
                y_test,
                y_pred,
                average="macro",
                zero_division=np.nan,
            )
            scores_recall_macro[fold] = recall_score(y_test, y_pred, average="macro")

            # delete_unknown returns float arrays; cast to int so the labels
            # match class_labels exactly.
            conf_mat += confusion_matrix(
                y_test.astype(int), y_pred.astype(int), labels=class_labels
            )

        result_rows.append(
            {
                "Method": model_name,
                "Input": key,
                "R2": _mean_std(scores_r2),
                "Accuracy": _mean_std(scores_accuracy),
                "F1-score (micro)": _mean_std(scores_f1_micro),
                "F1-score (macro)": _mean_std(scores_f1_macro),
                "Precision (macro)": _mean_std(scores_precision_macro),
                "Recall (macro)": _mean_std(scores_recall_macro),
                "Confusion Matrix": conf_mat,
            }
        )

if result_rows:
    new_results = pd.DataFrame(result_rows)
    # Keep any rows already present in df_results; an empty frame is simply
    # replaced so it cannot influence the dtypes of the combined table.
    df_results = (
        new_results
        if df_results.empty
        else pd.concat([df_results, new_results], ignore_index=True)
    )


# "R2" holds strings such as "-0.015 +- 0.265". Sorting them as text is
# lexicographic and orders negative and positive means inconsistently, so the
# sort key is the numeric mean (the text before the first space), best first.
df_results.sort_values(
    "R2",
    key=lambda column: column.str.split(" ").str[0].astype(float),
    ascending=False,
    ignore_index=True,
)

Unnamed: 0,Method,Input,R2,Accuracy,F1-score (micro),F1-score (macro),Precision (macro),Recall (macro),Confusion Matrix
0,Bagging Classifier,Not PCA,-0.015 +- 0.265,0.553 +- 0.076,0.553 +- 0.076,0.494 +- 0.098,0.6 +- 0.126,0.514 +- 0.085,"[[71.0, 83.0, 8.0], [70.0, 216.0, 37.0], [5.0,..."
1,Random Forest Classifier,Not PCA,-0.122 +- 0.538,0.556 +- 0.107,0.556 +- 0.107,0.46 +- 0.121,0.588 +- 0.158,0.483 +- 0.109,"[[47.0, 105.0, 10.0], [41.0, 250.0, 32.0], [15..."
2,K-Nearest Neighbors Classifier (distance),Not PCA,-0.292 +- 0.535,0.517 +- 0.105,0.517 +- 0.105,0.462 +- 0.11,0.536 +- 0.113,0.487 +- 0.096,"[[71.0, 73.0, 18.0], [83.0, 196.0, 44.0], [17...."
3,Support Vector Classifier (rbf),Not PCA,-0.334 +- 0.485,0.509 +- 0.109,0.509 +- 0.109,0.463 +- 0.096,0.547 +- 0.124,0.487 +- 0.089,"[[87.0, 60.0, 15.0], [92.0, 186.0, 45.0], [23...."
4,Decision Tree Classifier,Not PCA,-0.37 +- 0.342,0.487 +- 0.087,0.487 +- 0.087,0.444 +- 0.091,0.49 +- 0.115,0.464 +- 0.094,"[[82.0, 67.0, 13.0], [88.0, 179.0, 56.0], [24...."
5,Random Forest Classifier,PCA,-0.383 +- 0.525,0.477 +- 0.09,0.477 +- 0.09,0.367 +- 0.112,0.469 +- 0.098,0.401 +- 0.093,"[[25.0, 112.0, 25.0], [44.0, 228.0, 51.0], [11..."
6,Support Vector Classifier (linear),Not PCA,-0.44 +- 0.588,0.485 +- 0.128,0.485 +- 0.128,0.438 +- 0.108,0.484 +- 0.133,0.447 +- 0.111,"[[67.0, 70.0, 25.0], [70.0, 193.0, 60.0], [19...."
7,Support Vector Classifier (rbf),PCA,-0.461 +- 0.725,0.456 +- 0.142,0.456 +- 0.142,0.321 +- 0.107,0.466 +- 0.209,0.366 +- 0.115,"[[10.0, 129.0, 23.0], [29.0, 235.0, 59.0], [17..."
8,K-Nearest Neighbors Classifier (distance),PCA,-0.484 +- 0.508,0.486 +- 0.096,0.486 +- 0.096,0.431 +- 0.102,0.492 +- 0.087,0.446 +- 0.092,"[[60.0, 75.0, 27.0], [81.0, 195.0, 47.0], [22...."
9,Bagging Classifier,PCA,-0.494 +- 0.553,0.486 +- 0.107,0.486 +- 0.107,0.415 +- 0.106,0.501 +- 0.131,0.441 +- 0.083,"[[49.0, 78.0, 35.0], [60.0, 201.0, 62.0], [15...."
