# Preprocessing

In [32]:

import sklearn
assert sklearn.__version__ >= "0.20"
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt

In [33]:
# Load the three CSV exports and discard the "Unnamed: 0" column from each.
# NOTE(review): "Unnamed: 0" is presumably the index column written by an
# earlier `to_csv` call -- confirm against the notebook that produced the files.
_csv_frames = {
    csv_name: pd.read_csv(csv_name + ".csv").drop(columns=["Unnamed: 0"])
    for csv_name in ("input_df", "input_df_pca", "output_df_class")
}
input_df = _csv_frames["input_df"]
input_df_pca = _csv_frames["input_df_pca"]
output_df_class = _csv_frames["output_df_class"]

In [34]:
# Feature sets to evaluate, keyed by the label reported in the results table.
input_dict = {"Not PCA": input_df.copy(), "PCA": input_df_pca.copy()}

target_names = ["bad_class", "good_class", "very_good_class"]

# One-hot target columns; a column's position in this list is its class code
# (class_bad -> 0, class_good -> 1, class_very_good -> 2).
class_columns = ["class_bad", "class_good", "class_very_good"]
one_hot = output_df_class[class_columns]

# Sum the class codes over the one-hot columns only, so that any other column
# present in the frame cannot leak into the encoded label.
class_codes = sum(
    code * (one_hot[column] == 1) for code, column in enumerate(class_columns)
)

# Rows whose "class_bad" entry is missing are treated as unlabeled and get -1.
is_unlabeled = one_hot["class_bad"].isna().to_numpy()
Y = np.where(is_unlabeled, -1.0, class_codes.to_numpy().astype(float))

# Accumulates one summary row per (method, input) pair.
df_results = pd.DataFrame(columns=['Method','Input', 'Accuracy','R2'])

In [35]:
def delete_unknown(y_test,y_pred):
    """Drop the samples whose true label is unknown (-1) from both arrays.

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True labels; entries equal to -1 mark samples to discard.
    y_pred : array-like of shape (n_samples,)
        Predictions aligned with ``y_test``.

    Returns
    -------
    tuple of two ndarrays of shape (n_kept, 1)
        The filtered true labels and the matching predictions, as column
        vectors sharing a common dtype.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of samples.

    The inputs are never modified: the columns are built with ``reshape``
    instead of the in-place ``ndarray.resize``, so views, non-owning arrays
    and plain lists are accepted as well.
    """
    true_col = np.asarray(y_test).reshape(-1, 1)
    pred_col = np.asarray(y_pred).reshape(-1, 1)
    # Stack side by side so one boolean mask filters both columns together.
    paired = np.concatenate((true_col, pred_col), axis=1)
    kept = paired[paired[:, 0] != -1]
    return kept[:, [0]], kept[:, [1]]

# Self training

In [36]:
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score ,confusion_matrix,r2_score

In [37]:
# 10-fold stratified cross-validation of a self-training SVC on each input set.
n_splits = 10
scores_accuracy = np.empty(n_splits)
scores_r2 = np.empty(n_splits)

# Class codes, in the order used for the confusion-matrix rows and columns.
class_labels = [0, 1, 2]

class_weight_dict = {0: 0.4,1:0.2,2:0.4}
base_classifier = SVC(probability=True, gamma='scale', random_state=42, class_weight=class_weight_dict, tol = 1e-6)
self_training_model = SelfTrainingClassifier(base_classifier,
                                           threshold=0.7)
skfolds = StratifiedKFold(n_splits=n_splits)

for key, X in input_dict.items() :
    # Confusion matrix summed over all folds for this input set.
    conf_mat = np.zeros((len(class_labels), len(class_labels)))
    for fold, (train_index, test_index) in enumerate(skfolds.split(X, Y)):
        X_train = X.iloc[train_index,:]
        y_train = Y[train_index]
        X_test = X.iloc[test_index,:]
        y_test = Y[test_index]
        self_training_model.fit(X_train, y_train)
        y_pred = self_training_model.predict(X_test)

        # Score only the test samples that actually carry a label (not -1).
        y_test,y_pred = delete_unknown(y_test,y_pred)
        scores_accuracy[fold] = accuracy_score(y_test, y_pred)
        scores_r2[fold] = r2_score(y_test, y_pred)

        # Pinning `labels` keeps every fold's matrix 3x3, even when a class is
        # absent from the fold, so the accumulation below cannot fail on shape.
        matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        conf_mat += matrix

    print(conf_mat)
    summary_row = {'Method': 'Self_training',
                   "Input": key,
                   'Accuracy': '{} +- {}'.format(round(scores_accuracy.mean(), 3), round(scores_accuracy.std(), 3)),
                   'R2': '{} +- {}'.format(round(scores_r2.mean(), 3), round(scores_r2.std(), 3))}
    # Public pd.concat instead of the private DataFrame._append.
    df_results = pd.concat([df_results, pd.DataFrame([summary_row])], ignore_index=True)

df_results

[[ 27.  96.  39.]
 [ 32. 221.  70.]
 [ 11.  76.  75.]]
[[ 32.  91.  39.]
 [ 32. 211.  80.]
 [  9.  73.  80.]]


Unnamed: 0,Method,Input,Accuracy,R2
0,Self_training,Not PCA,0.499 +- 0.138,-0.471 +- 0.757
1,Self_training,PCA,0.498 +- 0.158,-0.453 +- 0.828


# Label propagation

In [38]:
from sklearn.semi_supervised import LabelPropagation

In [39]:
# 10-fold stratified cross-validation of LabelPropagation on each input set.
n_splits = 10
scores_accuracy = np.empty(n_splits)
# Allocated here so the cell does not rely on a `scores_r2` array left over
# from a previously executed cell.
scores_r2 = np.empty(n_splits)

# Class codes, in the order used for the confusion-matrix rows and columns.
class_labels = [0, 1, 2]

lp_model = LabelPropagation(kernel='knn', n_neighbors=30)
skfolds = StratifiedKFold(n_splits=n_splits)

for key, X in input_dict.items() :
    # Confusion matrix summed over all folds for this input set.
    conf_mat = np.zeros((len(class_labels), len(class_labels)))
    for fold, (train_index, test_index) in enumerate(skfolds.split(X, Y)):
        X_train = X.iloc[train_index,:]
        y_train = Y[train_index]
        X_test = X.iloc[test_index,:]
        y_test = Y[test_index]

        lp_model.fit(X_train, y_train)

        y_pred = lp_model.predict(X_test)

        # Score only the test samples that actually carry a label (not -1).
        y_test,y_pred = delete_unknown(y_test,y_pred)
        scores_accuracy[fold] = accuracy_score(y_test, y_pred)
        scores_r2[fold] = r2_score(y_test, y_pred)

        # Pinning `labels` keeps every fold's matrix 3x3, even when a class is
        # absent from the fold, so the accumulation below cannot fail on shape.
        matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        conf_mat += matrix

    print(conf_mat)
    summary_row = {'Method': 'Label_propagation',
                   "Input": key,
                   'Accuracy': '{} +- {}'.format(round(scores_accuracy.mean(), 3), round(scores_accuracy.std(), 3)),
                   'R2': '{} +- {}'.format(round(scores_r2.mean(), 3), round(scores_r2.std(), 3))}
    # Public pd.concat instead of the private DataFrame._append.
    df_results = pd.concat([df_results, pd.DataFrame([summary_row])], ignore_index=True)
df_results

[[ 35. 101.  26.]
 [ 46. 244.  33.]
 [ 13. 120.  29.]]
[[ 24. 115.  23.]
 [ 53. 237.  33.]
 [ 20. 116.  26.]]


Unnamed: 0,Method,Input,Accuracy,R2
0,Self_training,Not PCA,0.499 +- 0.138,-0.471 +- 0.757
1,Self_training,PCA,0.498 +- 0.158,-0.453 +- 0.828
2,Label_propagation,Not PCA,0.475 +- 0.106,-0.414 +- 0.61
3,Label_propagation,PCA,0.443 +- 0.085,-0.516 +- 0.534


# Label spreading

In [40]:
from sklearn.semi_supervised import LabelSpreading

In [31]:
# 10-fold stratified cross-validation of LabelSpreading on each input set.
n_splits = 10
scores_accuracy = np.empty(n_splits)
# Allocated here so the cell does not rely on a `scores_r2` array left over
# from a previously executed cell.
scores_r2 = np.empty(n_splits)

# Class codes, in the order used for the confusion-matrix rows and columns.
class_labels = [0, 1, 2]

lp_model = LabelSpreading(kernel='knn', n_neighbors=30)
skfolds = StratifiedKFold(n_splits=n_splits)

for key, X in input_dict.items() :
    # Confusion matrix summed over all folds for this input set.
    conf_mat = np.zeros((len(class_labels), len(class_labels)))
    for fold, (train_index, test_index) in enumerate(skfolds.split(X, Y)):
        X_train = X.iloc[train_index,:]
        y_train = Y[train_index]
        X_test = X.iloc[test_index,:]
        y_test = Y[test_index]

        lp_model.fit(X_train, y_train)

        y_pred = lp_model.predict(X_test)

        # Score only the test samples that actually carry a label (not -1).
        y_test,y_pred = delete_unknown(y_test,y_pred)
        scores_accuracy[fold] = accuracy_score(y_test, y_pred)
        scores_r2[fold] = r2_score(y_test, y_pred)

        # Pinning `labels` keeps every fold's matrix 3x3, even when a class is
        # absent from the fold, so the accumulation below cannot fail on shape.
        matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        conf_mat += matrix

    print(conf_mat)
    summary_row = {'Method': 'Label_spreading',
                   "Input": key,
                   'Accuracy': '{} +- {}'.format(round(scores_accuracy.mean(), 3), round(scores_accuracy.std(), 3)),
                   'R2': '{} +- {}'.format(round(scores_r2.mean(), 3), round(scores_r2.std(), 3))}
    # Public pd.concat instead of the private DataFrame._append.
    df_results = pd.concat([df_results, pd.DataFrame([summary_row])], ignore_index=True)
df_results

[[ 39.  95.  28.]
 [ 59. 231.  33.]
 [ 13. 118.  31.]]
[[ 29. 110.  23.]
 [ 62. 229.  32.]
 [ 18. 116.  28.]]


Unnamed: 0,Method,Input,Accuracy,R2
0,Self_training,Not PCA,0.499 +- 0.138,-0.471 +- 0.757
1,Self_training,PCA,0.498 +- 0.158,-0.453 +- 0.828
2,Label_propagation,Not PCA,0.475 +- 0.106,-0.414 +- 0.61
3,Label_propagation,PCA,0.443 +- 0.085,-0.516 +- 0.534
4,Label_spreading,Not PCA,0.464 +- 0.11,-0.454 +- 0.646
5,Label_spreading,PCA,0.441 +- 0.09,-0.5 +- 0.534
