In [86]:
import pandas as pd
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier 
import matplotlib.pyplot as plt

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

### Import and analyse datasets

In [76]:
# Load the two feature tables (tool-based and CoGrOO-based) from the model data folder.
dataset_tool_sofist, dataset_cogroo_sofist = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/model/dataset_tool_sofist.csv",
        "../data/model/dataset_cogroo_sofist.csv",
    )
)


In [9]:
# Distribution of the `comp1` score (used below as the classification target) in the tool-based dataset.
dataset_tool_sofist['comp1'].value_counts()

comp1
1.0    880
1.5    562
0.5    424
2.0    185
0.0     93
Name: count, dtype: int64

### Pre-processing

In [91]:
# Tool-based dataset, all scores: the label is `comp1` cast to str (so each score is a class);
# the features are every column except the label and the `essay` column.
y1 = dataset_tool_sofist['comp1'].astype(str)
X1 = dataset_tool_sofist.drop(['comp1', 'essay'], axis=1)
train_X1, test_X1, train_y1, test_y1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [90]:
# CoGrOO-based dataset, all scores: the label is `comp1` cast to str (so each score is a class);
# the features are every column except the label and the `essay` column.
y2 = dataset_cogroo_sofist['comp1'].astype(str)
X2 = dataset_cogroo_sofist.drop(['comp1', 'essay'], axis=1)
train_X2, test_X2, train_y2, test_y2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

In [89]:
# Tool-based dataset WITHOUT the essays scored 0.0.
# The comparison must be numeric: `comp1` is a numeric column after read_csv (it is only cast
# to str afterwards), so comparing it against the string '0.0' is True for every row and
# filters nothing. Casting to float keeps the mask correct even if the column arrives as text.
tool_nonzero = dataset_tool_sofist[dataset_tool_sofist['comp1'].astype(float) != 0.0]
X3 = tool_nonzero.drop(columns=['comp1', 'essay'])
y3 = tool_nonzero['comp1'].astype(str)
train_X3, test_X3, train_y3, test_y3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

In [88]:
# CoGrOO-based dataset WITHOUT the essays scored 0.0.
# The comparison must be numeric: `comp1` is a numeric column after read_csv (it is only cast
# to str afterwards), so comparing it against the string '0.0' is True for every row and
# filters nothing. Casting to float keeps the mask correct even if the column arrives as text.
cogroo_nonzero = dataset_cogroo_sofist[dataset_cogroo_sofist['comp1'].astype(float) != 0.0]
X4 = cogroo_nonzero.drop(columns=['comp1', 'essay'])
y4 = cogroo_nonzero['comp1'].astype(str)
train_X4, test_X4, train_y4, test_y4 = train_test_split(X4, y4, test_size=0.2, random_state=42)

### Generating Models

In [66]:
def calculate_metrics(model, _x_train, _y_train, _x_test, _y_test):
    """Evaluate a fitted classifier on its train and test splits.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict``.
    _x_train, _y_train : training features and labels.
    _x_test, _y_test : held-out features and labels.

    Returns
    -------
    list of [name, value] pairs, in this order:
        "train", "test" (both from ``model.score``), "accuracy",
        "f1-macro" and "mean-squared-error" (all on the test split).

    Raises
    ------
    ValueError
        If the labels cannot be interpreted as numbers (needed for the MSE).
    """
    _y_pred = model.predict(_x_test)

    # The labels in this notebook are scores stored as strings ('0.0', '0.5', ...).
    # Convert them explicitly so the squared error is computed on numeric scores
    # instead of relying on implicit coercion of string arrays.
    y_true_numeric = np.asarray(_y_test, dtype=float)
    y_pred_numeric = np.asarray(_y_pred, dtype=float)

    return [
        ["train", model.score(_x_train, _y_train)],
        ["test", model.score(_x_test, _y_test)],
        ["accuracy", accuracy_score(_y_test, _y_pred)],
        ["f1-macro", f1_score(_y_test, _y_pred, average='macro')],
        ["mean-squared-error", mean_squared_error(y_true_numeric, y_pred_numeric)],
    ]

In [112]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


def GetScaledModel(nameOfScaler, nameOfMulticlassPred):
    """Build one scaled multiclass pipeline per base classifier.

    Parameters
    ----------
    nameOfScaler : str
        'standard' (StandardScaler) or 'minmax' (MinMaxScaler).
    nameOfMulticlassPred : str
        'onevsone' (OneVsOneClassifier) or 'onevsrest' (OneVsRestClassifier).

    Returns
    -------
    list of (name, Pipeline)
        One entry per base classifier (KNN, NB, SVM, AB, GBM, RF). ``name`` is
        the scaler name followed by the classifier label; each pipeline is a
        'Scaler' step followed by the wrapped classifier.

    Raises
    ------
    ValueError
        If either name is not one of the supported options.
    """
    if nameOfScaler == 'standard':
        make_scaler = StandardScaler
    elif nameOfScaler == 'minmax':
        make_scaler = MinMaxScaler
    else:
        raise ValueError(
            "nameOfScaler must be 'standard' or 'minmax', got %r" % (nameOfScaler,)
        )

    if nameOfMulticlassPred == 'onevsone':
        multiclass = OneVsOneClassifier
    elif nameOfMulticlassPred == 'onevsrest':
        multiclass = OneVsRestClassifier
    else:
        raise ValueError(
            "nameOfMulticlassPred must be 'onevsone' or 'onevsrest', got %r"
            % (nameOfMulticlassPred,)
        )

    # (label used in the returned name, pipeline step name, estimator class).
    # NOTE: the 'GMB' step name is kept exactly as it was, since parameter grids
    # elsewhere may address the step by that name.
    base_estimators = [
        ('KNN', 'KNN', KNeighborsClassifier),
        ('NB', 'NB', GaussianNB),
        ('SVM', 'SVM', SVC),
        ('AB', 'AB', AdaBoostClassifier),
        ('GBM', 'GMB', GradientBoostingClassifier),
        ('RF', 'RF', RandomForestClassifier),
    ]

    # Each pipeline gets its own scaler instance so fitting one pipeline
    # never overwrites the scaler state of another.
    return [
        (
            nameOfScaler + label,
            Pipeline([('Scaler', make_scaler()), (step_name, multiclass(estimator_cls()))]),
        )
        for label, step_name, estimator_cls in base_estimators
    ]

In [119]:
def BasedLine2(X_train, y_train, models, num_folds=10, scoring='accuracy'):
    """Cross-validate each named model and collect its per-fold scores.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``cross_val_score``.
    models : iterable of (name, estimator) pairs.
    num_folds : int, default 10
        Number of stratified folds.
    scoring : str, default 'accuracy'
        Scoring identifier forwarded to ``cross_val_score``.

    Returns
    -------
    (names, results)
        ``names`` is the list of model names in evaluation order and
        ``results`` the matching list of per-fold score arrays.

    Side effects
    ------------
    Prints each estimator and a "name: mean (std)" summary line.
    """
    results = []
    names = []
    for name, model in models:
        # Built without shuffling, so every model is scored on the same folds.
        kfold = StratifiedKFold(n_splits=num_folds)
        print(model)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    return names, results

In [120]:
# Baseline: min-max scaled, one-vs-one pipelines cross-validated on the tool-based training split.
models = GetScaledModel(nameOfScaler='minmax', nameOfMulticlassPred='onevsone')
names, results = BasedLine2(X_train=train_X1, y_train=train_y1, models=models)
models

Pipeline(steps=[('Scaler', MinMaxScaler()),
                ('KNN', OneVsOneClassifier(estimator=KNeighborsClassifier()))])
minmaxKNN: 0.386648 (0.029816)
Pipeline(steps=[('Scaler', MinMaxScaler()),
                ('NB', OneVsOneClassifier(estimator=GaussianNB()))])
minmaxNB: 0.339416 (0.093299)


[('minmaxKNN',
  Pipeline(steps=[('Scaler', MinMaxScaler()),
                  ('KNN', OneVsOneClassifier(estimator=KNeighborsClassifier()))])),
 ('minmaxNB',
  Pipeline(steps=[('Scaler', MinMaxScaler()),
                  ('NB', OneVsOneClassifier(estimator=GaussianNB()))]))]