# Preprocessing

## Library Imports

In [1]:
import warnings

import IPython
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
warnings.simplefilter("ignore", UserWarning)

## Importing the Data

In [2]:
# Load the raw table; the file uses ';' as its field separator.
df = pd.read_csv("DATA.csv",delimiter=";")
'''
Student ID
1- Student Age (1: 18-21, 2: 22-25, 3: above 26)
2- Sex (1: female, 2: male)
3- Graduated high-school type: (1: private, 2: state, 3: other)
4- Scholarship type: (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full)
5- Additional work: (1: Yes, 2: No)
6- Regular artistic or sports activity: (1: Yes, 2: No)
7- Do you have a partner: (1: Yes, 2: No)
8- Total salary if available (1: USD 135-200, 2: USD 201-270, 3: USD 271-340, 4: USD 341-410, 5: above 410)
9- Transportation to the university: (1: Bus, 2: Private car/taxi, 3: bicycle, 4: Other)
10- Accommodation type in Cyprus: (1: rental, 2: dormitory, 3: with family, 4: Other)
11- Mother's education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)
12- Father's education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)
13- Number of sisters/brothers (if available): (1: 1, 2: 2, 3: 3, 4: 4, 5: 5 or above)
14- Parental status: (1: married, 2: divorced, 3: died - one of them or both)
15- Mother's occupation: (1: retired, 2: housewife, 3: government officer, 4: private sector employee, 5: self-employment, 6: other)
16- Father's occupation: (1: retired, 2: government officer, 3: private sector employee, 4: self-employment, 5: other)
17- Weekly study hours: (1: None, 2: <5 hours, 3: 6-10 hours, 4: 11-20 hours, 5: more than 20 hours)
18- Reading frequency (non-scientific books/journals): (1: None, 2: Sometimes, 3: Often)
19- Reading frequency (scientific books/journals): (1: None, 2: Sometimes, 3: Often)
20- Attendance to the seminars/conferences related to the department: (1: Yes, 2: No)
21- Impact of your projects/activities on your success: (1: positive, 2: negative, 3: neutral)
22- Attendance to classes (1: always, 2: sometimes, 3: never)
23- Preparation to midterm exams 1: (1: alone, 2: with friends, 3: not applicable)
24- Preparation to midterm exams 2: (1: closest date to the exam, 2: regularly during the semester, 3: never)
25- Taking notes in classes: (1: never, 2: sometimes, 3: always)
26- Listening in classes: (1: never, 2: sometimes, 3: always)
27- Discussion improves my interest and success in the course: (1: never, 2: sometimes, 3: always)
28- Flip-classroom: (1: not useful, 2: useful, 3: not applicable)
29- Cumulative grade point average in the last semester (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)
30- Expected Cumulative grade point average in the graduation (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)
31- Course ID
32- OUTPUT Grade (0: Fail, 1: DD, 2: DC, 3: CC, 4: CB, 5: BB, 6: BA, 7: AA)
'''
# Render the loaded frame so its columns can be checked against the codebook string above.
display(df)

Unnamed: 0,STUDENT ID,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,GRADE
0,STUDENT1,2,2,3,3,1,2,2,1,1,...,1,1,3,2,1,2,1,1,1,1
1,STUDENT2,2,2,3,3,1,2,2,1,1,...,1,1,3,2,3,2,2,3,1,1
2,STUDENT3,2,2,2,3,2,2,2,2,4,...,1,1,2,2,1,1,2,2,1,1
3,STUDENT4,1,1,1,3,1,2,1,2,1,...,1,2,3,2,2,1,3,2,1,1
4,STUDENT5,2,2,1,3,2,2,1,3,1,...,2,1,2,2,2,1,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,STUDENT141,2,1,2,3,1,1,2,1,1,...,1,1,2,1,2,1,3,3,9,5
141,STUDENT142,1,1,2,4,2,2,2,1,4,...,1,1,3,2,2,1,5,3,9,5
142,STUDENT143,1,1,1,4,2,2,2,1,1,...,1,1,3,3,2,1,4,3,9,1
143,STUDENT144,2,1,2,4,1,1,1,5,2,...,2,1,2,1,2,1,5,3,9,4


## Training Testing Split
Split the data so that the training set has 100 observations and the testing set has 45 (`test_size=0.31`, with a fixed `random_state` so the split is repeatable).

In [3]:
# Feature matrix: the columns at positions 1..31 (position 0 is skipped; the
# displayed frame shows it to be the student identifier).
X = df[df.columns[1:32]].to_numpy()
# Target vector: the GRADE column.
y = df["GRADE"].to_numpy()
# Hold out 31% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.31,
)

# SVM

In [4]:
from sklearn import svm

## GridSearch for SVM-HyperParameter tuning

In [5]:
# this code block iterates through all combinations of kernels, C values, gamma values, class weight values, and, if applicable, degree values
# then it evaluates the SVM produced with those hyperparameters, both by accuracy and by MAE
def make_SVM_return_eval_and_params(k, c, g, cw, d, folds=10):
    """Cross-validate one SVC configuration on the training split.

    Parameters
    ----------
    k : passed to ``svm.SVC`` as ``kernel``.
    c : passed to ``svm.SVC`` as ``C``.
    g : passed to ``svm.SVC`` as ``gamma``.
    cw : passed to ``svm.SVC`` as ``class_weight``.
    d : passed to ``svm.SVC`` as ``degree``.
    folds : number of (unshuffled) ``KFold`` splits; 10 by default.

    Returns
    -------
    list
        ``[mean accuracy, mean negative MAE, k, c, g, cw, d]`` where both
        means are taken over the validation folds.

    Notes
    -----
    Reads the module-level ``X_train`` and ``y_train``.
    """
    SVM = svm.SVC(random_state=0, kernel=k, C=c, gamma=g, class_weight=cw, degree=d)
    # A single cross_validate pass scores both metrics on the same fitted
    # folds, so each configuration is fit `folds` times instead of 2 * `folds`.
    scores = cross_validate(
        SVM,
        X_train,
        y_train,
        cv=KFold(folds),
        scoring=('accuracy', 'neg_mean_absolute_error'),
    )
    avgAccuracyScore = np.mean(scores['test_accuracy'])
    avgMaeScore = np.mean(scores['test_neg_mean_absolute_error'])
    return [avgAccuracyScore, avgMaeScore, k, c, g, cw, d]
    
# One record per evaluated SVM:
# [mean accuracy, mean negative MAE, kernel, C, gamma, class_weight, degree]
all_SVM_Models = []
for k in ['linear','poly','rbf','sigmoid']: # kernel
    for c in range(1,10): # C
        for g in ['scale','auto']: # gamma
            for cw in ['balanced',None]: # class weight
                # Only the polynomial kernel sweeps the degree (1..10);
                # every other kernel is run once with degree=3, the default.
                for d in (range(1,11) if k == 'poly' else [3]):
                    all_SVM_Models.append(make_SVM_return_eval_and_params(k, c, g, cw, d))

## Model Selection

### Using highest accuracy as performance metric

In [6]:
# Rank every evaluated configuration by its mean cross-validated accuracy
# (position 0 of each record), highest first.
SVM_bestAccuracyFirst = list(all_SVM_Models)
SVM_bestAccuracyFirst.sort(key=lambda record: record[0], reverse=True)
print("When using 10-fold cross validation, the top 10 models (evaluated by highest accuracy) were made with...")
for i,hyperparameters in enumerate(SVM_bestAccuracyFirst[:10]):
    avgAccuracyScore,avgMaeScore,k,c,g,cw,d = hyperparameters
    # Quote the class weight like a string literal unless it is None.
    if cw is None:
        cwSTR = None
    else:
        cwSTR = f"'{cw}'"
    print(f"[{i+1}] kernel='{k}', C={c}, gamma='{g}', class_weight={cwSTR}, degree={d}. This gave an average training accuracy of\t\t\t{avgAccuracyScore}")

When using 10-fold cross validation, the top 10 models (evaluated by highest accuracy) were made with...
[1] kernel='poly', C=1, gamma='auto', class_weight=None, degree=1. This gave an average training accuracy of			0.25
[2] kernel='poly', C=5, gamma='auto', class_weight=None, degree=1. This gave an average training accuracy of			0.25
[3] kernel='poly', C=8, gamma='scale', class_weight=None, degree=1. This gave an average training accuracy of			0.25
[4] kernel='poly', C=1, gamma='scale', class_weight=None, degree=2. This gave an average training accuracy of			0.24000000000000005
[5] kernel='poly', C=7, gamma='scale', class_weight=None, degree=1. This gave an average training accuracy of			0.24
[6] kernel='linear', C=1, gamma='scale', class_weight='balanced', degree=3. This gave an average training accuracy of			0.23000000000000004
[7] kernel='linear', C=1, gamma='auto', class_weight='balanced', degree=3. This gave an average training accuracy of			0.23000000000000004
[8] kernel='poly',

### Using lowest MAE (highest negative MAE) as performance metric

In [7]:
# Rank every evaluated configuration by its mean negative MAE (position 1 of
# each record); the highest value is the smallest absolute error.
SVM_bestMaeFirst = list(all_SVM_Models)
SVM_bestMaeFirst.sort(key=lambda record: record[1], reverse=True)
print("When using 10-fold cross validation, the top 10 models (evaluated by highest negative MAE) were made with...")
for i,hyperparameters in enumerate(SVM_bestMaeFirst[:10]):
    avgAccuracyScore,avgMaeScore,k,c,g,cw,d = hyperparameters
    # Quote the class weight like a string literal unless it is None.
    if cw is None:
        cwSTR = None
    else:
        cwSTR = f"'{cw}'"
    print(f"[{i+1}] kernel='{k}', C={c}, gamma='{g}', class_weight={cwSTR}, degree={d}. This gave an average training MAE of\t\t\t{avgMaeScore}")

When using 10-fold cross validation, the top 10 models (evaluated by highest negative MAE) were made with...
[1] kernel='rbf', C=4, gamma='scale', class_weight=None, degree=3. This gave an average training MAE of			-1.5400000000000003
[2] kernel='rbf', C=5, gamma='scale', class_weight='balanced', degree=3. This gave an average training MAE of			-1.55
[3] kernel='rbf', C=4, gamma='scale', class_weight='balanced', degree=3. This gave an average training MAE of			-1.5699999999999998
[4] kernel='rbf', C=3, gamma='scale', class_weight=None, degree=3. This gave an average training MAE of			-1.57
[5] kernel='linear', C=1, gamma='scale', class_weight='balanced', degree=3. This gave an average training MAE of			-1.5799999999999998
[6] kernel='linear', C=1, gamma='auto', class_weight='balanced', degree=3. This gave an average training MAE of			-1.5799999999999998
[7] kernel='rbf', C=3, gamma='scale', class_weight='balanced', degree=3. This gave an average training MAE of			-1.5899999999999999
[8

## Testing the best 2 models
We will select the best model under each performance metric: the model with the best accuracy will be called "bA" and the model with the best MAE will be called "bM". Each model is then scored for accuracy and for mean absolute error using 10-fold cross-validation on the held-out test split. This is repeated 1000 times with randomly shuffled folds, and the results are averaged so that the reported metrics do not depend on any single fold assignment.

In [8]:
k = 10  # number of folds in every cross-validation repetition


def _svm_from_record(record):
    """Build an unfitted SVC from a search record.

    ``record`` has the layout produced by the grid search:
    ``[mean accuracy, mean negative MAE, kernel, C, gamma, class_weight, degree]``.
    """
    _, _, kernel, C, gamma, class_weight, degree = record
    return svm.SVC(kernel=kernel, C=C, gamma=gamma, class_weight=class_weight, degree=degree)


def _svm_repeated_test_cv(model, n_splits, n_repeats=1000):
    """Average accuracy and negative MAE over repeated shuffled K-fold CV on the test split.

    Repetition ``r`` shuffles the folds with ``random_state=r``, so a fresh
    kernel reproduces the same numbers. Both metrics are scored on the same
    fitted folds in one ``cross_validate`` pass instead of refitting once per
    metric on different random folds.

    Returns
    -------
    tuple
        ``(mean accuracy, mean negative MAE)`` over all repetitions.

    NOTE(review): ``cross_validate`` clones ``model`` and refits the clone on
    folds of ``X_test``/``y_test``; the fit on the training split done by the
    caller is therefore not what gets scored here. Confirm this is the
    intended evaluation protocol.
    """
    accuracies = []
    neg_maes = []
    for seed in range(n_repeats):
        scores = cross_validate(
            model,
            X_test,
            y_test,
            cv=KFold(n_splits, shuffle=True, random_state=seed),
            scoring=('accuracy', 'neg_mean_absolute_error'),
        )
        accuracies.append(np.mean(scores['test_accuracy']))
        neg_maes.append(np.mean(scores['test_neg_mean_absolute_error']))
    return np.mean(accuracies), np.mean(neg_maes)


# Best configuration by negative MAE ("bM"), fitted on the training split.
bM_SVM = _svm_from_record(SVM_bestMaeFirst[0]).fit(X_train, y_train)
SVM_bM_Accuracy, SVM_bM_Mae = _svm_repeated_test_cv(bM_SVM, k)

# Best configuration by accuracy ("bA"), fitted on the training split.
bA_SVM = _svm_from_record(SVM_bestAccuracyFirst[0]).fit(X_train, y_train)
SVM_bA_Accuracy, SVM_bA_Mae = _svm_repeated_test_cv(bA_SVM, k)

### SVM Results

In [9]:
# with 0.31 = test proportion
# Each row: the five tuned hyperparameters (record positions 2..6), then the
# validation and testing accuracy, then the validation and testing MAE.
# The MAE columns hold the negated MAE produced by 'neg_mean_absolute_error'.
svm_bM_record = SVM_bestMaeFirst[0]
svm_bA_record = SVM_bestAccuracyFirst[0]
SVM_results = pd.DataFrame(
    [
        [*svm_bM_record[2:7], svm_bM_record[0], SVM_bM_Accuracy, svm_bM_record[1], SVM_bM_Mae],
        [*svm_bA_record[2:7], svm_bA_record[0], SVM_bA_Accuracy, svm_bA_record[1], SVM_bA_Mae],
    ],
    index=['bM','bA'],
    columns=['kernel','C','gamma','class_weight','degree', 'validation_accuracy', 'testing_accuracy','validation_MAE', 'testing_MAE'],
)
SVM_results

Unnamed: 0,kernel,C,gamma,class_weight,degree,validation_accuracy,testing_accuracy,validation_MAE,testing_MAE
bM,rbf,4,scale,,3,0.23,0.242185,-1.54,-1.99067
bA,poly,1,auto,,1,0.25,0.27333,-1.95,-2.167535


# MLP

In [10]:
from sklearn import neural_network

## Searching through our hyperparameters for good MLP models

In [11]:
def make_MLP_return_eval_and_params(hls, af, s, lrt, rate, folds=10):
    """Cross-validate one MLPClassifier configuration on the training split.

    Parameters
    ----------
    hls : passed to ``MLPClassifier`` as ``hidden_layer_sizes``.
    af : passed as ``activation``.
    s : passed as ``solver``.
    lrt : passed as ``learning_rate``.
    rate : passed as ``learning_rate_init``.
    folds : number of (unshuffled) ``KFold`` splits; 10 by default.

    Returns
    -------
    list
        ``[mean accuracy, mean negative MAE, hls, af, s, lrt, rate]`` where
        both means are taken over the validation folds.

    Notes
    -----
    Reads the module-level ``X_train`` and ``y_train``.
    """
    # A fixed random_state makes the search reproducible across kernel
    # restarts (same convention as the SVM search, which uses random_state=0).
    MLP = neural_network.MLPClassifier(
        hidden_layer_sizes=hls,
        activation=af,
        solver=s,
        learning_rate=lrt,
        learning_rate_init=rate,
        random_state=0,
    )
    # Score both metrics on the same fitted networks in one pass, so the
    # accuracy and the MAE of a record describe the same models and each
    # configuration is trained `folds` times instead of 2 * `folds`.
    scores = cross_validate(
        MLP,
        X_train,
        y_train,
        cv=KFold(folds),
        scoring=('accuracy', 'neg_mean_absolute_error'),
    )
    avgAccuracyScore = np.mean(scores['test_accuracy'])
    avgMaeScore = np.mean(scores['test_neg_mean_absolute_error'])
    return [avgAccuracyScore, avgMaeScore, hls, af, s, lrt, rate]
    
# One record per evaluated MLP:
# [mean accuracy, mean negative MAE, hidden_layer_sizes, activation, solver, learning_rate, learning_rate_init]
all_MLP_Models = []
for hls in [(50,), (100,), (200,)]: #hidden layer sizes
    for af in ['identity','logistic','tanh','relu']: #activation function
        for s in ['lbfgs','sgd','adam']: #solver
            learning_rates = [0.01, 0.001, 0.0001]
            # The learning rate type ('learning_rate' in sklearn params) is only used by
            # solver=='sgd'; the other solvers are run with the default 'constant'.
            lrt_options = ['constant','invscaling','adaptive'] if s == 'sgd' else ['constant']
            # 'learning_rate_init' is only used by solver=='adam' or solver=='sgd';
            # lbfgs is run once with the default 0.001.
            rate_options = learning_rates if s != 'lbfgs' else [0.001]
            for lrt in lrt_options:
                for rate in rate_options:
                    all_MLP_Models.append(make_MLP_return_eval_and_params(hls, af, s, lrt, rate))



### Using highest accuracy as performance metric

In [12]:
# Rank every evaluated configuration by its mean cross-validated accuracy
# (position 0 of each record), highest first.
MLP_bestAccuracyFirst = list(all_MLP_Models)
MLP_bestAccuracyFirst.sort(key=lambda record: record[0], reverse=True)
print("When using 10-fold cross validation, the top 10 models (evaluated by highest accuracy) were made with...")
top_by_accuracy = MLP_bestAccuracyFirst[:10]
for i,hyperparameters in enumerate(top_by_accuracy):
    avgAccuracyScore, avgMaeScore, hls, af, s, lrt, rate = hyperparameters
    print(f"[{i+1}] hidden_layer_sizes={hls}, activation='{af}', solver='{s}' learning_rate='{lrt}', learning_rate_init={rate}. This gave an average training accuracy of\t\t\t{avgAccuracyScore}")

When using 10-fold cross validation, the top 10 models (evaluated by highest accuracy) were made with...
[1] hidden_layer_sizes=(100,), activation='logistic', solver='lbfgs' learning_rate='constant', learning_rate_init=0.001. This gave an average training accuracy of			0.23000000000000004
[2] hidden_layer_sizes=(100,), activation='relu', solver='sgd' learning_rate='adaptive', learning_rate_init=0.001. This gave an average training accuracy of			0.23000000000000004
[3] hidden_layer_sizes=(50,), activation='tanh', solver='adam' learning_rate='constant', learning_rate_init=0.01. This gave an average training accuracy of			0.22000000000000003
[4] hidden_layer_sizes=(100,), activation='tanh', solver='sgd' learning_rate='adaptive', learning_rate_init=0.001. This gave an average training accuracy of			0.21999999999999997
[5] hidden_layer_sizes=(50,), activation='identity', solver='sgd' learning_rate='adaptive', learning_rate_init=0.001. This gave an average training accuracy of			0.2100000000

### Using lowest MAE (highest negative MAE) as performance metric

In [13]:
# Rank every evaluated configuration by its mean negative MAE (position 1 of
# each record); the highest value is the smallest absolute error.
MLP_bestMaeFirst = list(all_MLP_Models)
MLP_bestMaeFirst.sort(key=lambda record: record[1], reverse=True)
print("When using 10-fold cross validation, the top 10 models (evaluated by highest negative MAE) were made with...")
top_by_mae = MLP_bestMaeFirst[:10]
for i,hyperparameters in enumerate(top_by_mae):
    avgAccuracyScore, avgMaeScore, hls, af, s, lrt, rate = hyperparameters
    print(f"[{i+1}] hidden_layer_sizes={hls}, activation='{af}', solver='{s}' learning_rate='{lrt}', learning_rate_init={rate}. This gave an average training MAE of\t\t\t{avgMaeScore}")

When using 10-fold cross validation, the top 10 models (evaluated by highest negative MAE) were made with...
[1] hidden_layer_sizes=(50,), activation='tanh', solver='lbfgs' learning_rate='constant', learning_rate_init=0.001. This gave an average training MAE of			-1.58
[2] hidden_layer_sizes=(200,), activation='relu', solver='sgd' learning_rate='adaptive', learning_rate_init=0.01. This gave an average training MAE of			-1.6
[3] hidden_layer_sizes=(100,), activation='relu', solver='sgd' learning_rate='constant', learning_rate_init=0.01. This gave an average training MAE of			-1.65
[4] hidden_layer_sizes=(50,), activation='tanh', solver='sgd' learning_rate='adaptive', learning_rate_init=0.01. This gave an average training MAE of			-1.6799999999999997
[5] hidden_layer_sizes=(200,), activation='tanh', solver='sgd' learning_rate='adaptive', learning_rate_init=0.01. This gave an average training MAE of			-1.69
[6] hidden_layer_sizes=(200,), activation='logistic', solver='lbfgs' learning_rate

## Testing the best 2 MLP models

In [14]:
k=10  # number of folds in every cross-validation repetition


def _mlp_from_record(record):
    """Build an unfitted MLPClassifier from a search record.

    ``record`` has the layout produced by the MLP search:
    ``[mean accuracy, mean negative MAE, hidden_layer_sizes, activation,
    solver, learning_rate, learning_rate_init]``.
    The estimator is seeded so that its fits are repeatable.
    """
    _, _, hls, af, s, lrt, rate = record
    return neural_network.MLPClassifier(
        hidden_layer_sizes=hls,
        activation=af,
        solver=s,
        learning_rate=lrt,
        learning_rate_init=rate,
        random_state=0,
    )


def _mlp_repeated_test_cv(model, n_splits, n_repeats=1000):
    """Average accuracy and negative MAE over repeated shuffled K-fold CV on the test split.

    Repetition ``r`` shuffles the folds with ``random_state=r``, so a fresh
    kernel reproduces the same numbers. Both metrics are scored on the same
    fitted networks in one ``cross_validate`` pass instead of training a
    separate set of networks per metric. ``error_score='raise'`` is applied
    to every repetition so a failed fit surfaces as an exception rather than
    as a NaN folded into the mean.

    Returns
    -------
    tuple
        ``(mean accuracy, mean negative MAE)`` over all repetitions.

    NOTE(review): ``cross_validate`` clones ``model`` and refits the clone on
    folds of ``X_test``/``y_test``; the fit on the training split done by the
    caller is therefore not what gets scored here. Confirm this is the
    intended evaluation protocol.
    """
    accuracies = []
    neg_maes = []
    for seed in range(n_repeats):
        scores = cross_validate(
            model,
            X_test,
            y_test,
            cv=KFold(n_splits, shuffle=True, random_state=seed),
            scoring=('accuracy', 'neg_mean_absolute_error'),
            error_score='raise',
        )
        accuracies.append(np.mean(scores['test_accuracy']))
        neg_maes.append(np.mean(scores['test_neg_mean_absolute_error']))
    return np.mean(accuracies), np.mean(neg_maes)


# Best configuration by negative MAE ("bM"), fitted on the training split.
bM_MLP = _mlp_from_record(MLP_bestMaeFirst[0]).fit(X_train, y_train)
MLP_bM_Accuracy, MLP_bM_Mae = _mlp_repeated_test_cv(bM_MLP, k)

# Best configuration by accuracy ("bA"), fitted on the training split.
bA_MLP = _mlp_from_record(MLP_bestAccuracyFirst[0]).fit(X_train, y_train)
MLP_bA_Accuracy, MLP_bA_Mae = _mlp_repeated_test_cv(bA_MLP, k)

### MLP Results

In [15]:
# with 0.31 = test proportion
# Each row: the five tuned hyperparameters (record positions 2..6), then the
# validation and testing accuracy, then the validation and testing MAE.
# The MAE columns hold the negated MAE produced by 'neg_mean_absolute_error'.
mlp_bM_record = MLP_bestMaeFirst[0]
mlp_bA_record = MLP_bestAccuracyFirst[0]
MLP_results = pd.DataFrame(
    [
        [*mlp_bM_record[2:7], mlp_bM_record[0], MLP_bM_Accuracy, mlp_bM_record[1], MLP_bM_Mae],
        [*mlp_bA_record[2:7], mlp_bA_record[0], MLP_bA_Accuracy, mlp_bA_record[1], MLP_bA_Mae],
    ],
    index=['bM','bA'],
    columns=['hidden_layer_sizes', 'activation', 'solver', 'learning_rate', 'learning_rate_init', 'validation_accuracy', 'testing_accuracy','validation_MAE', 'testing_MAE'],
)
MLP_results

Unnamed: 0,hidden_layer_sizes,activation,solver,learning_rate,learning_rate_init,validation_accuracy,testing_accuracy,validation_MAE,testing_MAE
bM,"(50,)",tanh,lbfgs,constant,0.001,0.14,0.22699,-1.58,-1.9733
bA,"(100,)",logistic,lbfgs,constant,0.001,0.23,0.256895,-1.76,-1.77231
