# Simple ML - Testing simple models

What does it do?
- Reads the csv file created by the Statistics Parser
- Encodes the categorical (string) questionnaire columns as numbers and keeps them alongside the columns with the grades
- Tests a model for each course-column using the LeaveOneOut method and the XGBoost Regressor
- Prints the results in terms of MAE and RMSE

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import LeaveOneOut
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
import math

# Silence every warning for the rest of the session. `Warning` is the base class of
# all warning categories, so this filter hides pandas and other library warnings as
# well, not only the ones emitted while fitting the regressor.
import warnings #needed for this type of classifier
warnings.simplefilter(action='ignore', category=Warning)


In [None]:
# Load the CSV (per the notebook intro, the file created by the Statistics Parser)
# and render the raw table in the notebook.
data = pd.read_csv("csd_2021.csv")
display(data)

In [None]:
# Summary statistics and schema of the raw table.
# A notebook cell only echoes the value of its last expression, so the describe()
# result has to be displayed explicitly; info() prints its report by itself.
display(data.describe())
data.info()

### Rename Columns ###

In [None]:
# Change Courses Names
# Build `course_dict`, the mapping from each 'coursecode' to its 'courseId', from
# the course catalogue JSON. It is used afterwards to rename the course columns.
import json

# Explicit UTF-8: JSON text is UTF-8 by specification, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open("courses_ids_600000014.json", "r", encoding="utf-8") as json_file:
    courses_json = json.load(json_file)

# pd.json_normalize is the supported entry point; importing json_normalize from
# pandas.io.json has been deprecated since pandas 1.0 and was removed in later
# releases.
coded_courses = pd.json_normalize(courses_json['courses'])
del coded_courses['ccoursecode']
course_dict = coded_courses.set_index('coursecode')['courseId'].to_dict()
# display(course_dict)  # uncomment to inspect the mapping

In [None]:
# Short English names for the questionnaire columns (the CSV headers are the
# original Greek question texts).
questionnaire_names = {
    "Ηλικία": "age",
    "Φύλο": "gender",
    "Επέλεξα τη σχολή μου διότι:": "reason",
    "Κατά μέσο όρο την εβδομάδα, διαβάζω:": "study_time",
    "Μέσα στο εξάμηνο, παρακαλουθώ:": "lectures",
    "Υπήρξε ανάγκη για φροντηστηριακή βοήθεια σε κάποιο μάθημα έως τώρα;": "private",
    "Μετά το πτυχίο, θα ήθελα να ακολουθήσω:": "postgraduate",
    "Ποιο από τα παρακάτω ισχύει;": "roomates",
    "Η σχολή απέχει από το σπίτι μου:": "distance",
    "Ασχολούμαι εβδομαδιαία με:": "hobbies",
}
data.rename(columns=questionnaire_names, inplace=True)

# Course columns: replace every course code by its course id (as a stripped
# string), one rename per entry of course_dict.
for code, course_id in course_dict.items():
    data.rename(columns={code: str(course_id).strip()}, inplace=True)
display(data)

### Categorical Values Handling - Function ###

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer


def handle_categorical(data):
    """Encode the questionnaire answers numerically and re-attach the course columns.

    The first ten columns of ``data`` are treated as student characteristics and
    all remaining columns as courses. Among the characteristics:

    * "study_time", "lectures", "postgraduate", "distance", "gender" and
      "private" are replaced by numeric codes,
    * "roomates" is one-hot encoded,
    * "hobbies" and "reason" (comma separated multi-answers) are expanded into
      one indicator column per answer; any hobby outside the known list is
      collapsed into a single "other" answer first.

    The input frame is not modified.

    Parameters:
        data: DataFrame whose first ten columns contain the renamed
            questionnaire columns listed above.

    Returns:
        A tuple ``(full_data, characteristic_columns, course_columns)``: the
        encoded characteristics concatenated with the untouched course columns,
        followed by the column indexes of the two parts.
    """
    # Let's split to characteristics and courses. The characteristics are copied
    # so that the encoding below never writes into the caller's frame.
    data_characteristics = data.iloc[:, :10].copy()
    data_courses = data.iloc[:, 10:]

    # Answer -> code tables. The first four are ordered categories whose codes
    # keep the ordering of the answers; gender and private are binary flags.
    answer_codes = {
        "study_time": {"0 - 2 ώρες": 0.0, "2 - 5 ώρες": 1.0, "> 5 ώρες": 2.0},
        "lectures": {"Λιγότερες από τις μισές διαλέξεις": 0.0,
                     "Περίπου τις μισές διαλέξεις": 1.0,
                     "Παραπάνω από τις μισές διαλέξεις": 2.0,
                     "Όλες τις διαλέξεις": 3.0},
        "postgraduate": {"Τίποτα από τα δύο": 0.0,
                         "Μεταπτυχιακές Σπούδες": 1.0,
                         "Διδακτορικές Σπουδές": 2.0},
        "distance": {"< 10 λεπτά": 0.0,
                     "10 - 25 λεπτά": 1.0,
                     "25 - 45 λεπτά": 2.0,
                     "> 45 λεπτά": 3.0},
        "gender": {"Κορίτσι": 1, "Αγόρι": 0},
        "private": {"Ναι": 0, "Όχι": 1},
    }
    for column, codes in answer_codes.items():
        # to_numeric raises if an answer outside the table is still present
        data_characteristics[column] = pd.to_numeric(
            data_characteristics[column].replace(codes)
        )

    # One-hot encoder columns (only roomates)
    ohe_columns = ["roomates"]
    full_pipeline = ColumnTransformer([
        ('one_hot', OneHotEncoder(), ohe_columns)
    ])
    roomates = full_pipeline.fit_transform(data_characteristics)

    # NOTE(review): the English names here and for hobbies/reasons below are
    # attached by position. They are only right if the encoder finds exactly
    # these categories, in this order - confirm against the fitted encoder's
    # categories / classes_ when the survey data changes.
    roomates_df = pd.DataFrame(roomates.toarray(),
                               columns=['family', 'alone', 'friend', 'siblings'],
                               index=data_characteristics.index,
                               dtype=np.int8)
    data_characteristics_updated = pd.concat(
        [data_characteristics.drop("roomates", axis=1), roomates_df], axis=1
    )

    # Convert string cells with multiple values to lists
    # (there is a space after each comma in the answers).
    acceptable_hobbies = ["Σειρές / Ταινίες", "Αθλητισμό", "Video Games", "Ξένη γλώσσα", "Εθελοντισμός"]
    hobbies_per_student = [
        [hobby if hobby in acceptable_hobbies else "Άλλο" for hobby in answer.split(", ")]
        for answer in data_characteristics["hobbies"]
    ]
    reasons_per_student = [answer.split(", ") for answer in data_characteristics["reason"]]

    # Multilabel Binarizer - fitted once, after every row has been converted
    hobbies = MultiLabelBinarizer().fit_transform(hobbies_per_student)
    reasons = MultiLabelBinarizer().fit_transform(reasons_per_student)

    # Convert to data frames and concat, replacing the raw text columns
    hobbies_df = pd.DataFrame(hobbies,
                              columns=["vgames", "other", "sports", "volunteer", "languange", "movies"],
                              index=data_characteristics.index)
    data_characteristics_updated = pd.concat(
        [data_characteristics_updated.drop(["hobbies"], axis=1), hobbies_df], axis=1
    )

    reasons_df = pd.DataFrame(reasons,
                              columns=["quality", "choice", "subject", "parents", "career"],
                              index=data_characteristics.index)
    data_characteristics_updated = pd.concat(
        [data_characteristics_updated.drop(["reason"], axis=1), reasons_df], axis=1
    )

    full_data = pd.concat([data_characteristics_updated, data_courses], axis=1)

    return full_data, data_characteristics_updated.columns, data_courses.columns




### Handle Categorical ###


In [None]:
# Encode the questionnaire answers, then show which columns ended up on the
# characteristics side and which on the courses side.
data_updated, characteristics_cols, courses_cols = handle_categorical(data)
for column_index in (characteristics_cols, courses_cols):
    display(column_index)

### Creating a dictionary with courses as keys and, as values, the number of students whose value for that course is greater than -1 ###

In [None]:
# For every course column, count the rows whose value is greater than -1 and
# store the count in new_dict. `mean` is the average count over the courses that
# have at least one such row.
new_dict = {}
totalSum = 0
iterations = 0
for column in courses_cols:
    # Summing the boolean mask counts the matching rows without building a
    # filtered copy of the frame; int() keeps plain Python ints in the dict.
    temp = int((data_updated[column] > -1).sum())
    if temp != 0:
        totalSum = totalSum + temp
        iterations = iterations + 1
    new_dict[column] = temp
# No course with a qualifying row: report NaN instead of dividing by zero
mean = totalSum / iterations if iterations else float("nan")
print(mean)
print(new_dict)

In [None]:
# Courses we are going to use for the model: those whose count in new_dict
# is above 10.
alist = [course for course, n_students in new_dict.items() if n_students > 10]
display(alist)
print("length:",len(alist))

In [None]:
# Select those course columns
# Target option 1: predict only the courses collected in `alist`.
course_columns = alist
print(course_columns)

### Use all courses

In [None]:
# Selecting all courses
# Target option 2: this rebinds `course_columns`, so when the notebook is run
# top to bottom it replaces the `alist` selection made in the previous cell.
course_columns = courses_cols
print(course_columns)

### Only Courses ###

In [None]:
# Using only the courses
# Feature option 1: `selected_columns` is rebound by each of the three feature
# cells, so only the one executed last takes effect.
selected_columns = courses_cols
print(selected_columns)

### Only Characteristics ###

In [None]:
# Using only the user characteristics
# Feature option 2: rebinds `selected_columns` to the encoded questionnaire
# columns returned by handle_categorical.
selected_columns = characteristics_cols
print(selected_columns)

### Characteristics + Courses ###


In [None]:
# Feature option 3: characteristics followed by courses, as one plain list.
# Being the last of the feature cells, this is the selection in effect when the
# notebook is run top to bottom.
selected_columns = characteristics_cols.to_list() + courses_cols.to_list()
print(selected_columns)

In [None]:
# Select only those columns
# All rows of the encoded table, restricted to the currently chosen feature columns.
data_selected = data_updated.loc[:,selected_columns]


In [None]:
# Leave-one-out evaluation: for every target course, fit an XGBoost regressor on
# all rows but one, predict the held-out row, and report MAE and RMSE per course
# plus their averages over all target courses.
sum_error = 0
sum_squared_error = 0
for course_selected in course_columns: #For each course
    print(course_selected)
    errors = []
    sq_errors = []
    # The target must not be among its own features
    if course_selected in data_selected:
        X = data_selected.drop(course_selected, axis=1)
    else:
        X = data_selected
    # NOTE(review): unlike the persistence cell, which keeps only rows with
    # `>= 0` for the target course, every row is used as a target here -
    # confirm that this is intended.
    y = data.loc[:, course_selected]
    loo = LeaveOneOut()
    xgb = XGBRegressor(objective='reg:squarederror')
    for train_index, test_index in loo.split(X):
        # split() yields positions, so both X and y are indexed positionally;
        # plain y[...] would look the numbers up as index labels instead.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        xgb.fit(X_train, y_train)
        predictions = xgb.predict(X_test)
        errors.append(mean_absolute_error(y_test, predictions))
        sq_errors.append(mean_squared_error(y_test, predictions))

    # Each fold holds one row, so the mean over folds is the course-level
    # MAE / MSE; the square root of the latter is the RMSE.
    course_mae = np.mean(errors)
    course_rmse = np.mean(sq_errors) ** (1/2)
    sum_error += course_mae
    sum_squared_error += course_rmse
    print("MAE:" + str(course_mae))
    print("RMSE:" + str(course_rmse))

# Averages over the target courses; skipped when no course was selected so the
# cell does not end in a division by zero.
n_courses = len(course_columns)
if n_courses:
    print("Mean MAE:" + str(sum_error / n_courses))
    print("Mean RMSE:" + str(sum_squared_error / n_courses))


### Correlation Table ###

In [None]:

# Pearson correlation between every pair of selected columns, drawn as an
# annotated heat map.
corrmat = data_selected.corr(method="pearson")
top_corr_features = corrmat.index
plt.figure(figsize=(35, 35))
# Plot the matrix computed above rather than recomputing the same correlations
# from the same columns.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")
plt.show()

## Persistence ##  

In [None]:
import pickle
from pathlib import Path

# Train one final model per target course and pickle it under models/version2/.
# Only the rows whose value for that course is >= 0 are used for fitting.
model_dir = Path("models/version2")
# Create the output folder up front so the first dump cannot fail on a missing path
model_dir.mkdir(parents=True, exist_ok=True)

for prediction_course in course_columns:
    data_persistence_selected = data_selected[data_selected[prediction_course] >= 0]
    X = data_persistence_selected.drop(prediction_course, axis=1)
    y = data_persistence_selected.loc[:, prediction_course]
    xgb_model = XGBRegressor(objective='reg:squarederror')
    xgb_model.fit(X, y)
    # The with-block closes (and flushes) the file even if pickling fails
    with open(model_dir / f"{prediction_course}.dat", "wb") as model_file:
        pickle.dump(xgb_model, model_file)

In [None]:
# Feature names recorded by `xgb_model`, i.e. the model fitted most recently
# (the last course of the persistence loop).
xgb_model.get_booster().feature_names