In [None]:
!pip install groupyr
import os
from google.colab import drive

import numpy as np
import pandas as pd
import groupyr as gl
import sklearn as sk
import sklearn.multiclass as MC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
#Input Data
def import_data(data_path='data.csv'):
    """Mount Google Drive, switch to the project folder and load the dataset.

    Parameters
    ----------
    data_path : str, default 'data.csv'
        Location of the semicolon-delimited CSV file. A relative path is
        resolved against the project folder that this function changes into.

    Returns
    -------
    pandas.DataFrame
        The raw table as read by ``pd.read_csv``.
    """
    drive.mount('/content/drive')
    os.chdir("/content/drive/My Drive/CSC2515/FinalProject")
    # The previous absolute path '/data.csv' pointed at the filesystem root and
    # therefore ignored the chdir above; a relative path picks the file up from
    # the project folder.
    kid_data = pd.read_csv(data_path, delimiter=';')
    return kid_data

In [None]:
#Data Processing
def mark_as_categorical(dataframe: pd.DataFrame, category: str):
    """Re-type the column named ``category`` to the pandas ``category`` dtype.

    The conversion is written back into ``dataframe`` itself; nothing is returned.
    """
    converted = dataframe[category].astype('category')
    dataframe[category] = converted

def get_categories(dataframe: pd.DataFrame):
    """Return the labels of all columns whose dtype is ``category``, as a list."""
    categorical_part = dataframe.select_dtypes(include="category")
    return list(categorical_part.columns)

def create_groups_from_1hot(data_frame: pd.DataFrame):
    """One-hot encode ``data_frame`` and record which output columns belong to each input column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to encode with ``pd.get_dummies``.

    Returns
    -------
    expanded : pandas.DataFrame
        The result of ``pd.get_dummies(data_frame)``.
    groups : dict
        Maps every column label of ``data_frame`` to a numpy integer array with
        the positional indices of the columns of ``expanded`` derived from it.

    Raises
    ------
    KeyError
        If a column of ``expanded`` cannot be traced back to a source column.
    """
    expanded = pd.get_dummies(data_frame)
    members = {col: [] for col in data_frame}

    # Source columns that no longer appear under their own name were replaced by
    # dummy columns named "<source>_<value>". Try the longest names first so a
    # source called "a_b" is preferred over one called "a".
    encoded = sorted(
        (col for col in data_frame.columns if col not in expanded.columns),
        key=lambda name: len(str(name)),
        reverse=True,
    )

    for idx, col in enumerate(expanded):
        if col in members:
            # Column was passed through unchanged.
            owner = col
        else:
            # Match on the full source name instead of splitting at the first
            # underscore, which broke for source names containing "_".
            owner = next(
                (src for src in encoded if str(col).startswith(f"{src}_")),
                None,
            )
            if owner is None:
                raise KeyError(f"cannot map encoded column {col!r} to a source column")
        members[owner].append(idx)

    # dtype=int keeps empty groups usable as index arrays.
    groups = {k: np.array(v, dtype=int) for k, v in members.items()}
    return expanded, groups

def expand_data(data):
    """Encode the target, scale numeric features and one-hot encode categorical ones.

    The input frame is not modified; all work happens on a converted copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Target`` column and every categorical column listed below.

    Returns
    -------
    expanded_X : pandas.DataFrame
        One-hot encoded features (``Target`` removed).
    group_idxs : dict
        Source column -> indices of the derived columns in ``expanded_X``,
        as returned by ``create_groups_from_1hot``.
    target : pandas.Series
        ``Target`` coded as floats: Dropout=0, Graduate=1, Enrolled=2.
    """
    categorical_columns = [
        "MaritalStatus",
        "ApplicationMode",
        "ApplicationOrder",
        "TimeOfDay",
        "PreviousQualification",
        "Nationality",
        "MotherQualification",
        "FatherQualification",
        "MotherOccupation",
        "FatherOccupation",
        "Course",
    ]
    target_codes = {'Dropout': 0, 'Graduate': 1, 'Enrolled': 2}

    # astype returns a new frame, so the caller's DataFrame is left untouched
    # (previously the dtype changes and the scaling leaked back into the input).
    data = data.astype({name: 'category' for name in categorical_columns})

    # Values outside the mapping are passed through unchanged, so an unexpected
    # label still surfaces as an error in the float conversion.
    target = data["Target"].map(lambda label: target_codes.get(label, label)).astype(float)

    # Scaling: divide each numeric column by its maximum.
    # NOTE(review): the maximum is taken over all rows, i.e. before the
    # train/test split performed by the caller - confirm this is acceptable.
    for col in data.select_dtypes(include=["float64", 'int'], exclude="category"):
        col_max = data[col].max()
        if pd.isna(col_max) or col_max == 0:
            # Dividing by zero/NaN would fill the column with NaN or inf.
            continue
        data[col] = data[col] / col_max

    # One-hot encoding and grouping
    expanded_X, group_idxs = create_groups_from_1hot(data.drop(columns="Target"))

    return expanded_X, group_idxs, target

def stage_data(data, stage, random_state=1):
    """Build the train/test split available at a given point in the student's studies.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset including the ``Target`` column. It is not modified.
    stage : {'registration', 'sem1', 'sem2'}
        ``'sem2'`` keeps every column, ``'sem1'`` drops the 2nd-semester
        curricular-unit columns, ``'registration'`` drops both the 1st- and
        2nd-semester curricular-unit columns.
    random_state : int or None, default 1
        Seed forwarded to ``train_test_split`` so the split is reproducible.
        Pass ``None`` for a different split on every call.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Stratified 80/20 split of the encoded features and target.
    group_idxs : dict
        Feature groups as returned by ``expand_data``.

    Raises
    ------
    ValueError
        If ``stage`` is not one of the three supported values.
    """
    unit_kinds = ['credited', 'enrolled', 'evaluations', 'approved',
                  'grade', 'without evaluations']
    sem1_columns = [f'Curricular units 1st sem ({kind})' for kind in unit_kinds]
    sem2_columns = [f'Curricular units 2nd sem ({kind})' for kind in unit_kinds]

    # Columns that are not yet known at each stage.
    hidden_columns = {
        'sem2': [],
        'sem1': sem2_columns,
        'registration': sem1_columns + sem2_columns,
    }
    if stage not in hidden_columns:
        raise ValueError(
            f"unknown stage {stage!r}; expected one of {sorted(hidden_columns)}"
        )

    # drop() always returns a new frame, so even 'sem2' works on a copy.
    features = data.drop(columns=hidden_columns[stage])

    expanded_X, group_idxs, target = expand_data(features)
    X = expanded_X.to_numpy(np.float64)
    y = target.to_numpy(np.float64)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, group_idxs




In [None]:
#EDA 

def reorder_data(data):
    """Reorder the columns of ``data`` and plot a Pearson correlation heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in the ordering below.

    Returns
    -------
    pandas.DataFrame
        A frame holding the listed columns in the listed order.
    """
    column_order = [
        'MaritalStatus',
        'Nationality',
        'Displaced',
        'Gender',
        'Age',
        'International',
        'MotherQualification',
        'FatherQualification',
        'MotherOccupation',
        'FatherOccupation',
        'SpecialNeeds',
        'Debtor',
        'TuitionPaid',
        'Scholarship',
        'Unemployment rate',
        'Inflation rate',
        'GDP',
        'ApplicationMode',
        'ApplicationOrder',
        'Course',
        'TimeOfDay',
        'PreviousQualification',
        'PreviousGrade',
        'AdmissionGrade',
        'Curricular units 1st sem (credited)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (evaluations)',
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (grade)',
        'Curricular units 1st sem (without evaluations)',
        'Curricular units 2nd sem (credited)',
        'Curricular units 2nd sem (enrolled)',
        'Curricular units 2nd sem (evaluations)',
        'Curricular units 2nd sem (approved)',
        'Curricular units 2nd sem (grade)',
        'Curricular units 2nd sem (without evaluations)',
        'Target',
    ]
    data = data[column_order]

    # Correlation matrix. numeric_only=True skips non-numeric columns (such as a
    # string-valued Target); without it recent pandas versions raise instead of
    # silently dropping them.
    fig, ax = plt.subplots(figsize=(10, 10))
    corr_matrix = data.corr(method='pearson', numeric_only=True)
    # Fixing both ends of the colour scale centres the diverging palette on 0.
    sn.heatmap(corr_matrix, cmap='PiYG', vmin=-1, vmax=1, ax=ax)
    ax.set_title('Pearson correlation between features')
    return data


    

In [None]:
#from scipy.stats.stats import ModeResult
#Models

#Metrics Calculation
def standard_metrics(model, X_test, y_test,X_train,y_train,coef,best_hyperparams,label):
    """Evaluate a fitted classifier and return its metrics as a one-row DataFrame.

    Also prints the test-set confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test, y_test : test features and labels
    X_train, y_train : training features and labels
    coef : object
        Stored verbatim in the ``feat`` column (callers pass a feature ranking
        or a placeholder).
    best_hyperparams : object
        Stored verbatim in the ``best parameters`` column.
    label : str
        Stored in the ``model_type`` column and used in the printed header.

    Returns
    -------
    pandas.DataFrame
        Single row with columns ``f1`` (per-class F1 on the test set),
        ``f1 mean``, ``train_score``, ``test_score`` (both accuracy), ``feat``,
        ``best parameters`` and ``model_type``.
    """
    y_pred = model.predict(X_test)
    f1 = sk.metrics.f1_score(y_test, y_pred, average=None)
    f1_mean = np.mean(f1)

    # Compute accuracy explicitly rather than through model.score(): a
    # GridSearchCV built with scoring='f1_macro' reports macro-F1 from score(),
    # while a plain classifier reports accuracy, so the score columns of
    # different model families were not comparable (and test_score merely
    # repeated 'f1 mean' for the grid-searched models).
    train_score = sk.metrics.accuracy_score(y_train, model.predict(X_train))
    test_score = sk.metrics.accuracy_score(y_test, y_pred)

    df = pd.DataFrame(
        [[f1, f1_mean, train_score, test_score, coef, best_hyperparams, label]],
        columns=['f1', 'f1 mean', 'train_score', 'test_score', 'feat',
                 'best parameters', 'model_type'],
    )
    print("Confusion matrix for", label)
    print(sk.metrics.confusion_matrix(y_test, y_pred))

    return df

# Logistic Regression (Group/L1/L2/LR) 
def Groupyr_model(l1_ratios,group,X_train,Y_train,alpha = None):
    """Fit a one-vs-rest classifier built on groupyr's ``LogisticSGLCV``.

    ``l1_ratios``, ``group`` and ``alpha`` are forwarded as the ``l1_ratio``,
    ``groups`` and ``alphas`` arguments of ``LogisticSGLCV``; the remaining
    settings (macro-F1 scoring, all cores, seed 1, 1000 iterations, solver
    warnings shown) are fixed. Returns the fitted ``OneVsRestClassifier``.
    """
    sgl_cv = gl.LogisticSGLCV(
        l1_ratio=l1_ratios,
        groups=group,
        alphas=alpha,
        scoring='f1_macro',
        n_jobs=-1,
        random_state=1,
        max_iter=1000,
        suppress_solver_warnings=False,
    )
    model_groupyr = MC.OneVsRestClassifier(sgl_cv)
    model_groupyr.fit(X_train, Y_train)
    return model_groupyr

def SVM_model(X_train,y_train,hyper_params):
    """Grid-search a one-vs-rest ``SVC`` (seed 1) over ``hyper_params``.

    Candidates are compared on macro-F1 using all cores; the fitted
    ``GridSearchCV`` object is returned.
    """
    one_vs_rest_svc = MC.OneVsRestClassifier(SVC(random_state=1))
    search = GridSearchCV(
        estimator=one_vs_rest_svc,
        param_grid=hyper_params,
        scoring='f1_macro',
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search

def Grad_model(X_train,y_train,hyper_params):
    """Grid-search a one-vs-rest ``GradientBoostingClassifier`` (seed 1) over ``hyper_params``.

    Candidates are compared on macro-F1 using all cores; the fitted
    ``GridSearchCV`` object is returned.
    """
    one_vs_rest_gb = MC.OneVsRestClassifier(GradientBoostingClassifier(random_state=1))
    search = GridSearchCV(
        estimator=one_vs_rest_gb,
        scoring='f1_macro',
        param_grid=hyper_params,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search

def MLP_model(X_train,y_train,hyper_params):
    """Grid-search a one-vs-rest ``MLPClassifier`` (seed 1, up to 1500 iterations).

    Uses 6-fold cross-validation, macro-F1 scoring and all cores; the fitted
    ``GridSearchCV`` object is returned.
    """
    one_vs_rest_mlp = MC.OneVsRestClassifier(
        MLPClassifier(random_state=1, max_iter=1500)
    )
    search = GridSearchCV(
        one_vs_rest_mlp,
        hyper_params,
        n_jobs=-1,
        cv=6,
        scoring='f1_macro',
    )
    search.fit(X_train, y_train)
    return search

def RF_model(X_train,y_train,hyper_params):
    """Grid-search a one-vs-rest ``RandomForestClassifier`` (seed 1) over ``hyper_params``.

    Uses 5-fold cross-validation, macro-F1 scoring and all cores; the fitted
    ``GridSearchCV`` object is returned.
    """
    one_vs_rest_rf = MC.OneVsRestClassifier(RandomForestClassifier(random_state=1))
    search = GridSearchCV(
        estimator=one_vs_rest_rf,
        param_grid=hyper_params,
        scoring='f1_macro',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)
    return search

In [None]:
# Main 

def _rank_features(fitted_estimators, attribute):
    """Rank feature indices by the spread of ``attribute`` across the given estimators.

    Reads ``attribute`` (e.g. ``coef_`` or ``feature_importances_``) from every
    estimator, takes the per-feature standard deviation across estimators and
    returns the feature indices ordered from largest to smallest spread.
    """
    per_estimator = [getattr(estimator, attribute) for estimator in fitted_estimators]
    return np.argsort(np.std(per_estimator, axis=0))[::-1]


def _selected_alphas(one_vs_rest_model):
    """Collect the ``alpha_`` attribute of each underlying estimator."""
    return [estimator.alpha_ for estimator in one_vs_rest_model.estimators_]


kid_data = import_data()
data = reorder_data(kid_data)
stages = ['registration','sem1','sem2']
metric_frames = []            # one single-row frame per (model, stage)
st_metrics = pd.DataFrame()
empty = "NA"                  # placeholder for models without a feature ranking

# Hyper-parameter grids (identical for every stage).
SVM_hyper_params = [ {'estimator__gamma': [0.01,0.001,0.0001],
                      'estimator__C': [1, 10, 100],
                      'estimator__kernel': ['rbf','linear']}]

BG_hyper_params = {
    'estimator__max_depth': [3,5,7],
    'estimator__max_features': np.arange(0.1,1,0.1)}

MLP_hyper_params = {
    'estimator__hidden_layer_sizes': [(300,100,100),(200,100)],
    'estimator__activation': ['tanh', 'relu','logistic'],
    'estimator__solver': ['sgd', 'adam'],
    'estimator__alpha': [0.1, 0.05]
    }

for stage in stages:
    X_train,X_test,y_train,y_test, group_idxs = stage_data(data,stage)

    # Logistic regression variants (LR, L1, L2, Group)
    # NOTE(review): with group=None the effect of l1_ratios=0 depends on
    # groupyr's default grouping - confirm it matches the intended "L2" baseline.
    LR = Groupyr_model(l1_ratios=1,group=None,X_train=X_train,Y_train=y_train,alpha = [0])
    L1 = Groupyr_model(l1_ratios=1,group=None,X_train=X_train,Y_train=y_train)
    L2 = Groupyr_model(l1_ratios=0,group=None,X_train=X_train,Y_train=y_train)
    Group = Groupyr_model(l1_ratios=0,group=list(group_idxs.values()),X_train=X_train,Y_train=y_train)
    print("LR Done")

    # SVM
    SVM = SVM_model(X_train,y_train,SVM_hyper_params)
    print("SVM Done")

    # Gradient boosting
    GB = Grad_model(X_train,y_train,BG_hyper_params)
    print("GB Done")

    # MLP
    MLP = MLP_model(X_train,y_train,MLP_hyper_params)
    print("MLP Done")

    # A random forest (RF_model) was tried but is not part of the comparison.

    # (model, feature ranking, selected hyper-parameters, label), in report order.
    stage_results = [
        (LR, _rank_features(LR.estimators_, "coef_"), 0,
         f'LR-Stage {stage}'),
        (L1, _rank_features(L1.estimators_, "coef_"), _selected_alphas(L1),
         f'L1-Stage {stage}'),
        (L2, _rank_features(L2.estimators_, "coef_"), _selected_alphas(L2),
         f'L2-Stage {stage}'),
        (Group, _rank_features(Group.estimators_, "coef_"), _selected_alphas(Group),
         f'Group Lasso-Stage {stage}'),
        (SVM, empty, SVM.best_params_,
         f'SVM-Stage {stage}'),
        (GB, _rank_features(GB.best_estimator_.estimators_, "feature_importances_"),
         GB.best_params_, f'GB-Stage {stage}'),
        (MLP, empty, MLP.best_params_,
         f'MLP-Stage {stage}'),
    ]
    for fitted_model, feature_rank, chosen_params, model_label in stage_results:
        metric_frames.append(
            standard_metrics(fitted_model, X_test, y_test, X_train, y_train,
                             feature_rank, chosen_params, label=model_label)
        )

    # Concatenate once per stage; ignore_index gives every row a unique label
    # instead of the repeated 0 each single-row frame carries.
    st_metrics = pd.concat(metric_frames, ignore_index=True)
    print("Metrics Updated")

    # Checkpoint: each stage file holds all metrics gathered so far.
    st_metrics.to_csv(f'finalproject_output_{stage}.csv')

In [None]:
# Display the metrics table accumulated by the main loop.
st_metrics