In [1]:
import kaggle
import pandas as pd
import numpy as np
import zipfile
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import metrics 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
width = 20
height = 8
sns.set(rc = {'figure.figsize':(width, height)})



In [2]:
df_train = pd.read_csv('df_clean.csv')

In [3]:
exog = ['First_choice', 'Single', 'Application_mode_1st_phase', 'Application_mode_2nd_phase', 'Application_mode_Over_23 years_old',
        'Application_mode_Tech_Spec', 'Application_mode_Change', 'Quali_Secondary education', 'Quali_Basic education', 
        'Quali_Tech Spec course', 'Quali_Higher education', 'Quali_Other', 'Quali_12th year of schooling', 'Quali_Higher education',
        'Quali_Professional higher technical course', 'M_Quali_Secondary education', 'M_Quali_Basic education_3rd', 'M_Quali_Basic education_1st',
        'M_Quali_Basic education_2nd', 'M_Quali_Higher Education', 'M_Quali_Secondary education', 'M_Quali_Basic education_3rd',
        'M_Quali_Basic education_1st', 'M_Quali_Basic education_2nd', 'M_Quali_Higher Education', 'F_Quali_Secondary education',
        'F_Quali_Basic education_3rd', 'F_Quali_Basic education_1st', 'F_Quali_Basic education_2nd', 'F_Quali_Higher Education',
        'Daytime/evening attendance', 'Previous qualification (grade)', 'Admission grade', 'Debtor', 'Tuition fees up to date',
        'Gender', 'Scholarship holder', 'Age at enrollment', 'Curricular units 2nd sem (approved)', 'Curricular units 1st sem (approved)',
        'Curricular units 2nd sem (grade)', 'Curricular units 1st sem (grade)']

## Logit Model

In [4]:
X_train = df_train[exog]
X_train = sm.add_constant(X_train)

Y_train_g = df_train['Target_Graduate']
Y_train_e = df_train['Target_Enrolled']
Y_train_d = df_train['Target_Dropout']

X_train = X_train.astype(float)

Y_train_g = Y_train_g.astype(float)
Y_train_e = Y_train_e.astype(float)
Y_train_d = Y_train_d.astype(float)

model_g = sm.Logit(Y_train_g, X_train, missing='drop').fit_regularized()
model_e = sm.Logit(Y_train_e, X_train, missing='drop').fit_regularized()
model_d = sm.Logit(Y_train_d, X_train, missing='drop').fit_regularized()

train_pred = pd.DataFrame()
train_pred['Graduate'] = model_g.predict(X_train)
train_pred['Enrolled'] = model_e.predict(X_train)
train_pred['Dropout'] = model_d.predict(X_train)

# print(model.summary())
# np.round(np.mean((model.predict(X_train)>0.5) == df['Target_Enrolled']),4)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.33566021230443616
            Iterations: 266
            Function evaluations: 268
            Gradient evaluations: 266
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.40899847348800433
            Iterations: 264
            Function evaluations: 267
            Gradient evaluations: 264
Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.25758092864441823
            Iterations: 270
            Function evaluations: 273
            Gradient evaluations: 270


In [5]:
def classify(x):
    if x['Graduate'] > x['Enrolled'] and x['Graduate'] > x['Dropout']:
        return 'Graduate'
    if x['Enrolled'] > x['Graduate'] and x['Enrolled'] > x['Dropout']:
        return 'Enrolled'
    if x['Dropout'] > x['Enrolled'] and x['Dropout'] > x['Graduate']:
        return 'Dropout'
    else:
        return 'Graduate'
    

def target_to_num(x):
    if x['Target'] == 'Graduate':
        return 1
    if x['Target'] == 'Dropout':
        return -1
    if x['Target'] == 'Enrolled':
        return 0
    
def num_to_target(x):
    if x['Target_num'] == 1:
        return 'Graduate'
    if x['Target_num'] == -1:
        return 'Dropout'
    if x['Target_num'] == 0:
        return 'Enrolled'
    
df_train['Target_num'] = df_train.apply(lambda x: target_to_num(x), axis=1)

In [6]:
train_pred['Target'] = train_pred.apply(lambda x: classify(x), axis=1)
print(np.round(np.mean(train_pred['Target'] == df_train['Target']),4))
print(train_pred.head())

0.7933
   Graduate  Enrolled   Dropout    Target
0  0.915459  0.115757  0.014734  Graduate
1  0.014345  0.190726  0.848248   Dropout
2  0.021586  0.077762  0.892209   Dropout
3  0.964803  0.047911  0.013081  Graduate
4  0.745402  0.229015  0.043680  Graduate


In [7]:
df_test = pd.read_csv('df_clean_test.csv')
X_test = df_test[exog]
X_test = sm.add_constant(X_test)
X_test = X_test.astype(float)

test_pred = pd.DataFrame()
test_pred['Graduate'] = model_g.predict(X_test)
test_pred['Enrolled'] = model_e.predict(X_test)
test_pred['Dropout'] = model_d.predict(X_test)

test_pred['Target'] = test_pred.apply(lambda x: classify(x), axis=1)

submission = pd.DataFrame()
submission['id'] = df_test['id']
submission['Target'] = test_pred['Target']

print(test_pred.head())
submission.to_csv('submission.csv', index=False)

   Graduate  Enrolled   Dropout    Target
0  0.003010  0.011087  0.981896   Dropout
1  0.753243  0.316973  0.032007  Graduate
2  0.809270  0.125882  0.032804  Graduate
3  0.710388  0.133592  0.053738  Graduate
4  0.130881  0.451527  0.245270  Enrolled


##### Score: 0.79543

## Random Forest

In [8]:
X_train = df_train[exog]
Y_train_num = df_train['Target_num']

rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=0, n_jobs=-1)

# param_grid = {"criterion":['gini', 'entropy', 'log_loss'], "min_samples_leaf" : [1, 2, 5], "min_samples_split" : [20], "n_estimators": [100]}

# gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

# gs = gs.fit(X_train, Y_train_num)

# print(gs.best_score_)
# print(gs.best_params_)

In [9]:
clf = RandomForestClassifier(random_state=7, n_estimators=100, min_samples_split=20, min_samples_leaf=2)
clf.fit(X_train, Y_train_num)

y_pred = clf.predict(X_train)

print("ACCURACY OF THE MODEL:", metrics.accuracy_score(Y_train_num, y_pred))

ACCURACY OF THE MODEL: 0.8547977752578433


In [10]:
df_test = pd.read_csv('df_clean_test.csv')
X_test = df_test[exog]

y_pred = clf.predict(X_test)

data = {'id':df_test['id'],
        'Target_num': y_pred.astype(int)}
submission = pd.DataFrame(data)
submission['Target'] = submission.apply(lambda x: num_to_target(x), axis=1)
submission.drop('Target_num', axis=1, inplace=True)
submission.to_csv('submission.csv', index=False)

##### Score: 0.81768

## K-NN Classification

In [11]:
exog = ['First_choice', 'Single', 'Application_mode_1st_phase', 'Application_mode_2nd_phase', 'Application_mode_Over_23 years_old',
        'Application_mode_Tech_Spec', 'Application_mode_Change', 'Quali_Secondary education', 'Quali_Basic education', 
        'Quali_Tech Spec course', 'Quali_Higher education', 'Quali_Other', 'Quali_12th year of schooling', 'Quali_Higher education',
        'Quali_Professional higher technical course', 'M_Quali_Secondary education', 'M_Quali_Basic education_3rd', 'M_Quali_Basic education_1st',
        'M_Quali_Basic education_2nd', 'M_Quali_Higher Education', 'M_Quali_Secondary education', 'M_Quali_Basic education_3rd',
        'M_Quali_Basic education_1st', 'M_Quali_Basic education_2nd', 'M_Quali_Higher Education', 'F_Quali_Secondary education',
        'F_Quali_Basic education_3rd', 'F_Quali_Basic education_1st', 'F_Quali_Basic education_2nd', 'F_Quali_Higher Education',
        'Daytime/evening attendance', 'Previous qualification (grade)', 'Admission grade', 'Debtor', 'Tuition fees up to date',
        'Gender', 'Scholarship holder', 'Age at enrollment', 'Curricular units 2nd sem (approved)', 'Curricular units 1st sem (approved)',
        'Curricular units 2nd sem (grade)', 'Curricular units 1st sem (grade)']

X_train_knn = X_train[exog]
X_test_knn = X_test[exog]

scaler = StandardScaler()
X_train_knn = scaler.fit(X_train_knn).transform(X_train_knn)
X_test_knn = scaler.fit(X_test_knn).transform(X_test_knn)

In [16]:
neigh = KNeighborsClassifier(leaf_size=2, algorithm='auto', n_neighbors=5)
neigh.fit(X_train_knn, Y_train_num)
print(neigh.score(X_train_knn, Y_train_num))
y_pred = neigh.predict(X_test_knn)

0.8334413305254064


In [17]:
data = {'id':df_test['id'],
        'Target_num': y_pred.astype(int)}
submission = pd.DataFrame(data)
submission['Target'] = submission.apply(lambda x: num_to_target(x), axis=1)
submission.drop('Target_num', axis=1, inplace=True)
submission.to_csv('submission.csv', index=False)