In [115]:
"""
Data source: 
Mohan S Acharya, Asfia Armaan, Aneeta S Antony : 
A Comparison of Regression Models for Prediction of Graduate Admissions, 
IEEE International Conference on Computational Intelligence in Data Science 2019
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

In [80]:
data_file = "../data/graduate_admission.csv"
data = pd.read_csv(data_file)
data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [127]:
# Separate the data into training and validation sets
train_full = data.iloc[:400, :]
test_full = data.iloc[400:499, :]

# The last column, chance of admit, is how the accuracy of a model will be measured
train_X = train_full.iloc[:, :8]
train_y = train_full.iloc[:, 8]

test_X = test_full.iloc[:, :8]
test_y = test_full.iloc[:, 8]

In [128]:
# Threshold the admission probability
bin_train_y = train_y >= 0.8
bin_test_y = test_y >= 0.8

In [129]:
# Model 1: Logistic Regression
clf = LogisticRegression(solver="liblinear").fit(train_X, bin_train_y)
train_pred_proba = clf.predict_proba(train_X.iloc[:, :8])
pd.DataFrame({"Training predicted chance of admit" : train_pred_proba[:, 1], "Chance of admit" : train_y.values})

Unnamed: 0,Training predicted chance of admit,Chance of admit
0,0.857341,0.92
1,0.331987,0.76
2,0.025308,0.72
3,0.181544,0.80
4,0.001215,0.65
...,...,...
395,0.528686,0.82
396,0.325836,0.84
397,0.956096,0.91
398,0.079006,0.67


In [130]:
# Calculate the mean squared error on the training data
mse_train = mean_squared_error(train_y.values, train_pred_proba[:, 1])
mse_train

0.23008822802117387

In [131]:
# Compare with random predictor that assigns 0.5 to each data point
mse_rand_train = mean_squared_error(train_y.values, np.full(400, 0.5))
mse_rand_train

0.0706195

In [133]:
# Calculate the mean-squared error on the validation data
test_pred_proba = clf.predict_proba(test_X.iloc[:, :8])
pd.DataFrame({"Test predicted chance of admit" : test_pred_proba[:, 1], "Chance of admit" : test_y.values})

Unnamed: 0,Test predicted chance of admit,Chance of admit
0,0.013875,0.63
1,0.007829,0.66
2,0.398709,0.78
3,0.888340,0.91
4,0.018790,0.62
...,...,...
94,0.422916,0.68
95,0.674740,0.87
96,0.988049,0.96
97,0.995106,0.93


In [136]:
mse_test = mean_squared_error(test_y.values, test_pred_proba[:, 1])
mse_test

0.19658635925399598

In [138]:
mse_rand_test = mean_squared_error(test_y.values, np.full(99, 0.5))
mse_rand_test

0.06223535353535354