In [1]:
"""
Data source: 
Mohan S Acharya, Asfia Armaan, Aneeta S Antony : 
A Comparison of Regression Models for Prediction of Graduate Admissions, 
IEEE International Conference on Computational Intelligence in Data Science 2019
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
data_file = "../data/graduate_admission.csv"
data = pd.read_csv(data_file)
data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


Serial No. will not be used as a feature.

In [44]:
# Separate the data into training and validation sets
train_full = data.iloc[:400, :]
test_full = data.iloc[400:500, :]

# The last column, chance of admit, is how the accuracy of a model will be measured
train_X = train_full.iloc[:, 1:8]
train_y = train_full.iloc[:, 8]

test_X = test_full.iloc[:, 1:8]
test_y = test_full.iloc[:, 8]

In [55]:
# Standardized version of data
std_train_X = (train_X - train_X.mean()) / train_X.std()
std_test_X = (test_X - test_X.mean()) / test_X.std()

In [47]:
# Baseline: a random predictor that assigns a probability of 0.5 to each data point
rand_train_mse = mean_squared_error(train_y.values, np.full(400, 0.5))
print(f"Random training MSE = {rand_train_mse}")
rand_test_mse = mean_squared_error(test_y.values, np.full(100, 0.5))
print(f"Random test MSE = {rand_test_mse}")

Random training MSE = 0.0706195
Random test MSE = 0.062769


In [48]:
# Threshold the admission probability to allow for classification via logistic regression
bin_train_y = train_y >= 0.8
bin_test_y = test_y >= 0.8

In [53]:
# Model 1: Logistic Regression
logreg = LogisticRegression(solver="liblinear").fit(train_X, bin_train_y)

 # Training
logreg_train_pred = logreg.predict_proba(train_X.iloc[:, :8])
logreg_mse_train = mean_squared_error(bin_train_y.values, logreg_train_pred[:, 1])

# Test
logreg_test_pred = logreg.predict_proba(test_X.iloc[:, :8])
logreg_mse_test = mean_squared_error(bin_test_y.values, logreg_test_pred[:, 1])

print(f"Logistic regression training MSE = {logreg_mse_train}")
print(f"Logistic regression test MSE = {logreg_mse_test}")

Logistic regression training MSE = 0.0752213477572983
Logistic regression test MSE = 0.06584125159925515


In [54]:
# Model 2: Linear Regression
linreg = LinearRegression().fit(train_X, train_y)

# Training
linreg_train_pred = linreg.predict(train_X)
linreg_mse_train = mean_squared_error(train_y.values, linreg_train_pred)

# Test
linreg_test_pred = linreg.predict(test_X)
linreg_mse_test = mean_squared_error(test_y.values, linreg_test_pred)

print(f"Linear regression training MSE = {linreg_mse_train}")
print(f"Linear regression test MSE = {linreg_mse_test}")

Linear regression training MSE = 0.003986893243246914
Linear regression test MSE = 0.001843174437604737


In [26]:
linreg_test_pred

array([0.63343077, 0.68437615, 0.81184602, 0.8987142 , 0.61151514,
       0.55455327, 0.68788483, 0.6499213 , 0.62243064, 0.57954208,
       0.55810379, 0.58739962, 0.64520655, 0.64466331, 0.78023904,
       0.8294011 , 0.65592801, 0.59916082, 0.68722836, 0.66074331,
       0.55201948, 0.85383122, 0.82892113, 0.99392554, 0.95100936,
       0.97864334, 0.7627216 , 0.77372283, 0.75743758, 0.92968493,
       0.70003556, 0.80545242, 0.88757672, 0.79516911, 0.66122397,
       0.63098933, 0.61946735, 0.65094773, 0.76144937, 0.69751974,
       0.57864803, 0.79624571, 0.95042567, 0.90453206, 0.92660186,
       0.92715275, 0.98626795, 0.8543698 , 0.7942855 , 0.80162006,
       0.85596046, 0.90821327, 0.94198217, 0.79136687, 0.65984484,
       0.57931628, 0.57586991, 0.51295516, 0.70524805, 0.90847896,
       0.80901912, 0.6471653 , 0.65515293, 0.64012042, 0.51528618,
       0.68489411, 0.78620808, 0.82574615, 0.8698346 , 0.88729763,
       0.89307043, 0.68486577, 0.95217127, 0.67746043, 0.67352

Note that linear regression gives some invalid probabilities (greater than 1).
This could be interpreted as saying that the particular candidate has a 100% chance of
being admitted.

In [60]:
# Model 3: Linear Regression with Standardization
linreg_std = LinearRegression().fit(std_train_X, train_y)

# Training
linreg_std_train_pred = linreg.predict(std_train_X)
linreg_std_mse_train = mean_squared_error(train_y.values, linreg_std_train_pred)

# Test
linreg_std_test_pred = linreg.predict(std_test_X)
linreg_std_mse_test = mean_squared_error(test_y.values, linreg_std_test_pred)

print(f"Standardized Linear regression training MSE = {linreg_std_mse_train}")
print(f"Standardized Linear regression test MSE = {linreg_std_mse_test}")

Standardized Linear regression training MSE = 3.940436523715328
Standardized Linear regression test MSE = 3.886539845330111


In [61]:
linreg_std_test_pred

array([-1.36519785, -1.33663024, -1.1671994 , -1.09418227, -1.4231102 ,
       -1.50215263, -1.34748229, -1.33643848, -1.38696219, -1.4149839 ,
       -1.47299948, -1.41274706, -1.38475132, -1.38025365, -1.25122595,
       -1.16781683, -1.39174236, -1.48015193, -1.36933672, -1.34233982,
       -1.50601627, -1.12975862, -1.15699057, -0.99777254, -1.01490691,
       -0.94149198, -1.24464211, -1.18409734, -1.22582922, -1.10165595,
       -1.31485132, -1.19055878, -1.08982442, -1.24579821, -1.36030182,
       -1.43800375, -1.47141366, -1.40417194, -1.23838226, -1.30919549,
       -1.47317959, -1.2199081 , -1.03445212, -1.08469199, -1.05502482,
       -1.08666882, -0.98177232, -1.11893437, -1.1837155 , -1.15401913,
       -1.14183599, -1.07276356, -1.02855017, -1.1792723 , -1.3885439 ,
       -1.47686639, -1.44644163, -1.52179935, -1.25878796, -1.07135219,
       -1.18653749, -1.34491659, -1.40313007, -1.42450059, -1.55209894,
       -1.31480583, -1.17529518, -1.14832823, -1.13118176, -1.10

It appears that standardizing the data has drastically worsened the accuracy of the predictions. In fact, these are not valid probabilities because they are negative.