In [1]:
"""
Data source: 
Mohan S Acharya, Asfia Armaan, Aneeta S Antony : 
A Comparison of Regression Models for Prediction of Graduate Admissions, 
IEEE International Conference on Computational Intelligence in Data Science 2019
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Read the graduate-admission records from the CSV export and display the frame.
data_file = "../data/graduate_admission.csv"
data = pd.read_csv(filepath_or_buffer=data_file)
data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [33]:
# Separate the data into training and validation sets
train_full = data.iloc[:400, :]
test_full = data.iloc[400:499, :]

# The last column, chance of admit, is how the accuracy of a model will be measured
train_X = train_full.iloc[:, :8]
train_y = train_full.iloc[:, 8]

test_X = test_full.iloc[:, :8]
test_y = test_full.iloc[:, 8]

In [34]:
# Baseline: a random predictor that assigns a probability of 0.5 to each data point
rand_train_mse = mean_squared_error(train_y.values, np.full(400, 0.5))
print(f"Random training MSE = {rand_train_mse}")

rand_test_mse = mean_squared_error(test_y.values, np.full(99, 0.5))
print(f"Random test MSE = {rand_test_mse}")

Random training MSE = 0.0706195
Random test MSE = 0.06223535353535354


In [35]:
# Binarise the target so logistic regression can be trained as a classifier:
# True where the chance of admit is at least 0.8, False otherwise.
bin_train_y, bin_test_y = train_y.ge(0.8), test_y.ge(0.8)

In [36]:
# Model 1: Logistic Regression
def log_reg(X, y):
    """
    Returns the MSE for the given data X and probabilities y.
    """
    logreg = LogisticRegression(solver="liblinear").fit(X, y)
    pred_proba = logreg.predict_proba(X.iloc[:, :8])

    # Calculate the mean squared error
    mse_train = mean_squared_error(y.values, pred_proba[:, 1])
    return mse_train

In [37]:
print(f"Logistic regression training MSE = {log_reg(train_X, bin_train_y)}")
print(f"Logistic regression test MSE = {log_reg(test_X, bin_test_y)}")

Logistic regression training MSE = 0.07414247350187979
Logistic regression test MSE = 0.0616057535548663


In [43]:
# Model 2: Linear Regression
def linreg(X, y):
    linreg = LinearRegression().fit(X, y)
    pred = linreg.predict(X)
    mse = mean_squared_error(y.values, pred)
    return mse

In [44]:
print(f"Linear regression training MSE = {linreg(train_X, train_y)}")
print(f"Linear regression test MSE = {linreg(test_X, test_y)}")

Linear regression training MSE = 0.0036757124486692895
Linear regression test MSE = 0.001527311672797277
