In [1]:
import math
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from statsmodels.sandbox.regression.predstd import wls_prediction_std


In [2]:
admission = pd.read_csv('Admission_Predict.csv')
admission.head() #https://www.kaggle.com/mohansacharya/graduate-admissions/home

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
admission.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [4]:
admission_change = admission.rename(index=str, columns={"GRE Score": "GRE", "TOEFL Score": "TOEFL", "University Rating": "UniversityRating", "LOR ": "LOR", "Chance of Admit ": "ChanceofAdmit"})

In [5]:
admission_change.drop('Serial No.', axis=1, inplace=True)

In [6]:
admission_change.columns

Index(['GRE', 'TOEFL', 'UniversityRating', 'SOP', 'LOR', 'CGPA', 'Research',
       'ChanceofAdmit'],
      dtype='object')

In [7]:
admission_change.head(7)

Unnamed: 0,GRE,TOEFL,UniversityRating,SOP,LOR,CGPA,Research,ChanceofAdmit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65
5,330,115,5,4.5,3.0,9.34,1,0.9
6,321,109,3,3.0,4.0,8.2,1,0.75


# OLS

In [8]:
linear_formula = 'ChanceofAdmit ~ GRE + TOEFL + UniversityRating + SOP + LOR + CGPA'

lm = smf.ols(formula=linear_formula, data=admission_change).fit()

In [9]:
lm.params

Intercept          -1.413859
GRE                 0.002276
TOEFL               0.002753
UniversityRating    0.006062
SOP                -0.001961
LOR                 0.022749
CGPA                0.119875
dtype: float64

In [10]:
lm.pvalues

Intercept           1.962317e-29
GRE                 9.701384e-05
TOEFL               1.270649e-02
UniversityRating    2.092884e-01
SOP                 7.265309e-01
LOR                 5.861372e-05
CGPA                4.069244e-20
dtype: float64

In [11]:
lm.rsquared

0.7987118644980351

In [12]:
lm.conf_int()

Unnamed: 0,0,1
Intercept,-1.640828,-1.186891
GRE,0.00114,0.003412
TOEFL,0.000591,0.004916
UniversityRating,-0.003415,0.015539
SOP,-0.012979,0.009056
LOR,0.01174,0.033757
CGPA,0.095601,0.144149


In [13]:
correlation_matrix = admission_change.corr()
correlation_matrix

Unnamed: 0,GRE,TOEFL,UniversityRating,SOP,LOR,CGPA,Research,ChanceofAdmit
GRE,1.0,0.835977,0.668976,0.612831,0.557555,0.83306,0.580391,0.80261
TOEFL,0.835977,1.0,0.69559,0.657981,0.567721,0.828417,0.489858,0.791594
UniversityRating,0.668976,0.69559,1.0,0.734523,0.660123,0.746479,0.447783,0.71125
SOP,0.612831,0.657981,0.734523,1.0,0.729593,0.718144,0.444029,0.675732
LOR,0.557555,0.567721,0.660123,0.729593,1.0,0.670211,0.396859,0.669889
CGPA,0.83306,0.828417,0.746479,0.718144,0.670211,1.0,0.521654,0.873289
Research,0.580391,0.489858,0.447783,0.444029,0.396859,0.521654,1.0,0.553202
ChanceofAdmit,0.80261,0.791594,0.71125,0.675732,0.669889,0.873289,0.553202,1.0


# KNN

In [14]:
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
X = admission_change[['GRE', 'TOEFL', 'UniversityRating', 'SOP', 'LOR', 'CGPA', 'Research']]
Y = admission_change['ChanceofAdmit']
knn.fit(X, Y)

score = cross_val_score(knn, X, Y, cv=5)
print('The Unweighted Accuracy is: %0.2f (+/- %0.2f).' % (score.mean(), score.std()))

The Unweighted Accuracy is: 0.65 (+/- 0.07).


In [15]:
knn1 = neighbors.KNeighborsRegressor(n_neighbors=5)
X = admission_change[['GRE', 'TOEFL', 'UniversityRating', 'SOP', 'LOR', 'CGPA', 'Research']]
normalized_X = preprocessing.normalize(X)
Y = admission_change['ChanceofAdmit']
knn1.fit(normalized_X, Y)

score = cross_val_score(knn1, normalized_X, Y, cv=5)
print('The Unweighted Normalized Accuracy is: %0.2f (+/- %0.2f).' % (score.mean(), score.std()))

The Unweighted Normalized Accuracy is: 0.56 (+/- 0.10).


In [18]:
knn1 = neighbors.KNeighborsRegressor(n_neighbors=5, weights="distance")
X = admission_change[['GRE', 'TOEFL', 'UniversityRating', 'SOP', 'LOR', 'CGPA', 'Research']]
normalized_X = preprocessing.normalize(X)
Y = admission_change['ChanceofAdmit']
knn1.fit(normalized_X, Y)

score = cross_val_score(knn1, normalized_X, Y, cv=5)
print('The Weighted Normalized Accuracy is: %0.2f (+/- %0.2f).' % (score.mean(), score.std()))

The Weighted Normalized Accuracy is: 0.56 (+/- 0.10).


# Summary

The weighted and unweight normalized accuracy are the same. The unweighted normalized vs. the non-normalized barely changed. Therefore, the OLS model is better for this modelset. The reason is that the p-values are less than 0.05 in all of these variables. Also, another reason is that those variables are relevant to the chances of students getting admission.