In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('data/Admission_Predict.csv')
data.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
202,203,340,120,5,4.5,4.5,9.91,1,1
237,238,329,114,5,4.5,5.0,9.19,1,1
341,342,326,110,3,3.5,3.5,8.76,1,1
367,368,311,98,1,1.0,2.5,7.46,0,0
359,360,321,107,2,2.0,1.5,8.44,0,1


### Droping the Useless Columns

In [3]:
data.drop(columns='Serial No.', axis=1, inplace=True)
data.sample(5)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
225,296,99,2,2.5,2.5,8.03,0,0
317,300,99,1,1.0,2.5,8.01,0,0
287,324,114,5,5.0,4.5,9.08,1,1
387,307,105,2,2.0,3.5,8.1,0,0
193,336,118,5,4.5,5.0,9.53,1,1


### Split into IV, DV and Scale Numeric Data

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    int64  
 1   TOEFL Score        400 non-null    int64  
 2   University Rating  400 non-null    int64  
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    int64  
 7   Chance of Admit    400 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 25.1 KB


In [5]:
df_dv = data['Chance of Admit']
df_iv = data.drop(columns='Chance of Admit', axis=1)
df_iv.sample(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
391,318,106,3,2.0,3.0,8.65,0
231,319,106,3,3.5,2.5,8.33,1


In [7]:
type(df_dv)

pandas.core.series.Series

In [15]:
from sklearn import *

In [18]:
sscaler = sklearn.preprocessing.StandardScaler()

In [20]:
df_iv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    int64  
 1   TOEFL Score        400 non-null    int64  
 2   University Rating  400 non-null    int64  
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    int64  
dtypes: float64(3), int64(4)
memory usage: 22.0 KB


In [21]:
df_iv['Research'] = df_iv['Research'].astype('object')
df_iv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    int64  
 1   TOEFL Score        400 non-null    int64  
 2   University Rating  400 non-null    int64  
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    object 
dtypes: float64(3), int64(3), object(1)
memory usage: 22.0+ KB


In [22]:
df_iv_numeric = df_iv.select_dtypes(include=np.number)
df_iv_categorical = df_iv.select_dtypes(exclude=np.number)

In [27]:
df_iv_numeric_scaled = sscaler.fit_transform(df_iv_numeric)
df_iv_numeric_scaled = pd.DataFrame(data=df_iv_numeric_scaled, columns=df_iv_numeric.columns)
df_iv_numeric_scaled

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA
0,1.762107,1.746971,0.798829,1.093864,1.167321,1.764818
1,0.627656,-0.067635,0.798829,0.596653,1.167321,0.455151
2,-0.070467,-0.562528,-0.076600,-0.397769,0.052933,-1.005631
3,0.453126,0.427257,-0.076600,0.099442,-1.061454,0.119339
4,-0.244998,-0.727492,-0.952029,-1.392191,-0.504260,-0.653029
...,...,...,...,...,...,...
395,0.627656,0.427257,-0.076600,0.099442,0.052933,0.740592
396,0.714922,-0.067635,-0.076600,-0.397769,0.052933,0.858126
397,1.151249,1.417042,0.798829,1.591075,1.167321,1.429006
398,-0.419528,-0.727492,-0.076600,0.099442,0.610127,0.304036


In [28]:
df_iv_scaled = pd.concat([df_iv_numeric_scaled, df_iv_categorical], axis=1)
df_iv_scaled.sample(5)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
78,-1.815775,-2.047205,-0.952029,-0.397769,-1.618648,-1.777999,1
375,-1.117652,-1.05742,-0.952029,-1.392191,-1.061454,-1.576512,0
164,1.063983,0.592221,0.798829,1.093864,0.610127,0.69022,1
365,1.151249,1.087114,0.798829,1.093864,-0.50426,0.958869,1
353,-1.466713,-0.892456,-0.0766,0.099442,-1.061454,-0.720191,0


In [29]:
df_iv_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    float64
 1   TOEFL Score        400 non-null    float64
 2   University Rating  400 non-null    float64
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    object 
dtypes: float64(6), object(1)
memory usage: 22.0+ KB


### Converting back IV Categorical Data into numeric
This shall ensure that there are not errors which computing VIF, Models etc

In [47]:
df_iv_scaled['Research'] = df_iv_scaled['Research'].astype(np.int64)
df_iv_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    float64
 1   TOEFL Score        400 non-null    float64
 2   University Rating  400 non-null    float64
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 22.0 KB


### Checking for Multicollinearity of Data

In [35]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [50]:
df_vif = pd.DataFrame()
# Since variance_inflation_factor takes ndarray, inputs is passed as DataFrame.values instead of DataFrame
df_vif['VIF'] = [vif(exog=df_iv_scaled.values, exog_idx=i ) for i in range(df_iv_scaled.shape[1])]
df_vif['feature'] = df_iv_scaled.columns
df_vif.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,feature
5,5.205309,CGPA
0,4.358514,GRE Score
1,4.282118,TOEFL Score
3,3.063188,SOP
2,2.918556,University Rating
4,2.430409,LOR
6,1.189484,Research


In [55]:
print(f'''The VIF for the first feature is \n{df_vif[df_vif['VIF']>5]} > 5.
So Multicollinearity exists but it should be fine since its just above 5.
So we will not drop any column''')

The VIF for the first feature is 
        VIF feature
5  5.205309    CGPA > 5.
So Multicollinearity exists but it should be fine since its just above 5.
So we will not drop any column


### Building the model

In [56]:
from sklearn.model_selection import train_test_split

In [60]:
# Building using the OLS
import statsmodels.api as sm

In [61]:
df_iv_scaled_const = sm.add_constant(df_iv_scaled)

In [62]:
df_iv_scaled_const.sample(2)

Unnamed: 0,const,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
185,1.0,0.889453,0.92215,0.798829,1.093864,1.167321,0.858126,1
30,1.0,-1.466713,-1.717277,-0.952029,-0.397769,-0.50426,-0.837725,1


In [63]:
X_train, X_test, y_train, y_test = train_test_split(df_iv_scaled_const, df_dv, test_size=0.2, random_state=10)

In [68]:
model_stat = sm.Logit(endog=y_train, exog=X_train).fit()
model_stat.summary()

Optimization terminated successfully.
         Current function value: 0.241326
         Iterations 8


0,1,2,3
Dep. Variable:,Chance of Admit,No. Observations:,320.0
Model:,Logit,Df Residuals:,312.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 13 Oct 2021",Pseudo R-squ.:,0.6486
Time:,14:01:08,Log-Likelihood:,-77.224
converged:,True,LL-Null:,-219.78
Covariance Type:,nonrobust,LLR p-value:,9.137e-58

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.7119,0.330,-2.157,0.031,-1.359,-0.065
GRE Score,0.6095,0.447,1.365,0.172,-0.266,1.485
TOEFL Score,0.1989,0.403,0.493,0.622,-0.592,0.990
University Rating,0.5883,0.383,1.535,0.125,-0.163,1.339
SOP,0.1768,0.374,0.473,0.636,-0.555,0.909
LOR,0.5118,0.308,1.662,0.096,-0.092,1.115
CGPA,2.6273,0.544,4.832,0.000,1.562,3.693
Research,0.5819,0.465,1.251,0.211,-0.329,1.493


In [64]:
y_train

303    0
349    0
149    1
100    0
175    1
      ..
369    0
320    1
15     0
125    0
265    0
Name: Chance of Admit, Length: 320, dtype: int64

In [65]:
X_train

Unnamed: 0,const,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
303,1.0,0.540391,-0.067635,-0.076600,0.099442,0.052933,-0.082148,1
349,1.0,-0.332263,-1.057420,-0.076600,-0.894980,-0.504260,-0.938469,0
149,1.0,-0.506794,-0.232599,-0.952029,0.099442,-0.504260,-0.569076,1
100,1.0,0.453126,-0.067635,-0.076600,0.099442,0.052933,-0.233263,1
175,1.0,0.278595,0.592221,0.798829,1.093864,0.052933,0.455151,1
...,...,...,...,...,...,...,...,...
369,1.0,-1.379448,-1.552313,-1.827457,-1.392191,-0.504260,-0.955259,1
320,1.0,0.016799,-0.232599,-0.076600,0.596653,0.052933,-0.166101,1
15,1.0,-0.244998,-0.397564,-0.076600,0.099442,-1.061454,-0.501913,0
125,1.0,-1.466713,-1.222385,-0.076600,-1.392191,-0.504260,0.102549,1
