In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('data/Admission_Predict.csv')
data.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
341,342,326,110,3,3.5,3.5,8.76,1,1
116,117,299,102,3,4.0,3.5,8.62,0,0
259,260,331,119,4,5.0,4.5,9.34,1,1
109,110,304,103,5,5.0,4.0,8.64,0,0
287,288,324,114,5,5.0,4.5,9.08,1,1


### Droping the Useless Columns

In [3]:
data.drop(columns='Serial No.', axis=1, inplace=True)
data.sample(5)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
377,290,100,1,1.5,2.0,7.56,0,0
232,312,107,2,2.5,3.5,8.27,0,0
116,299,102,3,4.0,3.5,8.62,0,0
268,327,113,4,4.5,5.0,9.14,0,1
66,327,114,3,3.0,3.0,9.02,0,0


### Split into IV, DV and Scale Numeric Data

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    int64  
 1   TOEFL Score        400 non-null    int64  
 2   University Rating  400 non-null    int64  
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    int64  
 7   Chance of Admit    400 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 25.1 KB


In [5]:
df_dv = data['Chance of Admit']
df_iv = data.drop(columns='Chance of Admit', axis=1)
df_iv.sample(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
314,305,105,2,3.0,4.0,8.13,0
165,322,110,5,4.5,4.0,8.97,0


In [6]:
type(df_dv)

pandas.core.series.Series

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
sscaler = StandardScaler()

In [11]:
df_iv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    int64  
 1   TOEFL Score        400 non-null    int64  
 2   University Rating  400 non-null    int64  
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    int64  
dtypes: float64(3), int64(4)
memory usage: 22.0 KB


In [12]:
df_iv['Research'] = df_iv['Research'].astype('object')
df_iv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    int64  
 1   TOEFL Score        400 non-null    int64  
 2   University Rating  400 non-null    int64  
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    object 
dtypes: float64(3), int64(3), object(1)
memory usage: 22.0+ KB


In [13]:
df_iv_numeric = df_iv.select_dtypes(include=np.number)
df_iv_categorical = df_iv.select_dtypes(exclude=np.number)

In [14]:
df_iv_numeric_scaled = sscaler.fit_transform(df_iv_numeric)
df_iv_numeric_scaled = pd.DataFrame(data=df_iv_numeric_scaled, columns=df_iv_numeric.columns)
df_iv_numeric_scaled

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA
0,1.762107,1.746971,0.798829,1.093864,1.167321,1.764818
1,0.627656,-0.067635,0.798829,0.596653,1.167321,0.455151
2,-0.070467,-0.562528,-0.076600,-0.397769,0.052933,-1.005631
3,0.453126,0.427257,-0.076600,0.099442,-1.061454,0.119339
4,-0.244998,-0.727492,-0.952029,-1.392191,-0.504260,-0.653029
...,...,...,...,...,...,...
395,0.627656,0.427257,-0.076600,0.099442,0.052933,0.740592
396,0.714922,-0.067635,-0.076600,-0.397769,0.052933,0.858126
397,1.151249,1.417042,0.798829,1.591075,1.167321,1.429006
398,-0.419528,-0.727492,-0.076600,0.099442,0.610127,0.304036


In [15]:
df_iv_scaled = pd.concat([df_iv_numeric_scaled, df_iv_categorical], axis=1)
df_iv_scaled.sample(5)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
171,1.50031,1.582006,1.674257,0.596653,1.167321,0.790963,1
299,-1.030386,0.757186,-0.0766,-0.397769,0.052933,0.085758,0
160,-0.157732,-0.727492,-1.827457,-1.889402,-1.618648,-1.2407,0
370,-0.594059,-0.727492,-0.952029,-0.89498,-1.061454,-0.602657,0
10,0.714922,-0.232599,-0.0766,0.099442,0.610127,-0.334007,1


In [16]:
df_iv_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    float64
 1   TOEFL Score        400 non-null    float64
 2   University Rating  400 non-null    float64
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    object 
dtypes: float64(6), object(1)
memory usage: 22.0+ KB


### Converting back IV Categorical Data into numeric
This shall ensure that there are not errors which computing VIF, Models etc

In [17]:
df_iv_scaled['Research'] = df_iv_scaled['Research'].astype(np.int64)
df_iv_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          400 non-null    float64
 1   TOEFL Score        400 non-null    float64
 2   University Rating  400 non-null    float64
 3   SOP                400 non-null    float64
 4   LOR                400 non-null    float64
 5   CGPA               400 non-null    float64
 6   Research           400 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 22.0 KB


### Checking for Multicollinearity of Data

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [19]:
df_vif = pd.DataFrame()
# Since variance_inflation_factor takes ndarray, inputs is passed as DataFrame.values instead of DataFrame
df_vif['VIF'] = [vif(exog=df_iv_scaled.values, exog_idx=i ) for i in range(df_iv_scaled.shape[1])]
df_vif['feature'] = df_iv_scaled.columns
df_vif.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,feature
5,5.205309,CGPA
0,4.358514,GRE Score
1,4.282118,TOEFL Score
3,3.063188,SOP
2,2.918556,University Rating
4,2.430409,LOR
6,1.189484,Research


In [20]:
print(f'''The VIF for the first feature is \n{df_vif[df_vif['VIF']>5]} > 5.
So Multicollinearity exists but it should be fine since its just above 5.
So we will not drop any column''')

The VIF for the first feature is 
        VIF feature
5  5.205309    CGPA > 5.
So Multicollinearity exists but it should be fine since its just above 5.
So we will not drop any column


### Building the model using OLS

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Building using the OLS
import statsmodels.api as sm

In [23]:
df_iv_scaled_const = sm.add_constant(df_iv_scaled)

In [24]:
df_iv_scaled_const.sample(2)

Unnamed: 0,const,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
326,1.0,-1.553979,-1.222385,-0.0766,-1.392191,-1.618648,-0.97205,0
368,1.0,-1.641244,-2.542098,-1.827457,-1.392191,-1.618648,-1.207118,0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(df_iv_scaled_const, df_dv, test_size=0.2, random_state=10)

In [26]:
model_stat = sm.Logit(endog=y_train, exog=X_train).fit()
model_stat.summary()

Optimization terminated successfully.
         Current function value: 0.241326
         Iterations 8


0,1,2,3
Dep. Variable:,Chance of Admit,No. Observations:,320.0
Model:,Logit,Df Residuals:,312.0
Method:,MLE,Df Model:,7.0
Date:,"Thu, 14 Oct 2021",Pseudo R-squ.:,0.6486
Time:,10:52:55,Log-Likelihood:,-77.224
converged:,True,LL-Null:,-219.78
Covariance Type:,nonrobust,LLR p-value:,9.137e-58

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.7119,0.330,-2.157,0.031,-1.359,-0.065
GRE Score,0.6095,0.447,1.365,0.172,-0.266,1.485
TOEFL Score,0.1989,0.403,0.493,0.622,-0.592,0.990
University Rating,0.5883,0.383,1.535,0.125,-0.163,1.339
SOP,0.1768,0.374,0.473,0.636,-0.555,0.909
LOR,0.5118,0.308,1.662,0.096,-0.092,1.115
CGPA,2.6273,0.544,4.832,0.000,1.562,3.693
Research,0.5819,0.465,1.251,0.211,-0.329,1.493


### Backward Elemination Strategy

In [29]:
X_train1 = X_train.copy()

while(len(X_train1.columns) > 0):
    model_stat = sm.Logit(endog=y_train, exog=X_train1).fit()
    f = model_stat.pvalues[1:].idxmax()
    print(f)
    if model_stat.pvalues[1:].max() > 0.05:
        X_train1 = X_train1.drop(f, axis=1)
    else:
        break
        
print(f'The final features through backward eliminations are: {X_train1.columns}')

Optimization terminated successfully.
         Current function value: 0.241326
         Iterations 8
SOP
Optimization terminated successfully.
         Current function value: 0.241676
         Iterations 8
TOEFL Score
Optimization terminated successfully.
         Current function value: 0.242082
         Iterations 8
Research
Optimization terminated successfully.
         Current function value: 0.244507
         Iterations 8
University Rating
The final features through backward eliminations are: Index(['const', 'GRE Score', 'University Rating', 'LOR', 'CGPA'], dtype='object')


In [30]:
model_stat.summary()

0,1,2,3
Dep. Variable:,Chance of Admit,No. Observations:,320.0
Model:,Logit,Df Residuals:,315.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 14 Oct 2021",Pseudo R-squ.:,0.644
Time:,12:09:54,Log-Likelihood:,-78.242
converged:,True,LL-Null:,-219.78
Covariance Type:,nonrobust,LLR p-value:,4.8499999999999995e-60

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.4097,0.208,-1.968,0.049,-0.818,-0.002
GRE Score,0.9266,0.380,2.438,0.015,0.182,1.671
University Rating,0.7219,0.347,2.078,0.038,0.041,1.403
LOR,0.5593,0.266,2.100,0.036,0.037,1.081
CGPA,2.6964,0.520,5.183,0.000,1.677,3.716


In [31]:
model_stat.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.644
Dependent Variable:,Chance of Admit,AIC:,166.4843
Date:,2021-10-14 12:10,BIC:,185.3259
No. Observations:,320,Log-Likelihood:,-78.242
Df Model:,4,LL-Null:,-219.78
Df Residuals:,315,LLR p-value:,4.8501999999999997e-60
Converged:,1.0000,Scale:,1.0
No. Iterations:,8.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.4097,0.2082,-1.9676,0.0491,-0.8178,-0.0016
GRE Score,0.9266,0.3801,2.4381,0.0148,0.1817,1.6715
University Rating,0.7219,0.3473,2.0784,0.0377,0.0411,1.4027
LOR,0.5593,0.2663,2.1003,0.0357,0.0374,1.0812
CGPA,2.6964,0.5203,5.1825,0.0000,1.6767,3.7162


In [None]:
print('''
One unit increase in GRE Score, will increase the logit by 0.966 times, by keeping all the other inputs constant''')

In [32]:
model_stat.params

const               -0.409713
GRE Score            0.926597
University Rating    0.721899
LOR                  0.559261
CGPA                 2.696416
dtype: float64

In [33]:
np.exp(model_stat.params)

const                 0.663841
GRE Score             2.525898
University Rating     2.058337
LOR                   1.749379
CGPA                 14.826494
dtype: float64

In [37]:
print('''
One unit increase in GRE Score, will increase the odd of getting admission by 2.526 times, by keeping all the other inputs constant
One unit increase in CGPA Score, will increase the odd of getting admission by 14.826 times, by keeping all the other inputs constant''')


One unit increase in GRE Score, will increase the odd of getting admission by 2.526 times, by keeping all the other inputs constant
One unit increase in CGPA Score, will increase the odd of getting admission by 14.826 times, by keeping all the other inputs constant


In [36]:
model_stat.pvalues[1:]

GRE Score            1.476580e-02
University Rating    3.767710e-02
LOR                  3.570475e-02
CGPA                 2.188791e-07
dtype: float64

### Building the model using sklearn

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
model_logistic = LogisticRegression()
model_logistic.fit(X_train, y_train)

LogisticRegression()

In [40]:
y_pred = model_logistic.predict(X_test)

In [41]:
y_pred

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0], dtype=int64)

In [42]:
# IF we want to understand the Probability values foreach of the prediction
model_logistic.predict_proba(X_test)

array([[9.33980319e-01, 6.60196807e-02],
       [2.05016563e-01, 7.94983437e-01],
       [9.78363353e-01, 2.16366472e-02],
       [9.98745909e-01, 1.25409079e-03],
       [9.97096908e-01, 2.90309160e-03],
       [9.50318740e-01, 4.96812601e-02],
       [1.21617181e-03, 9.98783828e-01],
       [3.55456813e-03, 9.96445432e-01],
       [5.83694729e-03, 9.94163053e-01],
       [7.59486785e-01, 2.40513215e-01],
       [1.19920537e-01, 8.80079463e-01],
       [2.61414895e-02, 9.73858511e-01],
       [6.01569344e-01, 3.98430656e-01],
       [9.94889613e-01, 5.11038674e-03],
       [1.00051667e-03, 9.98999483e-01],
       [3.17371297e-03, 9.96826287e-01],
       [9.99212509e-01, 7.87490536e-04],
       [1.55103420e-01, 8.44896580e-01],
       [2.92324577e-02, 9.70767542e-01],
       [1.51821371e-01, 8.48178629e-01],
       [1.32907447e-03, 9.98670926e-01],
       [2.19673632e-01, 7.80326368e-01],
       [1.84933052e-01, 8.15066948e-01],
       [5.18152616e-01, 4.81847384e-01],
       [6.171914

In [43]:
np.round(model_logistic.predict_proba(X_test), 3)

array([[0.934, 0.066],
       [0.205, 0.795],
       [0.978, 0.022],
       [0.999, 0.001],
       [0.997, 0.003],
       [0.95 , 0.05 ],
       [0.001, 0.999],
       [0.004, 0.996],
       [0.006, 0.994],
       [0.759, 0.241],
       [0.12 , 0.88 ],
       [0.026, 0.974],
       [0.602, 0.398],
       [0.995, 0.005],
       [0.001, 0.999],
       [0.003, 0.997],
       [0.999, 0.001],
       [0.155, 0.845],
       [0.029, 0.971],
       [0.152, 0.848],
       [0.001, 0.999],
       [0.22 , 0.78 ],
       [0.185, 0.815],
       [0.518, 0.482],
       [0.617, 0.383],
       [0.693, 0.307],
       [0.982, 0.018],
       [0.002, 0.998],
       [0.976, 0.024],
       [0.551, 0.449],
       [0.084, 0.916],
       [0.744, 0.256],
       [0.39 , 0.61 ],
       [0.16 , 0.84 ],
       [0.976, 0.024],
       [0.02 , 0.98 ],
       [0.999, 0.001],
       [0.979, 0.021],
       [0.299, 0.701],
       [0.117, 0.883],
       [0.003, 0.997],
       [0.044, 0.956],
       [0.199, 0.801],
       [0.9

In [None]:
print('''
The predict Proba is giving an array of two values for each element.
[0.934, 0.066], [0.205, 0.795], ...
[Probability for 0 class, Probability for 1 class]
If Probability for 0 class > Probability for 1 class (based on threshold), then the output will be 0 in Predict()
''')