In [81]:
# import 'Pandas' 
import pandas as pd 

# import 'Numpy' 
import numpy as np

# import subpackage of Matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# import 'Seaborn' 
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# display all columns of the dataframe
pd.options.display.max_columns = None

# display all rows of the dataframe
pd.options.display.max_rows = None
 
# to display the float values upto 6 decimal places     
pd.options.display.float_format = '{:.6f}'.format

# import train-test split 
from sklearn.model_selection import train_test_split

# import various functions from statsmodels
import statsmodels
import statsmodels.api as sm

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler 

# import various functions from sklearn 
from sklearn import metrics
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import classification_report, recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
import statsmodels.api as sm

# import function to perform feature selection
from sklearn.feature_selection import RFE

In [2]:
df_admissions = pd.read_csv('Admission_predict.csv')
df = df_admissions.drop(['Serial No.'], axis = 1)
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,0
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


In [3]:
df.shape

(400, 8)

In [4]:
df.isnull().sum().sum()

0

In [5]:
df.dtypes

GRE Score              int64
TOEFL Score            int64
University Rating      int64
SOP                  float64
LOR                  float64
CGPA                 float64
Research               int64
Chance of Admit        int64
dtype: object

In [7]:
df['Chance of Admit'].value_counts()

0    220
1    180
Name: Chance of Admit, dtype: int64

In [42]:
x = df.drop('Chance of Admit',axis = 1)
y = df['Chance of Admit']
xc = sm.add_constant(x)

In [46]:
xtrain,xtest,ytrain,ytest = train_test_split(xc,y,test_size=0.2, random_state=1)

In [47]:
# Fit the OLS model
logreg = sm.Logit(ytrain, xtrain).fit()

# Print the summary
logreg.summary()
# Make sure to replace X and y with your actual data.

Optimization terminated successfully.
         Current function value: 0.269228
         Iterations 8


0,1,2,3
Dep. Variable:,Chance of Admit,No. Observations:,320.0
Model:,Logit,Df Residuals:,312.0
Method:,MLE,Df Model:,7.0
Date:,"Mon, 04 Sep 2023",Pseudo R-squ.:,0.6084
Time:,10:41:17,Log-Likelihood:,-86.153
converged:,True,LL-Null:,-220.0
Covariance Type:,nonrobust,LLR p-value:,4.7329999999999997e-54

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-56.2064,9.808,-5.731,0.000,-75.430,-36.983
GRE Score,0.0348,0.035,0.986,0.324,-0.034,0.104
TOEFL Score,0.1039,0.062,1.669,0.095,-0.018,0.226
University Rating,0.2515,0.303,0.831,0.406,-0.342,0.845
SOP,0.4403,0.358,1.228,0.219,-0.262,1.143
LOR,0.2629,0.321,0.820,0.412,-0.365,0.891
CGPA,3.4887,0.804,4.340,0.000,1.913,5.064
Research,0.7388,0.432,1.712,0.087,-0.107,1.585


In [48]:
np.exp(logreg.params)

const                0.000000
GRE Score            1.035379
TOEFL Score          1.109467
University Rating    1.285971
SOP                  1.553118
LOR                  1.300654
CGPA                32.744813
Research             2.093510
dtype: float64

In [49]:
ypred_prob = logreg.predict(xtest)
ypred_prob[0:5]

398   0.335967
125   0.117820
328   0.847480
339   0.726041
172   0.956392
dtype: float64

In [51]:
ypred = [0 if x < 0.5 else 1 for x in ypred_prob]
ypred[0:5]

[0, 0, 1, 1, 1]

In [53]:
ytest[0:5]

398    0
125    0
328    1
339    1
172    1
Name: Chance of Admit, dtype: int64

In [59]:
pd.DataFrame(ytest == ypred).value_counts()

Chance of Admit
True               72
False               8
dtype: int64

In [68]:
cm = confusion_matrix(ytest, ypred)
cm

array([[40,  3],
       [ 5, 32]], dtype=int64)

In [70]:
tn,fp,fn,tp = cm[0][0], cm[0][1], cm[1][0], cm[1][1]

In [74]:
accuracy = (tn+tp)/(tn+fp+fn+tp)
accuracy

0.9

In [75]:
accuracy_score(ytest, ypred)

0.9

In [79]:
specificity = tn/(tn+fp)
print("specificity",specificity)

specificity 0.9302325581395349


In [80]:
sensitivity = tp/(tp+fn)
print("sensitivity",sensitivity)

sensitivity 0.8648648648648649


In [88]:
recall_score(ytest, ypred) #gives sensitivity

0.8648648648648649

In [89]:
# precision negative
precision_n = tn/(tn+fn)
print('precision_n',precision_n)

precision_n 0.8888888888888888


In [92]:
# precision positive
precision_p = tp/(tp+fp)
print('precision_p',precision_p)

precision_p 0.9142857142857143


In [101]:
f1score_n = 2*(precision_n)*(specificity)/(precision_n+specificity)
print("f1score_n",f1score_n.round(3))

f1score_n 0.922


In [110]:
f1score_p = 2*(precision_p)*(sensitivity)/(precision_p+sensitivity)
print("f1score_p",f1score_p.round(3))

f1score_p 0.889


In [116]:
# weight_avg_precision = ((precision_n * ytest.value_counts()[0]) + (precision_p * ytest.value_counts()[1]))/len(ytest)
# weight_avg_recall = ((specificity * ytest.value_counts()[0]) + (sensitivity * ytest.value_counts()[1]))/len(ytest)

In [117]:
# weight_avg_precision,weight_avg_recall

In [118]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91        43
           1       0.91      0.86      0.89        37

    accuracy                           0.90        80
   macro avg       0.90      0.90      0.90        80
weighted avg       0.90      0.90      0.90        80



In [120]:
from sklearn.metrics import log_loss
log_loss(ytest, ypred)

3.604365338911715