In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV,ElasticNet,ElasticNetCV,LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
data = pd.read_csv('Admission_Prediction.csv')
data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.00,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.80
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332.0,108.0,5.0,4.5,4.0,9.02,1,0.87
496,497,337.0,117.0,5.0,5.0,5.0,9.87,1,0.96
497,498,330.0,120.0,5.0,4.5,5.0,9.56,1,0.93
498,499,312.0,103.0,4.0,4.0,5.0,8.43,0,0.73


In [5]:
from pandas_profiling import ProfileReport
pf = ProfileReport(data)

In [6]:
pf.to_widgets()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=22.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render widgets'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [7]:
pf.to_file('Admission_Prediction.html')

HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
data.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR', 'CGPA', 'Research', 'Chance of Admit'],
      dtype='object')

In [9]:
data['GRE Score'].mean()

316.55876288659795

In [10]:
# filling missing value with avg for now not using imputers
data['GRE Score'] = data['GRE Score'].fillna(data['GRE Score'].mean())

In [13]:
data['GRE Score'].isna().sum()

0

In [14]:
data['TOEFL Score'] = data['TOEFL Score'].fillna(data['TOEFL Score'].mean())

In [17]:
data['TOEFL Score'].isna().sum()

0

In [15]:
data['University Rating'] = data['University Rating'].fillna(data['University Rating'].mean())

In [18]:
data['University Rating'].isna().sum()

0

In [19]:
# dropping not required column 
data.drop(columns=['Serial No.'],inplace=True)

In [21]:
data.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.0,1,0.72
3,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [22]:
y = data['Chance of Admit']
x = data.drop(columns=['Chance of Admit'])

In [23]:
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.000000,118.0,4.0,4.5,4.5,9.65,1
1,324.000000,107.0,4.0,4.0,4.5,8.87,1
2,316.558763,104.0,3.0,3.0,3.5,8.00,1
3,322.000000,110.0,3.0,3.5,2.5,8.67,1
4,314.000000,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1
496,337.000000,117.0,5.0,5.0,5.0,9.87,1
497,330.000000,120.0,5.0,4.5,5.0,9.56,1
498,312.000000,103.0,4.0,4.0,5.0,8.43,0


In [24]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

In [26]:
# range of data for columns GRE score , University Rating, SOP ,CGPA , etc varying lot 
# Data has lots of variance so for model to understand relation between feature and label in a better way 
# z statistics is used here to do standardization mean=0 , standard deviation =1
scalar = StandardScaler()

In [34]:
arr = scalar.fit_transform(x)

In [35]:
arr

array([[ 1.84274116e+00,  1.78854223e+00,  7.78905651e-01, ...,
         1.09894429e+00,  1.77680627e+00,  8.86405260e-01],
       [ 6.70814288e-01, -3.10581135e-02,  7.78905651e-01, ...,
         1.09894429e+00,  4.85859428e-01,  8.86405260e-01],
       [ 5.12433309e-15, -5.27312752e-01, -1.07876604e-01, ...,
         1.73062093e-02, -9.54042814e-01,  8.86405260e-01],
       ...,
       [ 1.21170361e+00,  2.11937866e+00,  1.66568791e+00, ...,
         1.63976333e+00,  1.62785086e+00,  8.86405260e-01],
       [-4.10964364e-01, -6.92730965e-01,  7.78905651e-01, ...,
         1.63976333e+00, -2.42366993e-01, -1.12815215e+00],
       [ 9.41258951e-01,  9.61451165e-01,  7.78905651e-01, ...,
         1.09894429e+00,  7.67219636e-01, -1.12815215e+00]])

In [38]:
df = pd.DataFrame(arr)

In [39]:
pf2 = ProfileReport(df)
pf2.to_widgets()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render widgets'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [40]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [41]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# To remove multicollinearity using Variance Inflation Factor VIF

variance_inflation_factor()

In [42]:
arr.shape

(500, 7)

In [43]:
arr.shape[1]

7

In [45]:
vif_df = pd.DataFrame()
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]

In [48]:
vif_df['feature'] = x.columns

In [49]:
vif_df

Unnamed: 0,vif,feature
0,4.153268,GRE Score
1,3.792866,TOEFL Score
2,2.508768,University Rating
3,2.77575,SOP
4,2.037308,LOR
5,4.65167,CGPA
6,1.459311,Research


# VIF is not > 10 for any feature so we can use all columns as it is 

In [59]:
x_train,x_test,y_train,y_test = train_test_split(arr,y,test_size=0.25) # Randomly splitting data in train and test data with 25%,75% ratio

In [60]:
x_train

array([[ 2.0230376 ,  1.12686938,  1.66568791, ...,  1.09894429,
         1.95886287,  0.88640526],
       [-1.94348412, -1.35440382, -1.88144112, ..., -2.14596996,
        -1.66571863, -1.12815215],
       [ 0.49051785,  0.46519653, -0.1078766 , ...,  0.01730621,
        -0.95404281, -1.12815215],
       ...,
       [-0.05037148,  0.63061474,  0.77890565, ...,  1.63976333,
        -0.06031039, -1.12815215],
       [-1.13215013, -0.69273097,  1.66568791, ...,  0.55812525,
         0.10519562, -1.12815215],
       [-0.50111259, -0.52731275, -0.99465886, ...,  0.01730621,
        -0.15961399, -1.12815215]])

In [61]:
x_train,x_test,y_train,y_test = train_test_split(arr,y,test_size=0.25,random_state=345) # to fix random selection just like seed using random_state

In [82]:
x_train,x_test,y_train,y_test = train_test_split(arr,y,test_size=0.25,random_state=1000) # to fix random selection just like seed using random_state

In [83]:
x_train

array([[-0.95185369,  0.46519653, -0.99465886, ...,  0.55812525,
        -0.20926579, -1.12815215],
       [ 0.76096251,  0.63061474, -0.1078766 , ...,  0.01730621,
         0.20449922, -1.12815215],
       [-2.39422523, -0.52731275,  0.77890565, ..., -1.06433187,
        -1.84777524, -1.12815215],
       ...,
       [ 1.75259294,  0.79603295,  1.66568791, ...,  1.63976333,
         1.95886287,  0.88640526],
       [ 0.03977674, -0.03105811, -0.99465886, ..., -0.52351283,
        -0.490626  , -1.12815215],
       [-0.68140903, -0.36189454, -0.99465886, ...,  0.55812525,
        -1.48366203, -1.12815215]])

In [84]:
lr =LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

In [85]:
lr.score(x_test,y_test) # r square accurracy

0.8032472634599743

In [86]:
lr.score(x_train,y_train)

0.8256240842466435

In [87]:
import pickle
pickle.dump(lr,open('admission_lr_model.pickle','wb'))

In [88]:
saved_model = pickle.load(open('admission_lr_model.pickle','rb'))

In [89]:
data

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


In [90]:
saved_model.predict([[337.000000,118.0,4.0,4.5,4.5,9.65,1]]) # Here output is something different 
# bcoz while training model we have transformed data 
# so need to do with testing data set that too with same object -> scalar

array([8.69530225])

In [91]:
test1 = scalar.transform([[337.000000,118.0,4.0,4.5,4.5,9.65,1]])

In [92]:
saved_model.predict(test1)

array([0.95140909])

In [93]:
test1 = scalar.transform([[312.000000,103.0,4.0,4.0,5.0,8.43,0]])
saved_model.predict(test1)

array([0.703954])

In [94]:
# calculation adjusted R- squared
def adj_r2(x,y,model):
    r2 = model.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

adj_r2(x_test,y_test,saved_model)

0.7914757322139898

In [95]:
adj_r2(x_train,y_train,saved_model)

0.822298113101484

# Now to increase accruacy changing random_state to 1000

In [97]:
# using regularization 

lassocv = LassoCV(cv=10,max_iter=200000,normalize=True)