In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge,Lasso ,RidgeCV ,LassoCV,ElasticNet,ElasticNetCV,LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
import pickle 
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('Admission_Prediction.csv')

In [4]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [5]:
pf = ProfileReport(df)

In [6]:
pf.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Some of the insights that we got from above analysis
1. There is multicollineartiy in the dataset.
2. There are missing values present in the dataset.
3. The data is noramlly distibuted.

In [7]:
df['GRE Score'] = df['GRE Score'].fillna(df['GRE Score'].mean())

In [8]:
df['TOEFL Score'] = df['TOEFL Score'].fillna(df['TOEFL Score'].mean())

In [9]:
df['University Rating'] = df['University Rating'].fillna(df['University Rating'].mean())

In [10]:
df.describe()  #now count of each column is 500 means no missing values

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.558763,107.187755,3.121649,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.103952,6.051338,1.128802,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,309.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,316.558763,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,324.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [11]:
df.drop(columns= ['Serial No.'],inplace= True)

In [12]:
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.0,1,0.72
3,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [13]:
y = df['Chance of Admit']

In [14]:
x = df.drop(columns=['Chance of Admit'])   # this will give us all the faetures removing the label

In [15]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

In [16]:
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.000000,118.0,4.0,4.5,4.5,9.65,1
1,324.000000,107.0,4.0,4.0,4.5,8.87,1
2,316.558763,104.0,3.0,3.0,3.5,8.00,1
3,322.000000,110.0,3.0,3.5,2.5,8.67,1
4,314.000000,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1
496,337.000000,117.0,5.0,5.0,5.0,9.87,1
497,330.000000,120.0,5.0,4.5,5.0,9.56,1
498,312.000000,103.0,4.0,4.0,5.0,8.43,0


Now we will use StandardScaler because we can see there is a lot of dispersion in the data which will create difficulty fro our model to understand the relation. So we bring the data to lower level. 

In [17]:
scaler = StandardScaler()

In [18]:
arr = scaler.fit_transform(x)

In [19]:
arr

array([[ 1.84274116e+00,  1.78854223e+00,  7.78905651e-01, ...,
         1.09894429e+00,  1.77680627e+00,  8.86405260e-01],
       [ 6.70814288e-01, -3.10581135e-02,  7.78905651e-01, ...,
         1.09894429e+00,  4.85859428e-01,  8.86405260e-01],
       [ 5.12433309e-15, -5.27312752e-01, -1.07876604e-01, ...,
         1.73062093e-02, -9.54042814e-01,  8.86405260e-01],
       ...,
       [ 1.21170361e+00,  2.11937866e+00,  1.66568791e+00, ...,
         1.63976333e+00,  1.62785086e+00,  8.86405260e-01],
       [-4.10964364e-01, -6.92730965e-01,  7.78905651e-01, ...,
         1.63976333e+00, -2.42366993e-01, -1.12815215e+00],
       [ 9.41258951e-01,  9.61451165e-01,  7.78905651e-01, ...,
         1.09894429e+00,  7.67219636e-01, -1.12815215e+00]])

In [20]:
df1 = pd.DataFrame(arr)    #mean has become close to zero and standard deviation is close to 1 and dispersion of values is also less

In [21]:
df1

Unnamed: 0,0,1,2,3,4,5,6
0,1.842741e+00,1.788542,0.778906,1.137360,1.098944,1.776806,0.886405
1,6.708143e-01,-0.031058,0.778906,0.632315,1.098944,0.485859,0.886405
2,5.124333e-15,-0.527313,-0.107877,-0.377773,0.017306,-0.954043,0.886405
3,4.905178e-01,0.465197,-0.107877,0.127271,-1.064332,0.154847,0.886405
4,-2.306679e-01,-0.692731,-0.994659,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.392000e+00,0.134360,1.665688,1.137360,0.558125,0.734118,0.886405
496,1.842741e+00,1.623124,1.665688,1.642404,1.639763,2.140919,0.886405
497,1.211704e+00,2.119379,1.665688,1.137360,1.639763,1.627851,0.886405
498,-4.109644e-01,-0.692731,0.778906,0.632315,1.639763,-0.242367,-1.128152


In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()

In [23]:
vif_df['vif'] = [variance_inflation_factor(arr,i) for i in range(arr.shape[1])]

In [24]:
vif_df['feature'] = x.columns

In [25]:
vif_df   

Unnamed: 0,vif,feature
0,4.153268,GRE Score
1,3.792866,TOEFL Score
2,2.508768,University Rating
3,2.77575,SOP
4,2.037308,LOR
5,4.65167,CGPA
6,1.459311,Research


As vif values is less than 10 that means we don't need to drop any of the columns i.e there is no multicollinearity in the dataset and we can go forward with arr dataset

In [26]:
x_train,x_test,y_train,y_test = train_test_split(arr,y,test_size = 0.15,random_state=100)

In [27]:
x_train


array([[ 0.85111073,  0.46519653, -0.1078766 , ...,  0.01730621,
         0.30380282,  0.88640526],
       [-1.58289124, -1.1889856 , -1.88144112, ..., -1.60515091,
        -1.13609942, -1.12815215],
       [ 0.67081429,  0.63061474, -0.1078766 , ..., -2.14596996,
         0.35345462,  0.88640526],
       ...,
       [-1.04200191, -0.85814918, -0.99465886, ..., -1.06433187,
        -0.65613201, -1.12815215],
       [-0.50111259, -0.85814918, -0.1078766 , ...,  0.55812525,
         0.10519562,  0.88640526],
       [-1.31244657, -0.85814918, -1.88144112, ..., -2.14596996,
        -0.95404281, -1.12815215]])

In [28]:
lr = LinearRegression()

In [29]:
lr.fit(x_train,y_train)

LinearRegression()

In [30]:
pickle.dump(lr,open('admission_lr_model.pickle','wb'))

In [31]:
lr.predict([[337.000000,118.0,4.0,4.5,4.5,9.65,1]]) # its giving wrong output because we have to pass transformed data

array([10.08318535])

In [36]:
lr.predict(test1)

array([0.95117594])

In [37]:
lr.predict(test2)

array([0.65214989])

In [38]:
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


In [39]:
test1 =scaler.transform([[337.000000,118.0,4.0,4.5,4.5,9.65,1]])

In [40]:
test2 =scaler.transform([[316.558763,104.0,3.0,3.0,3.5,8.00,1]])

In [41]:
model = pickle.load(open('admission_lr_model.pickle','rb'))

In [42]:
model.predict(test2)

array([0.65214989])

In [43]:
lr.score(x_test,y_test)    #R^2

0.8420039560601401

In [44]:
#we have to find adjusted R^2
def adj_r2(x,y):
    r2 = lr.score(x_test,y_test)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [45]:
adj_r2(x_test,y_test)

0.8254969066932891

In [46]:
lr.coef_

array([ 0.01912905,  0.01780082,  0.00550634, -0.00025051,  0.01844312,
        0.07254151,  0.01195331])

In [47]:
lr.intercept_

0.7203289055688045

In [48]:
lassocv = LassoCV(alphas= None,cv=5,max_iter=2000000,normalize=True)

In [49]:
lassocv.fit(x_train,y_train)

LassoCV(cv=5, max_iter=2000000, normalize=True)

In [50]:
lassocv.alpha_

4.203962663551952e-05

In [51]:
lasso = Lasso(alpha = lassocv.alpha_)

In [52]:
lasso.fit(x_train,y_train)

Lasso(alpha=4.203962663551952e-05)

In [53]:
lasso.score(x_test,y_test)

0.8421260048013872

In [54]:
ridgecv= RidgeCV(alphas= np.random.uniform(0,10,50),cv= 10,normalize= True)

In [55]:
ridgecv.fit(x_train,y_train)

RidgeCV(alphas=array([6.68204965, 3.19637106, 1.03026743, 3.64993363, 2.97398927,
       8.3454853 , 8.38807006, 3.88454206, 2.91640114, 5.26553035,
       6.45401583, 3.83464608, 4.18854292, 2.30884052, 1.93438421,
       8.5798477 , 1.43522014, 4.03772832, 1.50891667, 3.76786624,
       8.52663741, 1.62264038, 1.38339071, 5.92936648, 0.13692136,
       2.73567118, 7.07637867, 3.50700217, 9.41369405, 6.24604438,
       3.23920146, 3.91004311, 3.72087749, 5.95067496, 4.62111776,
       6.39288164, 7.48675185, 4.21076565, 6.39268389, 9.21218936,
       9.9335562 , 6.1579896 , 9.50609126, 1.93408987, 4.38271384,
       5.98524073, 1.94024016, 2.12090484, 3.78909938, 3.36217317]),
        cv=10, normalize=True)

In [56]:
ridgecv.alpha_

0.13692135678246786

In [57]:
np.random.uniform(0,10,50)

array([8.12728134, 2.96776034, 4.49695048, 6.63531763, 5.3711624 ,
       1.18056992, 1.38999668, 7.70516934, 2.32506583, 1.22754864,
       3.24027985, 8.36053979, 5.36938124, 5.66179225, 2.44195783,
       5.39009068, 6.1688799 , 0.76450699, 3.87241082, 0.80357198,
       7.38958885, 8.82260589, 6.0197307 , 7.60575407, 8.05278134,
       9.02605073, 2.66771879, 8.73980439, 1.11317483, 0.57935148,
       8.837884  , 1.75077521, 9.58912137, 5.76054904, 7.60464536,
       2.85047548, 5.53689032, 2.50136458, 3.65650481, 9.89727941,
       6.86003409, 7.25348964, 2.88065064, 4.46626946, 0.13784789,
       1.82600849, 5.94600074, 3.0908736 , 6.99724547, 7.33115416])

In [58]:
ridge_lr = Ridge(alpha = ridgecv.alpha_)
ridge_lr.fit(x_train,y_train)

Ridge(alpha=0.13692135678246786)

In [59]:
ridge_lr.score(x_test,y_test)

0.8420062079645798

In [60]:
elastic = ElasticNetCV(alphas= None ,cv = 10)
elastic.fit(x_train,y_train)

ElasticNetCV(cv=10)

In [61]:
elastic.alpha_

0.001391101145529104

In [62]:
elastic.l1_ratio

0.5

In [63]:
elastic_lr = ElasticNet(alpha = elastic.alpha_,l1_ratio= elastic.l1_ratio)

In [64]:
elastic_lr.fit(x_train,y_train)

ElasticNet(alpha=0.001391101145529104)