In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,Ridge,ElasticNet,LassoCV,RidgeCV,ElasticNetCV,LinearRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('Admission_Prediction.csv')

In [4]:
df.shape

(500, 9)

In [5]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


### Hangling missing values

In [6]:
df.isnull().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [7]:
df['GRE Score'].fillna(df['GRE Score'].mean(),inplace=True)
df['TOEFL Score'].fillna(df['TOEFL Score'].mean(),inplace=True)
df['University Rating'].fillna(df['University Rating'].mean(),inplace=True)

In [8]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,316.558763,104.0,3.0,3.0,3.5,8.0,1,0.72
3,4,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,5,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [11]:
#drop Serial no which is constant
df.drop(columns=['Serial No.'],axis=1,inplace=True)


In [12]:
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.0,1,0.72
3,322.0,110.0,3.0,3.5,2.5,8.67,1,0.8
4,314.0,103.0,2.0,2.0,3.0,8.21,0,0.65


In [15]:
X=df.drop(columns=['Chance of Admit'],axis=1)
Y=df['Chance of Admit']

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          500 non-null    float64
 1   TOEFL Score        500 non-null    float64
 2   University Rating  500 non-null    float64
 3   SOP                500 non-null    float64
 4   LOR                500 non-null    float64
 5   CGPA               500 non-null    float64
 6   Research           500 non-null    int64  
dtypes: float64(6), int64(1)
memory usage: 27.5 KB


In [21]:
std_scalar=StandardScaler()

In [23]:
arr1=std_scalar.fit_transform(X)

In [25]:
df1=pd.DataFrame(arr1,columns=X.columns)

### muilticoliniaruty`

In [26]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [28]:
vif_df=pd.DataFrame()
vif_df['vif_score']=[variance_inflation_factor(arr1,i) for i in range(arr1.shape[1])]

In [30]:
vif_df['feature']=X.columns


In [31]:
vif_df

Unnamed: 0,vif_score,feature
0,4.153268,GRE Score
1,3.792866,TOEFL Score
2,2.508768,University Rating
3,2.77575,SOP
4,2.037308,LOR
5,4.65167,CGPA
6,1.459311,Research


In [32]:
# Note: No need to remove the columns because no muilticolinearity <5 or <10

In [35]:

xtrain,xtest,ytrain,ytest=train_test_split(arr1,Y,test_size=.2,random_state=0)

In [36]:
lr=LinearRegression()

In [37]:
lr.fit(xtrain,ytrain)

In [38]:
print(lr.coef_,lr.intercept_)

[0.02408022 0.01583773 0.00677457 0.00046322 0.01858513 0.07048533
 0.01237107] 0.7209291542718489


In [39]:
lr.score(xtrain,ytrain)

0.8319899358289431

### Loaad and read pickle model

In [41]:
import pickle

pickle.dump(lr,open("Admission_lr_model.sav",'wb'))

In [47]:
pickle.load(open('Admission_lr_model.sav','rb'))

### predict the data

In [53]:
test1=X.iloc[0]
test1=std_scalar.transform([test1])



In [54]:
lr.predict(test1)

array([0.95606139])

In [58]:
y_pred=lr.predict(xtest)

In [61]:
#prediction for test size
lr.score(xtest,ytest)

0.7589292574503157

In [62]:
from sklearn.metrics import r2_score

In [63]:
r2_score(ytest,y_pred)

0.7589292574503157

### Lasso Regression

In [68]:
lassocv=LassoCV(max_iter=200000,cv=10)

In [70]:
lassocv.fit(xtrain,ytrain)

In [71]:
lassocv.alpha_

0.00072900935750929

In [72]:
lasso=Lasso(alpha=lassocv.alpha_)

In [73]:
lasso.fit(xtrain,ytrain)

In [74]:
lasso.score(xtest,ytest)

0.7602369359609997

### Ridge regression

In [91]:
ridgecv=RidgeCV(cv=10,alphas=(0.1, 1.0, 10.0))

In [92]:
ridgecv.fit(xtrain,ytrain)

In [93]:
ridgecv.alpha_

10.0

In [95]:
ridge=Ridge(alpha=ridgecv.alpha_)

In [97]:
ridge.fit(xtest,ytest)

In [98]:
ridge.score(xtest,ytest)

0.77157876946993

### Elastic net

In [99]:
elasticnet=ElasticNetCV(alphas=None,max_iter=30000,cv=10)

In [101]:
elasticnet.fit(xtrain,ytrain)

In [102]:
elasticnet.alpha_

0.0014580187150185788

In [103]:
elastic=ElasticNet(alpha=elasticnet.alpha_)

In [104]:
elastic.fit(xtrain,ytrain)

In [105]:
elastic.score(xtest,ytest)

0.7602154446296342