### Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split

### Import Dataset

In [2]:
df = pd.read_csv('Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


First and foremost, some basic analysis.

### Data Analysis

In [3]:
from pandas_profiling import ProfileReport as PR
pf = PR(df)
pf.to_widgets()

  from pandas_profiling import ProfileReport as PR


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Data Preprocessing

We can drop `Serial No.` for our prediction.

In [4]:
df.drop(columns = ['Serial No.'], inplace = True)
df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


Now, we need to define our training and target features.

In [9]:
y = df['Chance of Admit ']
X = df.drop(columns = ['Chance of Admit '])

In [10]:
X.head(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1


In [11]:
y.head(2)

0    0.92
1    0.76
Name: Chance of Admit , dtype: float64

### Normalization / Standardization

In [13]:
scaler = StandardScaler()
arr = scaler.fit_transform(X)

In [14]:
df1 = pd.DataFrame(arr)

### Multicollinearity

In [15]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [17]:
col_num = arr.shape[1]

In [19]:
vif_df = pd.DataFrame()
vif_df['features'] = X.columns

In [20]:
vif_df['vif'] = [variance_inflation_factor(arr, i) for i in range(col_num)]

In [21]:
vif_df

Unnamed: 0,features,vif
0,GRE Score,4.615516
1,TOEFL Score,4.288959
2,University Rating,2.919606
3,SOP,3.075504
4,LOR,2.431258
5,CGPA,5.207403
6,Research,1.543312


All the features have `VIF` values under 10. So we can keep all the features.

### Data Split into Training and Test Sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(arr, y, test_size=0.2, random_state=1)
X_train

array([[-1.37944785, -1.71727709, -0.95202863, ..., -0.50426044,
        -1.20711841,  0.90911166],
       [ 1.50031044,  1.91193482,  1.67425725, ...,  1.16732114,
         1.84877129,  0.90911166],
       [-1.03038624,  0.75718558, -0.07660001, ...,  0.05293342,
         0.085758  , -1.09997489],
       ...,
       [ 0.71492181, -0.06763531, -0.07660001, ...,  0.05293342,
         0.85812573,  0.90911166],
       [ 0.80218721,  0.5922214 ,  1.67425725, ...,  0.61012728,
         1.05961296,  0.90911166],
       [-1.46671326, -0.39756367, -1.82745726, ..., -1.61864817,
        -1.34144323, -1.09997489]])

### Model Algorithm

#### Linear Regression

In [23]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [25]:
import pickle

In [26]:
pickle.dump(lr, open('Admission_Predict_lr_model.pickle', 'wb'))

#### Linear Regression Prediction

In [27]:
lr.predict(X_test)

array([0.7226944 , 0.69413229, 0.78864499, 0.78216045, 0.85343383,
       0.64491036, 0.63638313, 0.55607164, 0.49601148, 0.93154092,
       0.81489263, 0.93486376, 0.88212268, 0.64871558, 0.72652977,
       0.68591465, 0.81796893, 0.86744325, 0.50758006, 0.6920451 ,
       0.667625  , 0.78064666, 0.83421581, 0.91967463, 0.65430912,
       0.56652414, 0.7241488 , 0.70847631, 0.90514058, 0.65497936,
       0.94911478, 0.63848498, 0.79257757, 0.78496168, 0.71720411,
       0.65418614, 0.43262218, 0.65210164, 0.90479229, 0.76249685,
       0.84265544, 0.68014996, 0.88404992, 0.64896391, 0.97727002,
       0.7111686 , 0.73740191, 0.82754289, 0.63802488, 0.6669232 ,
       0.78417803, 0.56425656, 0.80796338, 0.73154013, 0.70870657,
       0.89037172, 0.47935505, 0.52449211, 0.78599642, 0.78868035,
       0.92965198, 0.73217127, 0.87659205, 0.72122612, 0.53196654,
       0.77603428, 0.85923897, 0.75137956, 0.65711657, 0.79869589,
       0.81106646, 0.57971478, 0.70359047, 0.63237323, 0.83828

We get the prediction, but we need to know the accuracy of our model.

#### Linear Regression Accuracy

##### R-square

In [29]:
lr_score = lr.score(X_test, y_test)
print(f"The accuracy of our linear regression model is: {round(lr_score, 2)}%")

The accuracy of our linear regression model is: 0.81%


##### Adjusted R-square

In [33]:
def adj_R2(model, X, y):
    R2 = model.score(X, y)
    n = X.shape[0]
    p = X.shape[1]
    adjusted_R2 = 1 - (n-1)*(1-R2)/(n-p-1)
    
    return adjusted_R2

In [34]:
lr_adjusted_score = adj_R2(lr, X_test, y_test)
print(f"The adjusted R2 score of our linear regression model is: {round(lr_adjusted_score, 2)}%")

The adjusted R2 score of our linear regression model is: 0.79%


##### Model Coefficients

In [35]:
print(f"The coefficient is: {lr.coef_}")
print("------------------")
print(f"The intercept is: {lr.intercept_}")

The coefficient is: [ 0.01731767  0.01906805  0.00941677 -0.00567902  0.01949352  0.06868488
  0.01299137]
------------------
The intercept is: 0.7266315501964594


#### Lasso

In [36]:
lassocv = LassoCV(cv = 10, max_iter=2000000, normalize=True)
lassocv_model = lassocv.fit(X_train, y_train)
lassocv_model.alpha_

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


9.010505902091334e-05

In [37]:
lasso = Lasso(alpha=lassocv_model.alpha_)
lasso_model = lasso.fit(X_train, y_train)

In [38]:
lasso_model.score(X_test, y_test)

0.8080095483913834

In [39]:
pickle.dump(lasso_model, open('Admission_Predict_lasso_model.pickle', 'wb'))

#### Ridge

In [45]:
ridgecv = RidgeCV(alphas= np.random.uniform(0, 10, 50), cv=10, normalize=True)
ridgecv_model = ridgecv.fit(X_train, y_train)
ridgecv_model.alpha_

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

0.054235706992609156

In [46]:
ridge = Ridge(alpha = ridgecv_model.alpha_)
ridge_model = ridge.fit(X_train, y_train)

In [47]:
ridge_model.score(X_test, y_test)

0.8078942136942281

In [48]:
pickle.dump(ridge_model, open('Admission_Predict_ridge_model.pickle', 'wb'))

#### Elastic Net

In [49]:
elasticnetcv = ElasticNetCV(alphas=None, cv=10)
elasticcv_model = elasticnetcv.fit(X_train, y_train)

In [51]:
elastic = ElasticNet(alpha=elasticcv_model.alpha_, l1_ratio=elasticcv_model.l1_ratio_)
elastic_model = elastic.fit(X_train, y_train)
elastic_model.score(X_test, y_test)

0.8076312058618265

In [52]:
pickle.dump(elastic_model, open('Admission_Predict_elasticnet_model.pickle', 'wb'))

### Conclusion

All linear regression models are having the same accuracy, so there is no overfitting and we get an accuracy of approximately **$81$** **%**