In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Student_Performance.csv")
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Hours Studied,10000.0,4.9929,2.589309,1.0,3.0,5.0,7.0,9.0
Previous Scores,10000.0,69.4457,17.343152,40.0,54.0,69.0,85.0,99.0
Sleep Hours,10000.0,6.5306,1.695863,4.0,5.0,7.0,8.0,9.0
Sample Question Papers Practiced,10000.0,4.5833,2.867348,0.0,2.0,5.0,7.0,9.0
Performance Index,10000.0,55.2248,19.212558,10.0,40.0,55.0,71.0,100.0


In [4]:
df.duplicated().sum()

127

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.shape

(9873, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9873 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     9873 non-null   int64  
 1   Previous Scores                   9873 non-null   int64  
 2   Extracurricular Activities        9873 non-null   object 
 3   Sleep Hours                       9873 non-null   int64  
 4   Sample Question Papers Practiced  9873 non-null   int64  
 5   Performance Index                 9873 non-null   float64
dtypes: float64(1), int64(4), object(1)
memory usage: 539.9+ KB


In [8]:
df["Extracurricular Activities"].unique()

array(['Yes', 'No'], dtype=object)

In [9]:
df["Extracurricular Activities"] = df["Extracurricular Activities"].map({"Yes":1,"No":0})

In [10]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [11]:
X = df.drop("Performance Index",axis=1)
y = df['Performance Index']

In [12]:
X.shape, y.shape

((9873, 5), (9873,))

In [13]:
from sklearn.model_selection import train_test_split
X_train , X_test ,y_train , y_test = train_test_split(X,y , test_size = 0.29 , random_state = 1)

In [14]:
X_train.shape , X_test.shape ,y_train.shape , y_test.shape

((7009, 5), (2864, 5), (7009,), (2864,))

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
model = LinearRegression()
model.fit(X_train,y_train)

In [17]:
model.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [18]:
model.score(X_test,y_test)

0.9887706526470462

In [19]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
lasso = Lasso()
lasso.fit(X_train,y_train)
lasso.score(X_test,y_test)


0.9867625668521814

In [20]:
ridge = Ridge()
ridge.fit(X_train,y_train)
ridge.score(X_test,y_test)*100

98.87707197511759

Decision Tree Regressor

In [21]:
from sklearn.tree import DecisionTreeRegressor
D_model = DecisionTreeRegressor()
D_model.fit(X_train,y_train)
D_model.score(X_test,y_test)*100

97.56180860985137

In [22]:
from sklearn.model_selection import GridSearchCV

param = {"criterion":["squared_error", 'friedman_mse', 'absolute_error', 'poisson'],
         "splitter":['best', 'random'],
         "max_depth":[10,11,6,7,8,9]

}

model = GridSearchCV(estimator=DecisionTreeRegressor(),param_grid=param , cv=5,verbose=3,n_jobs=-1)
model.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [23]:
model.best_estimator_

In [24]:
model.best_params_

{'criterion': 'squared_error', 'max_depth': 9, 'splitter': 'best'}

In [25]:
model.best_score_

0.982032908132385

In [26]:
y_pred = model.predict(X_test)
model.score(X_test,y_test)

0.9839064456333072

In [27]:
from sklearn.svm import SVR
model = SVR()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9847446100310665

In [28]:
def model_sec(models , X ,y,X_test , y_test):
    scores = []

    for model in models:
        model.fit(X,y)
        score = model.score(X_test , y_test)
        scores.append(score)

    return scores 


In [29]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
svr = SVR()
models = (lr,dt,svr)
model_sec(models,X_train,y_train,X_test,y_test)

[0.9887706526470462, 0.9751372690604028, 0.9847446100310665]