In [43]:
import pandas as pd
import numpy as np

In [44]:
data=pd.read_csv('/content/Student_Performance.csv')

In [45]:
data.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
0,7,99,Yes,9,1,91
1,4,82,No,4,2,65
2,8,51,Yes,7,2,45
3,5,52,Yes,5,2,36
4,7,75,No,8,5,66


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Hours_Studied                     10000 non-null  int64 
 1   Previous_Scores                   10000 non-null  int64 
 2   Extracurricular_Activities        10000 non-null  object
 3   Sleep_Hours                       10000 non-null  int64 
 4   Sample_Question_Papers_Practiced  10000 non-null  int64 
 5   Performance_Index                 10000 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 468.9+ KB


In [47]:
data.isna().sum()

Hours_Studied                       0
Previous_Scores                     0
Extracurricular_Activities          0
Sleep_Hours                         0
Sample_Question_Papers_Practiced    0
Performance_Index                   0
dtype: int64

In [48]:
data

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
0,7,99,Yes,9,1,91
1,4,82,No,4,2,65
2,8,51,Yes,7,2,45
3,5,52,Yes,5,2,36
4,7,75,No,8,5,66
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23
9996,7,64,Yes,8,5,58
9997,6,83,Yes,8,5,74
9998,9,97,Yes,7,0,95


# outliers treatment

In [49]:
data.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
0,7,99,Yes,9,1,91
1,4,82,No,4,2,65
2,8,51,Yes,7,2,45
3,5,52,Yes,5,2,36
4,7,75,No,8,5,66


In [50]:
def outlier_treat(x):
  if ((x.dtype=='i') | (x.dtype=='f')):
    x=x.clip(lower=x.quantile(0.01),upper=x.quantile(0.99))
  return x

In [51]:
data=data.apply(outlier_treat)

In [52]:
data.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Extracurricular_Activities,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index
0,7,99,Yes,9,1,91
1,4,82,No,4,2,65
2,8,51,Yes,7,2,45
3,5,52,Yes,5,2,36
4,7,75,No,8,5,66


# one hot encoding

In [63]:
data=pd.get_dummies(columns=['Extracurricular_Activities'],data=data,drop_first=True)

In [64]:
data.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Sleep_Hours,Sample_Question_Papers_Practiced,Performance_Index,Extracurricular_Activities_Yes
0,7,99,9,1,91,1
1,4,82,4,2,65,0
2,8,51,7,2,45,1
3,5,52,5,2,36,1
4,7,75,8,5,66,0


In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours_Studied                     10000 non-null  int64
 1   Previous_Scores                   10000 non-null  int64
 2   Sleep_Hours                       10000 non-null  int64
 3   Sample_Question_Papers_Practiced  10000 non-null  int64
 4   Performance_Index                 10000 non-null  int64
 5   Extracurricular_Activities_Yes    10000 non-null  uint8
dtypes: int64(5), uint8(1)
memory usage: 400.5 KB


# Define X and Y variables

In [66]:
X=data[data.columns.difference(['Performance_Index'])]
y=data.Performance_Index

In [67]:
X

Unnamed: 0,Extracurricular_Activities_Yes,Hours_Studied,Previous_Scores,Sample_Question_Papers_Practiced,Sleep_Hours
0,1,7,99,1,9
1,0,4,82,2,4
2,1,8,51,2,7
3,1,5,52,2,5
4,0,7,75,5,8
...,...,...,...,...,...
9995,1,1,49,2,4
9996,1,7,64,5,8
9997,1,6,83,5,8
9998,1,9,97,0,7


In [68]:
y

0       91
1       65
2       45
3       36
4       66
        ..
9995    23
9996    58
9997    74
9998    95
9999    64
Name: Performance_Index, Length: 10000, dtype: int64

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.2, random_state=123 )

In [71]:
train_X.shape

(8000, 5)

In [72]:
test_X.shape

(2000, 5)

In [73]:
from sklearn.linear_model import LinearRegression


In [74]:
model = LinearRegression()
model.fit(train_X,train_y)

In [75]:
from sklearn.metrics import r2_score, mean_squared_error


In [76]:
print('Model Performance on Train dataset')
r2 = r2_score(train_y,model.predict(train_X))
print(f'R2 Score: {r2}')
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(train_y,model.predict(train_X))
print(f'Mean Squared Error: {mse}')

Model Performance on Train dataset
R2 Score: 0.9890045863948551
Mean Squared Error: 4.070071401362252


In [77]:
print('Model Performance on Test dataset')
r2 = r2_score(test_y,model.predict(test_X))
print(f'R2 Score: {r2}')
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(test_y,model.predict(test_X))
print(f'Mean Squared Error: {mse}')

Model Performance on Test dataset
R2 Score: 0.9877171781991884
Mean Squared Error: 4.479630448731185


# Pickle File

In [78]:
import pickle

with open('model_lin.pkl','wb') as f:
  pickle.dump(model,f)

In [79]:
import pickle
with open('model_lin.pkl','rb') as f:
  mod=pickle.load(f)

In [80]:
mod