In [3]:
import pandas as pd
import warnings
import os  # NOTE(review): not referenced in the visible cells — confirm it is still needed
# Suppress every warning category for the rest of the session; this also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [7]:
# Load the raw student-performance table; the path is relative to the working directory.
student_df = pd.read_csv(filepath_or_buffer='Student_Performance.csv')


In [9]:
# Preview the first five rows.
student_df.head(n=5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [10]:
# (rows, columns) of the loaded frame.
student_df.shape

(10000, 6)

In [11]:
# Column names, non-null counts and dtypes.
student_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [12]:
# Count missing values per column.
student_df.isna().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [13]:
# Number of rows that repeat an earlier row exactly.
student_df.duplicated(keep='first').sum()

127

In [16]:
# Keep only the first occurrence of each fully duplicated row.
student_df = student_df[~student_df.duplicated()]

In [17]:
# Sanity check after de-duplication: count of remaining repeated rows.
student_df.duplicated(keep='first').sum()

0

In [18]:
# Summary statistics for the numeric columns of the de-duplicated frame.
student_df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,9873.0,9873.0,9873.0,9873.0,9873.0
mean,4.9921,69.441102,6.531652,4.583004,55.216651
std,2.589081,17.325601,1.697683,2.867202,19.20857
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,70.0
max,9.0,99.0,9.0,9.0,100.0


Standardizing the data (one-hot encoding the categorical column)

In [20]:
# One-hot encode the single categorical column, working from a copy of the
# de-duplicated frame so student_df itself is left as it was.
clean_df = pd.get_dummies(
    data=student_df.copy(),
    columns=['Extracurricular Activities'],
)

In [21]:
# Turn the one-hot indicator columns into explicit 0/1 integers.
# An explicit cast on just those columns states the intended dtype up front,
# instead of depending on how a frame-wide replace({True: 1, False: 0}) infers
# the result dtype, and it leaves every other column untouched.
clean_df = clean_df.astype({
    col: 'int64'
    for col in clean_df.columns
    if col.startswith('Extracurricular Activities_')
})
clean_df

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,Extracurricular Activities_No,Extracurricular Activities_Yes
0,7,99,9,1,91.0,0,1
1,4,82,4,2,65.0,1,0
2,8,51,7,2,45.0,0,1
3,5,52,5,2,36.0,0,1
4,7,75,8,5,66.0,1,0
...,...,...,...,...,...,...,...
9995,1,49,4,2,23.0,0,1
9996,7,64,8,5,58.0,0,1
9997,6,83,8,5,74.0,0,1
9998,9,97,7,0,95.0,0,1


Separating Training and Test Data

In [27]:
# Target is the performance index; features are every remaining column.
Y = clean_df['Performance Index']
X = clean_df.drop(columns=['Performance Index'])


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)

MODELING

Support Vector Regression Model

In [30]:
from sklearn.svm import SVR

# Support-vector regressor with scikit-learn's default hyper-parameters,
# fitted on the training split and used to predict the held-out rows.
# NOTE(review): the features are passed in unscaled; scikit-learn's SVM guide
# recommends scaling inputs for SVMs — consider a StandardScaler pipeline here.
svr = SVR().fit(X_train, Y_train)
svr_pred = svr.predict(X_test)

In [31]:
# Display the SVR predictions for the test rows.
svr_pred

array([62.45432415, 46.79600369, 47.1230006 , ..., 61.01047619,
       71.78419509, 44.74498287])

In [33]:
# Side-by-side table of actual vs. SVR-predicted scores, indexed like Y_test.
# The second column label previously read 'Perdicted Performance'; it now uses
# the same 'Predicted Performance' spelling as the other result tables.
result_df = pd.DataFrame(
    data={
        'True Performance': Y_test,
        'Predicted Performance': svr_pred,
    }
)
result_df

Unnamed: 0,True Performance,Perdicted Performance
3265,59.0,62.454324
175,45.0,46.796004
4225,48.0,47.123001
4653,64.0,65.910198
1321,24.0,22.829439
...,...,...
541,64.0,67.761135
5355,34.0,33.710474
2544,60.0,61.010476
3888,70.0,71.784195


Model Evaluation

In [36]:
# NOTE(review): accuracy_score is imported but not used in the visible cells.
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score

# Mean absolute error of the SVR predictions on the held-out rows.
svr_mae = mean_absolute_error(y_true=Y_test, y_pred=svr_pred)
svr_mae

1.8743404061125548

In [38]:
# R^2 of the SVR predictions, printed multiplied by 100.
R2Score = r2_score(y_true=Y_test, y_pred=svr_pred)
print(f"R^2 Score is: {R2Score*100}")

R^2 Score is: 98.41435343742626


Linear Regression Model

In [40]:
from sklearn.linear_model import LinearRegression

# Linear-regression model trained on the same training split as the SVR.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=Y_train)

In [41]:
# Predict the held-out rows with the fitted linear model and show the array.
lin_pred = lin_reg.predict(X=X_test)
lin_pred

array([60.70228604, 47.46488821, 47.61635294, ..., 61.69489914,
       70.32626614, 44.30172536])

In [43]:
# Side-by-side table of actual vs. linear-model predictions, indexed like Y_test.
results_df = pd.DataFrame(
    data={
        'True Performance': Y_test,
        'Predicted Performance': lin_pred,
    }
)
results_df

Unnamed: 0,True Performance,Predicted Performance
3265,59.0,60.702286
175,45.0,47.464888
4225,48.0,47.616353
4653,64.0,65.771226
1321,24.0,20.788038
...,...,...
541,64.0,67.257694
5355,34.0,33.915870
2544,60.0,61.694899
3888,70.0,70.326266


In [44]:
# Mean absolute error of the linear-model predictions on the held-out rows.
lin_mae = mean_absolute_error(y_true=Y_test, y_pred=lin_pred)
lin_mae

1.6344251815821633

In [46]:
# R^2 of the linear-model predictions, printed multiplied by 100.
R2Score = r2_score(y_true=Y_test, y_pred=lin_pred)
print(f"R^2 Score is: {R2Score*100}")

R^2 Score is: 98.79370299201616


Model Optimization

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit a linear regression on the scaled inputs.
# NOTE(review): the MAE and R^2 recorded for this pipeline match the plain
# LinearRegression cells (up to floating-point rounding), so the scaling step
# does not change this model's predictions.
lin_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
lin_pipeline_pred = lin_pipeline.fit(X_train, Y_train).predict(X_test)

In [49]:
# Side-by-side table of actual vs. pipeline predictions, indexed like Y_test.
results_df = pd.DataFrame(
    data={
        'True Performance': Y_test,
        'Predicted Performance': lin_pipeline_pred,
    }
)
results_df

Unnamed: 0,True Performance,Predicted Performance
3265,59.0,60.702286
175,45.0,47.464888
4225,48.0,47.616353
4653,64.0,65.771226
1321,24.0,20.788038
...,...,...
541,64.0,67.257694
5355,34.0,33.915870
2544,60.0,61.694899
3888,70.0,70.326266


In [51]:
# Mean absolute error of the pipeline predictions on the held-out rows.
lin_pipeline_mae = mean_absolute_error(y_true=Y_test, y_pred=lin_pipeline_pred)
lin_pipeline_mae

1.6344251815821635

In [53]:
# R^2 of the pipeline predictions, printed multiplied by 100.
R2Score = r2_score(y_true=Y_test, y_pred=lin_pipeline_pred)
print(f"R^2 Score is: {R2Score*100}")

R^2 Score is: 98.79370299201616
