In [476]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [477]:
# Load the student-performance CSV into a DataFrame and preview its first rows.
csv_path = 'Dataset/StudentPerformanceFactors.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [478]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0
mean,19.975329,79.977448,7.02906,75.070531,1.493719,2.96761,67.235659
std,5.990594,11.547475,1.46812,14.399784,1.23057,1.031231,3.890456
min,1.0,60.0,4.0,50.0,0.0,0.0,55.0
25%,16.0,70.0,6.0,63.0,1.0,2.0,65.0
50%,20.0,80.0,7.0,75.0,1.0,3.0,67.0
75%,24.0,90.0,8.0,88.0,2.0,4.0,69.0
max,44.0,100.0,10.0,100.0,8.0,6.0,101.0


In [479]:
# Print the frame's index range plus each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   object

In [480]:
# Count missing values per column to see which columns need imputation.
data.isna().sum()

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

In [481]:
# Impute missing entries column by column with that column's most frequent value
# (the first row returned by DataFrame.mode()).
column_modes = data.mode().iloc[0]
data = data.fillna(column_modes)

In [482]:
# Re-check the per-column missing counts after imputation.
data.isna().sum()

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
Exam_Score                    0
dtype: int64

In [483]:
# Predictor columns used by the models below, and the target they are fit against.
features = ['Hours_Studied', 'Sleep_Hours', 'Attendance']
X, y = data[features], data['Exam_Score']


In [484]:

# One scatter panel per feature against the exam score, laid out side by side.
# All panels are drawn on a single figure, which is laid out and written to disk
# BEFORE plt.show(): calling show() inside the loop (or before savefig) hands the
# figure off after the first panel, so the saved PNG would not contain the grid.
# squeeze=False keeps `axes` two-dimensional even if `features` has one entry.
fig, axes = plt.subplots(1, len(features), figsize=(12, 6), squeeze=False)
for ax, feature in zip(axes[0], features):
    sns.scatterplot(x=X[feature], y=y, ax=ax)
    ax.set_title(f'{feature} vs Exam Score')
fig.tight_layout()
fig.savefig('feature_scatter.png')
plt.show()

In [485]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [486]:
# Fit a linear regression on the training split, then predict the held-out rows.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [487]:
# Degree-2 polynomial feature expansion feeding a linear regression, trained on the
# same split as the plain linear model and used to predict the held-out rows.
polyreg = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_y_pred = polyreg.fit(X_train, y_train).predict(X_test)


In [488]:
# Score the linear model's test-set predictions and print MSE and R² to two decimals.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(
    "Linear Regression Performance:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

Linear Regression Performance:
Mean Squared Error: 5.82
R² Score: 0.59


In [489]:
# Score the polynomial pipeline's test-set predictions and print MSE and R² to two decimals.
mse, r2 = mean_squared_error(y_test, poly_y_pred), r2_score(y_test, poly_y_pred)
print(
    "Polynomial Regression Performance:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

Polynomial Regression Performance:
Mean Squared Error: 5.83
R² Score: 0.59


In [None]:
# Overlay the actual test scores and the linear model's predictions, both plotted
# against the first feature. The figure is saved BEFORE plt.show(): once show()
# has handed the figure off, a later savefig would write an empty canvas.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test[features[0]], y_test, color='blue', label='Actual')
ax.scatter(X_test[features[0]], y_pred, color='red', label='Linear Prediction')
ax.set_xlabel(features[0])
ax.set_ylabel('Exam Score')
ax.set_title('Actual vs Predicted Scores')
ax.legend()
fig.savefig('Linear_predictions.png')
plt.show()

In [491]:
# Overlay the actual test scores and the polynomial pipeline's predictions, both
# plotted against the first feature. The figure is saved BEFORE plt.show(): once
# show() has handed the figure off, a later savefig would write an empty canvas.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test[features[0]], y_test, color='blue', label='Actual')
ax.scatter(X_test[features[0]], poly_y_pred, color='green', label='Polynomial Prediction')
ax.set_xlabel(features[0])
ax.set_ylabel('Exam Score')
ax.set_title('Actual vs Predicted Scores')
ax.legend()
fig.savefig('Polynomial_predictions.png')
plt.show()

In [492]:
# Baseline model that uses Hours_Studied as its only predictor.
# The train/test rows are sliced from the existing X_train / X_test split instead of
# running a second train_test_split and discarding its labels. That keeps them
# aligned with y_train / y_test by construction, rather than relying on two
# separate calls sharing the same seed and row count.
X_StudyHours = data[['Hours_Studied']]
X_train_StudyHours = X_train[['Hours_Studied']]
X_test_StudyHours = X_test[['Hours_Studied']]
lr_StudyHours = LinearRegression()
lr_StudyHours.fit(X_train_StudyHours, y_train)
y_pred_StudyHours = lr_StudyHours.predict(X_test_StudyHours)


In [493]:
# Score the Hours_Studied-only model's test-set predictions and print MSE and R²
# to two decimals.
mse, r2 = (
    mean_squared_error(y_test, y_pred_StudyHours),
    r2_score(y_test, y_pred_StudyHours),
)
print(
    "Linear Regression Performance:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

Linear Regression Performance:
Mean Squared Error: 10.86
R² Score: 0.23
