# Student Performance

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from pathlib import Path

import kagglehub

# Download the latest version of the dataset; the call returns the local
# directory that holds the dataset files.
path = kagglehub.dataset_download("nikhil7280/student-performance-multiple-linear-regression")

print("Path to dataset files:", path)

# Join the file name with pathlib instead of string concatenation so the path
# separator is handled by the standard library rather than hard-coded.
df = pd.read_csv(Path(path) / 'Student_Performance.csv')

Path to dataset files: /root/.cache/kagglehub/datasets/nikhil7280/student-performance-multiple-linear-regression/versions/1


In [None]:
# Overview of the loaded frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [None]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
Hours Studied,0
Previous Scores,0
Extracurricular Activities,0
Sleep Hours,0
Sample Question Papers Practiced,0
Performance Index,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Split the frame into predictors and the regression target.
X = df.drop(columns=['Performance Index'])
y = df['Performance Index']

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups are derived from the training split by dtype.
num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns


# Numeric features: median imputation followed by standardisation.
num_pipe = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# Categorical features: most-frequent imputation followed by one-hot encoding.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising at transform time.
cat_pipe = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# sparse_threshold=0 makes the transformer always return a dense array, so the
# result can be wrapped in a DataFrame regardless of how sparse the one-hot
# columns turn out to be.
ct = ColumnTransformer(
    [
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ],
    sparse_threshold=0
)

# Fit the preprocessing on the training split only, then reuse it for the test split.
X_train_array = ct.fit_transform(X_train)
feature_names = ct.get_feature_names_out()
X_train_transformed = pd.DataFrame(X_train_array, columns=feature_names)
X_test_transformed = pd.DataFrame(ct.transform(X_test), columns=feature_names)


lin_reg = LinearRegression()
lin_reg.fit(X_train_transformed, y_train)
print(f'lin_reg score on test data: {lin_reg.score(X_test_transformed, y_test)}', '\n')

# Cross-validate preprocessing and model together on the *raw* training data.
# cross_val_score clones the estimator for every fold, so the imputers, scaler
# and encoder are re-fitted on each fold's training part only and the already
# fitted `ct` / `lin_reg` above are left untouched.
cv_model = Pipeline(
    [
        ('preprocess', ct),
        ('lin_reg', LinearRegression())
    ]
)
# Run the 10-fold cross-validation once and reuse the scores for both reports.
cv_scores = cross_val_score(cv_model, X_train, y_train, cv=10, scoring='r2')
print('10 fold cross validation of training data: ', cv_scores, '\n')
print('mean of 10 fold cross validation score on training data: ', cv_scores.mean())


lin_reg score on test data: 0.9889832909573145 

10 fold cross validation of training data:  [0.98891085 0.98853575 0.98755674 0.98919292 0.98910427 0.98870485
 0.98828862 0.98827805 0.98817951 0.98980904] 

mean of 10 fold cross validation score on training data:  0.9886560588002133
