In [1]:
# Import Libraries

import numpy as np  
import pandas as pd         
import seaborn as sns   
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [6]:
class evalModel:
    
    def __init__(self, X, y, random_state=42, model=None, scaler = None):
        self.X = X
        self.y = y
        self.random_state = random_state
        self.model = model if model is not None else LinearRegression()
        self.scaler = scaler if scaler is not None else StandardScaler()
        
    def split_data(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=0.2, 
                                                                                random_state=self.random_state
                                                                                    )
        return self
    
    def transform_data(self):
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
        return self
        
    def fit_model(self):
        self.linear = self.model.fit(self.X_train, self.y_train)
        return self
        
    def prediction(self):
        self.predict = self.linear.predict(self.y_test)
        return self
        
    def model_metrics(self):
        self.mae = mean_absolute_error(self.y_test, self.predict)
        self.mse = mean_squared_error(self.y_test, self.predict)
        self.r2 = r2_score(self.y_test, self.predict)
        return {"mae": round(self.mae, 3), "mse": round(self.mse, 3), "r2": round(self.r2, 3)}

    def run(self):
        self.split_data().transform_data().fit_model().prediction()
        return self.model_metrics()    
        

In [3]:
df = pd.read_csv("StudentPerformance.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [4]:
df.columns = df.columns.str.replace(" ", "_").str.lower().str.strip()
df.columns

Index(['hours_studied', 'previous_scores', 'extracurricular_activities',
       'sleep_hours', 'sample_question_papers_practiced', 'performance_index'],
      dtype='object')

In [7]:
# X = df.drop(columns=['extracurricular_activities','hours_studied']).values
X = df[['hours_studied']].values

y = df['previous_scores'].values.reshape(-1, 1)

print(X.shape)
print(y.shape)

evaluator = evalModel(X, y).run()
print(evaluator)


(10000, 1)
(10000, 1)
{'mae': 19.939, 'mse': 582.951, 'r2': -0.94}
