Steps of Multiple Linear Regression Model Train assignment
- Loading Data
- Cleaning Data
- Preprocessing
- Model Training
- evaluation

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import gradio as gr

In [3]:
df=pd.read_csv("Student_Performance_Missing.csv")
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7.0,99.0,Yes,9.0,1.0,91.0
1,4.0,82.0,No,4.0,2.0,65.0
2,8.0,51.0,Yes,7.0,2.0,45.0
3,5.0,52.0,Yes,5.0,2.0,36.0
4,7.0,,No,8.0,5.0,66.0


In [4]:
df.isnull().sum()

Hours Studied                       1046
Previous Scores                      991
Extracurricular Activities           976
Sleep Hours                          954
Sample Question Papers Practiced    1035
Performance Index                    998
dtype: int64

In [5]:
num_cols = df.select_dtypes('number').columns
print(num_cols.tolist())

['Hours Studied', 'Previous Scores', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']


In [6]:
cat_cols = df.select_dtypes(exclude='number').columns
print(cat_cols.tolist())

['Extracurricular Activities']


In [7]:
num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')

In [8]:
numerical_features = ['Study Hours', 'Previous Scores', 'Sleep Hours']
categorical_features = ['Extracurricular Activities', 'Sample Paper Practice']

In [9]:
df[num_cols] = num_imp.fit_transform(df[num_cols])
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7.0,99.000000,Yes,9.000000,1.0,91.000000
1,4.0,82.000000,No,4.000000,2.0,65.000000
2,8.0,51.000000,Yes,7.000000,2.0,45.000000
3,5.0,52.000000,Yes,5.000000,2.0,36.000000
4,7.0,69.448995,No,8.000000,5.0,66.000000
...,...,...,...,...,...,...
9995,1.0,49.000000,Yes,4.000000,2.0,23.000000
9996,7.0,64.000000,Yes,8.000000,5.0,58.000000
9997,6.0,83.000000,,8.000000,5.0,55.213175
9998,9.0,97.000000,Yes,6.532169,0.0,95.000000


In [10]:
df[cat_cols] =  cat_imp.fit_transform(df[cat_cols])
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7.0,99.000000,Yes,9.000000,1.0,91.000000
1,4.0,82.000000,No,4.000000,2.0,65.000000
2,8.0,51.000000,Yes,7.000000,2.0,45.000000
3,5.0,52.000000,Yes,5.000000,2.0,36.000000
4,7.0,69.448995,No,8.000000,5.0,66.000000
...,...,...,...,...,...,...
9995,1.0,49.000000,Yes,4.000000,2.0,23.000000
9996,7.0,64.000000,Yes,8.000000,5.0,58.000000
9997,6.0,83.000000,No,8.000000,5.0,55.213175
9998,9.0,97.000000,Yes,6.532169,0.0,95.000000


In [11]:
x = df.drop(columns=['Performance Index'])
y = df['Performance Index']

In [12]:
print(x)

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0               7.0        99.000000                        Yes     9.000000   
1               4.0        82.000000                         No     4.000000   
2               8.0        51.000000                        Yes     7.000000   
3               5.0        52.000000                        Yes     5.000000   
4               7.0        69.448995                         No     8.000000   
...             ...              ...                        ...          ...   
9995            1.0        49.000000                        Yes     4.000000   
9996            7.0        64.000000                        Yes     8.000000   
9997            6.0        83.000000                         No     8.000000   
9998            9.0        97.000000                        Yes     6.532169   
9999            7.0        74.000000                         No     6.532169   

      Sample Question Papers Practiced 

In [13]:
print(y)

0       91.000000
1       65.000000
2       45.000000
3       36.000000
4       66.000000
          ...    
9995    23.000000
9996    58.000000
9997    55.213175
9998    95.000000
9999    64.000000
Name: Performance Index, Length: 10000, dtype: float64


In [14]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2, random_state=0)

In [15]:
cat_cols = xtrain.select_dtypes(exclude='number').columns
num_cols = xtrain.select_dtypes('number').columns

In [16]:
num_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', SimpleImputer())
    ]
)

In [17]:
cat_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [18]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ]
)

In [19]:
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('lr', LinearRegression)
    ]
)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [21]:
categorical_cols = xtrain.select_dtypes(include='object').columns
numerical_cols = xtrain.select_dtypes(include='number').columns

In [22]:
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

In [23]:
model = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', LinearRegression())
])

In [24]:
model.fit(xtrain, ytrain)

0,1,2
,steps,"[('preprocessing', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [25]:
from joblib import dump, load
print("Model Dumped Successfully")

Model Dumped Successfully


In [26]:
import gradio as gr
import pandas as pd

def prediction(Hourse_Studied, Previous_Scores, Extracurricular_Activities, Simple_Question_Paper_Practiced, Performances_Index):
    df = pd.DataFrame({
        'Hourse_Studied': [Hourse_Studied],
        'Previous_Scores': [Previous_Scores],
        'Extracurricular_Activities': [Extracurricular_Activities],
        'Simple_Question_Paper_Practiced': [Simple_Question_Paper_Practiced],
        'Performances_Index': [Performances_Index]
    })
    Performances = model.predict(df)
    return Performances[0]

# Use a built-in theme (e.g., "default", "soft", "compact", "glass")
ui = gr.Interface(
    fn=prediction,
    inputs=['number', 'number', 'text', 'number', 'number'],
    outputs=gr.Text(),
    title='Student Performance',
    examples=[[7.0, 99, 'Yes', 9, 1.0]],
    theme=gr.themes.Glass()  # or gr.themes.Glass(), etc.
)

ui.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


