In [110]:
import pandas as pd

# Load the student-performance dataset.
# NOTE(review): the absolute '/content/...' path only resolves where that directory
# exists — adjust it when running in another environment.
df = pd.read_csv('/content/StudentsPerformance.csv')
# Preview the first rows to sanity-check the parse.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [35]:
# List the column labels of the loaded frame.
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [9]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 8)

In [3]:
# Frequency of each gender value.
df.gender.value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [4]:
# Frequency of each race/ethnicity group.
df['race/ethnicity'].value_counts()

race/ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

In [5]:
# Frequency of each parental education level; these labels are the categories
# the ordinal encoder further down has to enumerate.
df['parental level of education'].value_counts()

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [6]:
# Frequency of each lunch type.
df.lunch.value_counts()

lunch
standard        645
free/reduced    355
Name: count, dtype: int64

In [7]:
# Frequency of each test-preparation status.
df['test preparation course'].value_counts()

test preparation course
none         642
completed    358
Name: count, dtype: int64

In [111]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.pipeline import Pipeline

In [112]:
class LabelEncodertr:
  """Label-encode every column of a DataFrame with its own ``LabelEncoder``.

  Implements the small estimator protocol (``fit`` / ``transform`` /
  ``fit_transform`` / ``get_params`` / ``set_params``) so an instance can be
  used as a ``ColumnTransformer`` step. One encoder is fitted per column and
  kept in ``label_encoders`` keyed by column name.
  """

  def __init__(self):
    # column name -> LabelEncoder fitted on that column
    self.label_encoders = {}

  def fit(self,x,y=None):
    """Fit one ``LabelEncoder`` per column of ``x``.

    Args:
      x: DataFrame whose columns are to be encoded.
      y: Ignored; accepted for estimator-API compatibility.

    Returns:
      self, to allow chaining.
    """
    # Build a fresh mapping so a refit never keeps encoders from an earlier fit.
    fitted = {}
    for column in x.columns:
      le = LabelEncoder()
      le.fit(x[column])
      fitted[column] = le
    self.label_encoders = fitted
    return self

  def transform(self,x):
    """Return a copy of ``x`` with every column replaced by its encoded values.

    Args:
      x: DataFrame containing only columns that were present at fit time.

    Returns:
      A new DataFrame; ``x`` itself is not modified.

    Raises:
      KeyError: if ``x`` has a column for which no encoder was fitted
        (including the case where ``fit`` was never called).
    """
    unknown = [column for column in x.columns if column not in self.label_encoders]
    if unknown:
      raise KeyError(
        f"no fitted encoder for column(s) {unknown}; call fit() on these columns first"
      )
    x_transformed = x.copy()
    for column in x.columns:
      x_transformed[column] = self.label_encoders[column].transform(x[column])
    return x_transformed

  def fit_transform(self,x,y=None):
    """Fit on ``x`` and return its encoded copy in one call."""
    return self.fit(x,y).transform(x)

  def get_params(self, deep=True):
    """Return the constructor parameters; this transformer has none."""
    return {}

  def set_params(self, **params):
    """Set each keyword as an attribute on the instance and return self."""
    for key, value in params.items():
      setattr(self, key, value)
    return self

In [113]:
# Label-encode the two nominal columns; remainder='passthrough' keeps every other
# column untouched. In the output the encoded columns come first, then the rest.
trf = ColumnTransformer(
    transformers=[
        ('trf1', LabelEncodertr(), ['gender']),
        ('trf2', LabelEncodertr(), ['race/ethnicity']),
    ],
    remainder='passthrough',
)

In [114]:
# Fit the per-column label encoders on the full dataset.
trf.fit(df)

In [115]:
# Apply the fitted encoders; the result is wrapped in a DataFrame in the next cell.
df_transformed = trf.transform(df)

In [116]:
# Wrap the transformer output in a DataFrame; columns get positional integer labels
# (0..7) and are given descriptive names in a later cell.
df_transformed = pd.DataFrame(df_transformed)

In [117]:
# Inspect the wrapped result; column labels are still positional integers here.
df_transformed

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,1,bachelor's degree,standard,none,72,72,74
1,0,2,some college,standard,completed,69,90,88
2,0,1,master's degree,standard,none,90,95,93
3,1,0,associate's degree,free/reduced,none,47,57,44
4,1,2,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,0,4,master's degree,standard,completed,88,99,95
996,1,2,high school,free/reduced,none,62,55,55
997,0,2,high school,free/reduced,completed,59,71,65
998,0,3,some college,standard,completed,68,78,77


In [118]:
# The transformer output lists the two encoded columns first (gender, race/ethnicity)
# and then the passthrough columns in their original order, so positions 0-7 map to
# the names below. Rebinding instead of inplace=True keeps the cell safe to re-run:
# once the labels are strings the mapping simply matches nothing.
df_transformed = df_transformed.rename(columns={
    0: 'gender_encoded',
    1: 'race_encoded',
    2: 'parental level of education',
    3: 'lunch',
    4: 'test preparation course',
    5: 'math score',
    6: 'reading score',
    7: 'writing score',
})

In [119]:
# Confirm the descriptive column names are in place.
df_transformed

Unnamed: 0,gender_encoded,race_encoded,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,bachelor's degree,standard,none,72,72,74
1,0,2,some college,standard,completed,69,90,88
2,0,1,master's degree,standard,none,90,95,93
3,1,0,associate's degree,free/reduced,none,47,57,44
4,1,2,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,0,4,master's degree,standard,completed,88,99,95
996,1,2,high school,free/reduced,none,62,55,55
997,0,2,high school,free/reduced,completed,59,71,65
998,0,3,some college,standard,completed,68,78,77


In [120]:
# Keep the three score columns together; they are combined into the target below.
df_scores = df_transformed[['math score','reading score','writing score']]

In [121]:
# Preview the score columns.
df_scores

Unnamed: 0,math score,reading score,writing score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75
...,...,...,...
995,88,99,95
996,62,55,55
997,59,71,65
998,68,78,77


In [122]:
# Regression target. Despite the name, 'total' holds the arithmetic mean of the
# three scores (sum divided by 3), not their sum.
df_transformed['total'] = (df_scores['math score'] + df_scores['reading score'] + df_scores['writing score']) / 3

In [123]:
# Drop the individual scores, leaving 'total' as the only score-derived column.
df_transformed = df_transformed.drop(columns=['math score','reading score','writing score'])

In [131]:
# Cast the mean score to int.
# NOTE(review): this truncates the fractional part instead of rounding
# (e.g. 82.33 -> 82, 72.67 -> 72) — confirm that is the intended target.
df_transformed['total'] = df_transformed['total'].astype(int)


In [132]:
# Check the integer target column.
df_transformed['total']

0      72
1      82
2      92
3      49
4      76
       ..
995    94
996    57
997    65
998    74
999    83
Name: total, Length: 1000, dtype: int64

In [133]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every result derived from it, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df_transformed.drop(columns=['total']),
    df_transformed['total'],
    test_size=0.2,
    random_state=42,
)

In [135]:
# Inspect the training target; the index order reflects the shuffled split.
y_train

375    43
959    75
34     88
989    78
519    74
       ..
171    86
230    68
505    81
502    59
52     46
Name: total, Length: 800, dtype: int64

In [136]:
# Ordinal-encode the three remaining categorical features. Each inner list gives one
# feature's categories in the order they are numbered, and the lists line up
# positionally with the columns handed to the encoder.
# NOTE(review): 'some college' is placed between "bachelor's degree" and
# "associate's degree" — confirm that ordering is intended.
pipe2 = Pipeline(steps=[
    ('order', OrdinalEncoder(categories=[
        ["master's degree", "bachelor's degree", 'some college',
         "associate's degree", 'high school', 'some high school'],
        ['standard', 'free/reduced'],
        ['none', 'completed'],
    ])),
])

In [137]:
# Columns routed to the ordinal encoder. The order here must match the order of the
# category lists given to OrdinalEncoder (education, lunch, test preparation).
ordinal_features = ['parental level of education', 'lunch','test preparation course']

In [138]:
# Ordinal-encode the three ordered categoricals and pass the remaining feature columns
# through unchanged. ColumnTransformer defaults to remainder='drop', which would
# discard every column not listed here — including the already-encoded
# gender_encoded and race_encoded — before the regressor ever sees them.
trfx = ColumnTransformer(
    transformers=[
        ('pipe', pipe2, ordinal_features),
    ],
    remainder='passthrough',
)

In [139]:
from sklearn.linear_model import LinearRegression

In [140]:
# End-to-end model: feature encoding followed by ordinary least-squares regression.
# (Named `clf`, but the final step is a regressor, not a classifier.)
clf = Pipeline(steps=[
    ('trfx',trfx),
    ('regressor',LinearRegression())
])

In [141]:
# Fit the encoding steps and the regressor on the training split only.
clf.fit(x_train,y_train)