In [36]:
import os 
import sys 
from dataclasses import dataclass
import numpy as np 
import pandas as pd 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

@dataclass
class DataTransformationConfig:
    """Configuration values used by the data-transformation step."""

    # File path built from 'artifacts' / 'preprocessor.pkl' (presumably where
    # the preprocessor object gets saved -- confirm against the code that
    # writes it). The `str` annotation is required: without an annotation the
    # dataclass machinery treats this as a plain class attribute, so it would
    # not appear in __init__/__repr__/__eq__ and could not be overridden via
    # DataTransformationConfig(preprocessor_path=...).
    preprocessor_path: str = os.path.join('artifacts', 'preprocessor.pkl')

class DataTransformation:
    """Builds the preprocessing pipeline applied to the input features."""

    def __init__(self):
        self.transformation_config = DataTransformationConfig()

    def get_data_transformer(self):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Numeric columns are median-imputed and scaled without centering.
        Categorical columns are imputed with the most frequent value,
        one-hot encoded and scaled without centering.

        Returns:
            ColumnTransformer: the combined numeric + categorical preprocessor.

        Raises:
            RuntimeError: if the pipeline cannot be constructed. The original
                error is chained as ``__cause__``.
        """
        try:
            numeric_features = ['writing_score', 'reading_score']
            categorical_features = [
                'gender',
                'race_ethnicity',
                'parental_level_of_education',
                'lunch',
                'test_preparation_course',
            ]

            num_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler(with_mean=False)),
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    # handle_unknown='ignore': a category that appears only at
                    # transform time is encoded as all zeros instead of
                    # raising, so unseen test categories do not crash.
                    ('onehot', OneHotEncoder(handle_unknown='ignore')),
                    # with_mean=False keeps the scaler usable on the sparse
                    # one-hot output (centering would require densifying).
                    ('scaler', StandardScaler(with_mean=False)),
                ]
            )

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num_vars', num_pipeline, numeric_features),
                    ('cat_vars', cat_pipeline, categorical_features),
                ]
            )

            return preprocessor

        except Exception as e:
            # Previously the error was swallowed with `pass`, which made this
            # method silently return None and moved the failure to the first
            # use of the result. Surface it here instead.
            # TODO: switch to the project's CustomException(e, sys) once it is
            # importable from this notebook.
            raise RuntimeError(
                'Failed to build the data transformation pipeline'
            ) from e



In [37]:
# Directory containing train.csv / test.csv. Set the ARTIFACTS_DIR environment
# variable to run this notebook on another machine; the default is the
# location this notebook was originally written against.
ARTIFACTS_DIR = os.environ.get(
    'ARTIFACTS_DIR',
    '/Users/agnatkalra/Desktop/Coding/Python/CompleteEndToEndMLProject/artifacts',
)

train_df = pd.read_csv(os.path.join(ARTIFACTS_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(ARTIFACTS_DIR, 'test.csv'))

# Unfitted preprocessing pipeline; it is fitted on the training features in a
# later cell.
data_transformation = DataTransformation()
preprocessors = data_transformation.get_data_transformer()


In [38]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,master's degree,standard,none,62,70,75
1,female,group C,bachelor's degree,free/reduced,completed,66,83,83
2,female,group D,some college,free/reduced,none,79,89,86
3,male,group C,master's degree,free/reduced,none,61,67,66
4,male,group E,high school,standard,none,73,64,57


In [39]:

# Column that is separated out as the target; the rest are input features.
target = 'math_score'
numerical_columns = ["writing_score", "reading_score"]

# Pull the target out first, then drop it to obtain the input features.
target_feature_train = train_df[target]
input_features_train = train_df.drop(columns=target)
print(input_features_train.columns)




Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'reading_score', 'writing_score'],
      dtype='object')


In [40]:

# Same split for the test frame: everything except `target` is an input.
input_features_test = test_df.drop(columns=target)
print(input_features_test.columns)

# Target values of the test frame, printed for inspection.
target_feature_test = test_df[target]
print(target_feature_test)




Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'reading_score', 'writing_score'],
      dtype='object')
0      91
1      53
2      80
3      74
4      84
       ..
195    52
196    62
197    74
198    65
199    61
Name: math_score, Length: 200, dtype: int64


In [41]:
# Fit the preprocessor on the training features only, then reuse the fitted
# preprocessor on the test features so no test-set statistics are learned.
# Order matters: transform() requires the fit performed by fit_transform().
input_scaled_train = preprocessors.fit_transform(input_features_train)
input_scaled_test = preprocessors.transform(input_features_test)




In [None]:
def _as_dense(features):
    """Return `features` as a dense ndarray.

    The preprocessor can hand back a SciPy sparse matrix (it contains a
    one-hot encoder), and np.c_ cannot column-stack a sparse matrix with a
    dense target vector, so densify first when `.toarray()` is available.
    """
    return features.toarray() if hasattr(features, 'toarray') else np.asarray(features)


# Append the target as the last column of each transformed feature matrix.
train_arr = np.c_[_as_dense(input_scaled_train), np.array(target_feature_train)]
test_arr = np.c_[_as_dense(input_scaled_test), np.array(target_feature_test)]