In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Load the raw dataset from a CSV file given by a relative path.
raw_csv_path = "data/raw.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,MyUnknownColumn,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,0,ritz,2014,335000.0,559000.0,27000,Petrol,Dealer,Manual,0
1,1,sx4,2013,475000.0,954000.0,43000,Diesel,Dealer,Manual,0
2,2,ciaz,2017,725000.0,985000.0,6900,Petrol,Dealer,Manual,0
3,3,wagon r,2011,285000.0,415000.0,5200,Petrol,Dealer,Manual,0
4,4,swift,2014,460000.0,687000.0,42450,Diesel,Dealer,Manual,0


In [2]:
class DropColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured set of columns from its input.

    Parameters
    ----------
    columns_to_drop : label or list of labels
        Forwarded unchanged to ``X.drop(columns=...)`` when transforming.
    """

    def __init__(self, columns_to_drop):
        # Kept exactly as given: no copying and no validation happens here.
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y``; return this instance unchanged."""
        return self

    def transform(self, X):
        """Return the result of dropping the configured columns from ``X``.

        ``X`` must expose a pandas-style ``drop(columns=...)`` method.
        """
        unwanted = self.columns_to_drop
        return X.drop(columns=unwanted)

In [3]:
# Columns handed to the DropColumns step of the pipeline.
drop_columns = [
    "MyUnknownColumn",
    "Car_Name",
]

# Columns routed through the numerical branch of the preprocessor.
numerical_columns = [
    "Year",
    "Selling_Price",
    "Kms_Driven",
    "Owner",
]

# Columns routed through the categorical branch of the preprocessor.
categorical_columns = [
    "Fuel_Type",
    "Seller_Type",
    "Transmission",
]

In [9]:
# Numerical branch: median imputation followed by standard scaling.
num_col_transformation = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, ordinal encoding of the
# categories, then standard scaling of the resulting codes.
cat_column_transformation = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scale', StandardScaler()),
])

# Send each column group through its own branch. Columns named in neither
# list are passed through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    [
        ('numerical_column_transformation', num_col_transformation, numerical_columns),
        ('categorical_column_transformation', cat_column_transformation, categorical_columns),
    ],
    remainder='passthrough',
)

# Full chain: drop unused columns -> per-type preprocessing -> PCA down to
# 5 components.
pipe = Pipeline([
    ('drop', DropColumns(drop_columns)),
    ('preprocessing', preprocessor),
    ('pca', PCA(n_components=5)),
])

In [10]:
# Evaluate the assembled pipeline as the cell's last expression so the
# notebook shows its representation.
pipe

In [11]:
# Target: the 'Present_Price' column on its own.
y = df.loc[:, 'Present_Price']

# Features: every other column of the frame (the target is removed).
x = df.drop(columns='Present_Price')

In [12]:
# Fit the whole pipeline on the feature frame and transform it in one pass.
components = pipe.fit_transform(x)

# Build the column labels from the actual width of the output instead of a
# hard-coded 1..5, so changing the pipeline's number of output components
# cannot break (or silently mislabel) this cell.
component_names = [f"Column {i}" for i in range(1, components.shape[1] + 1)]

# Reuse x's index so every transformed row stays aligned with its row in `y`.
new_data = pd.DataFrame(components, columns=component_names, index=x.index)

# Preview the first rows as the cell output.
new_data.head()

Unnamed: 0,Column 1,Column 2,Column 3,Column 4,Column 5
0,-0.120664,-0.386014,0.455359,0.068709,0.824582
1,1.191925,0.310509,1.0222,0.659255,-0.790969
2,0.457538,-1.451806,0.031404,0.270743,0.788115
3,-0.390248,-0.073456,0.598647,0.108142,1.038609
4,1.224334,0.075988,0.951738,0.682637,-0.842195
