In [1]:
import pandas as pd

# Read the cleaned dataset from disk and preview its first rows.
csv_path = '../data/cleaned_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
0,Mumbai,2010,72000,CNG,Manual,First,11.438,998.0,58.16,5.0,1.75,Maruti,Wagon,10
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,Hyundai,Creta,5
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,Honda,Jazz,9
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,Maruti,Ertiga,8
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,Audi,A4,7


# Data Splitting

* Categorical columns → encoding
* Numeric columns → impute missing values, then scale

In [2]:
# Separate the prediction target from the predictors.
# 'Price' is the target; 'Year' is also left out of the features
# (presumably because 'Age' already carries that information — confirm).

y = df['Price']
X = df.drop(['Price', 'Year'], axis=1)

In [3]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Check the size of the training split as (rows, columns).
X_train.shape

(4495, 12)

In [5]:
# Group the feature columns by the preprocessing each group receives.

# Numeric features: impute missing values, then scale.
num_cols = [
    'Kilometers_Driven',
    'Mileage',
    'Engine',
    'Power',
    'Age',
    'Seats',
]

# Nominal (unordered) categorical features: binary encoding.
nom_cat_cols = [
    'Location',
    'Fuel_Type',
    'Transmission',
    'Brand',
    'Model',
]

# Ordinal (ordered) categorical features: ordinal encoding.
ord_cat_cols = ['Owner_Type']

# Data Preprocessing

In [6]:
# Dependency note: BinaryEncoder is imported from the third-party category_encoders package.
# If that package is not installed, uncomment and run the line below once:
# !pip install category-encoders

In [7]:
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric features: fill missing values with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal categorical features: binary-encode each column.
nominal_categorical_transformer = Pipeline([('binary', BinaryEncoder())])

# Ordinal categorical feature: the category list fixes the encoding order,
# so 'Fourth & Above' is encoded as 0 and 'First' as 3.
owner_type_levels = ['Fourth & Above', 'Third', 'Second', 'First']
ordinal_categorical_transformer = Pipeline([
    ('ordinal', OrdinalEncoder(categories=[owner_type_levels])),
])

# Route each column group to its transformer; any column not listed is passed through unchanged.
column_routes = [
    ('num_prep', numerical_transformer, num_cols),
    ('nom_prep', nominal_categorical_transformer, nom_cat_cols),
    ('ord_prep', ordinal_categorical_transformer, ord_cat_cols),
]
preprocessor = ColumnTransformer(column_routes, remainder='passthrough')

# Fit on the training split only, then apply the fitted transformers to the test split.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [8]:
# Inspect the transformed training matrix returned by preprocessor.fit_transform.
X_train_prep

array([[-1.08423424, -0.4720888 , -0.72760357, ...,  0.        ,
         1.        ,  3.        ],
       [ 0.4521476 ,  0.02363623,  2.62788967, ...,  1.        ,
         0.        ,  3.        ],
       [-1.14876227,  0.14573599, -0.72572795, ...,  1.        ,
         1.        ,  3.        ],
       ...,
       [ 2.21735226, -1.42935092,  2.61851154, ...,  0.        ,
         1.        ,  2.        ],
       [ 0.74634838,  0.61459907, -1.10272859, ...,  0.        ,
         0.        ,  2.        ],
       [-0.72465551, -0.14730344, -0.72760357, ...,  1.        ,
         1.        ,  3.        ]])

In [9]:
# Medians learned by the fitted imputer, one per numeric feature.
fitted_num_pipeline = preprocessor.named_transformers_['num_prep']
fitted_imputer = fitted_num_pipeline.named_steps['imputer']
fitted_imputer.statistics_

array([5.200e+04, 1.850e+01, 1.493e+03, 9.370e+01, 6.000e+00, 5.000e+00])

In [10]:
# Compute each numeric column's median directly from the training split,
# to compare against the statistics stored by the imputer.
for feature in num_cols:
    train_median = X_train[feature].median()
    print(f'Median of {feature}: {train_median}')

Median of Kilometers_Driven: 52000.0
Median of Mileage: 18.5
Median of Engine: 1493.0
Median of Power: 93.7
Median of Age: 6.0
Median of Seats: 5.0


In [11]:
# List the output column names produced by the fitted preprocessor
# (each name is prefixed with the name of the transformer that produced it).
preprocessor.get_feature_names_out()

array(['num_prep__Kilometers_Driven', 'num_prep__Mileage',
       'num_prep__Engine', 'num_prep__Power', 'num_prep__Age',
       'num_prep__Seats', 'nom_prep__Location_0', 'nom_prep__Location_1',
       'nom_prep__Location_2', 'nom_prep__Location_3',
       'nom_prep__Fuel_Type_0', 'nom_prep__Fuel_Type_1',
       'nom_prep__Fuel_Type_2', 'nom_prep__Transmission_0',
       'nom_prep__Transmission_1', 'nom_prep__Brand_0',
       'nom_prep__Brand_1', 'nom_prep__Brand_2', 'nom_prep__Brand_3',
       'nom_prep__Brand_4', 'nom_prep__Model_0', 'nom_prep__Model_1',
       'nom_prep__Model_2', 'nom_prep__Model_3', 'nom_prep__Model_4',
       'nom_prep__Model_5', 'nom_prep__Model_6', 'nom_prep__Model_7',
       'ord_prep__Owner_Type'], dtype=object)

In [12]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least-squares linear regression.
lr = LinearRegression()
# Learn the weights from the preprocessed training features; the call is left as the
# last expression so the fitted estimator is displayed.
lr.fit(X=X_train_prep, y=y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [13]:
import plotly.express as px

# Pair every output feature name with its fitted coefficient, sorted in ascending order.
feature_names = preprocessor.get_feature_names_out()
feature_importance = pd.Series(lr.coef_, index=feature_names).sort_values()

# Horizontal bar chart of the signed coefficients.
px.bar(
    feature_importance,
    width=800,
    height=900,
    orientation='h',
    title='Feature Importance',
)

In [14]:
# Compare the shapes of the transformed training and test matrices.
X_train_prep.shape, X_test_prep.shape

((4495, 29), (1124, 29))

In [15]:
import pickle

# Persist both versions of the split as pickled 4-tuples: first the unprocessed
# data, then the preprocessed data. Targets are the same in both.
splits_to_save = {
    '../data/unprocessed_data.pkl': (X_train, y_train, X_test, y_test),
    '../data/preprocessed_data.pkl': (X_train_prep, y_train, X_test_prep, y_test),
}

for pkl_path, payload in splits_to_save.items():
    with open(pkl_path, 'wb') as handle:
        pickle.dump(payload, handle)

