In [33]:

import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_is_fitted

warnings.filterwarnings('ignore')

In [34]:
# Load the raw CO2 emissions dataset into a DataFrame.
RAW_CSV_PATH = '../data/raw/co2_emmission.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [35]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4.0,AS5,Premium Petrol,9.9,6.7,8.5,33.0,196
1,ACURA,ILX,COMPACT,2.4,4.0,M6,Premium Petrol,11.2,7.7,9.6,29.0,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4.0,AV7,Premium Petrol,6.0,5.8,5.9,48.0,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6.0,AS6,Premium Petrol,12.7,9.1,11.1,25.0,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,,AS6,Premium Petrol,12.1,8.7,10.6,27.0,244


In [36]:
# Fill missing values column by column: object (string) columns get their most
# frequent value, every other column gets its mean.
# The filled Series is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which pandas warns about and which leaves `df` untouched
# under Copy-on-Write.
# NOTE(review): the loop runs over every column of `df`, so the
# "CO2 Emissions(g/km)" column is mean-filled as well — confirm that is intended.
for col in df.columns:
    if df[col].dtype == 'object':
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; there is nothing
        # sensible to fill with, so leave the column as it is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    else:
        df[col] = df[col].fillna(df[col].mean())

In [37]:
# Count the remaining missing values per column (expected to be all zeros).
df.isnull().sum()

Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
dtype: int64

In [38]:
# Remove the mpg fuel-consumption column from the frame in place.
df.drop(columns=['Fuel Consumption Comb (mpg)'], inplace=True)

In [39]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7390 entries, 0 to 7389
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7390 non-null   object 
 1   Model                             7390 non-null   object 
 2   Vehicle Class                     7390 non-null   object 
 3   Engine Size(L)                    7390 non-null   float64
 4   Cylinders                         7390 non-null   float64
 5   Transmission                      7390 non-null   object 
 6   Fuel Type                         7390 non-null   object 
 7   Fuel Consumption City (L/100 km)  7390 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7390 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7390 non-null   float64
 10  CO2 Emissions(g/km)               7390 non-null   int64  
dtypes: float64(5), int64(1), object(5)
memory usage: 635.2+ KB


In [40]:
# Separate the prediction target from the feature columns.
target_col = "CO2 Emissions(g/km)"
y = df[target_col]
X = df.drop(columns=[target_col])


In [41]:
# Group the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

# Show both groups so the split can be checked by eye.
for label, cols in (("Categorical:", categorical_cols), ("Numerical:", numeric_cols)):
    print(label, cols)


Categorical: ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']
Numerical: ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)', 'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)']


In [42]:
# Numerical preprocessing: median imputation followed by standard scaling.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical preprocessing: most-frequent imputation followed by one-hot
# encoding; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

In [43]:
# Route each column group to its own preprocessing pipeline.
column_routes = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [44]:
# Wrap the preprocessor in a single-step pipeline.
# NOTE(review): the later visible cells fit and save `preprocessor` directly,
# not `pipeline` — confirm whether this wrapper is still needed.
pipeline = Pipeline([("preprocessor", preprocessor)])


In [46]:
# Fit the preprocessing steps (imputers, scaler, one-hot encoder) on the features.
# NOTE(review): this fits on all of X; train_test_split is imported at the top
# of the notebook but never called in the visible cells — confirm that fitting
# before any train/test split is intended.
preprocessor.fit(X)

In [45]:
# Persist the fitted preprocessor so it can be reloaded for later transforms.
# Guard against out-of-order execution: check_is_fitted raises NotFittedError
# if this cell runs before the fit cell, so an unfitted transformer is never
# written to disk.
check_is_fitted(preprocessor)
joblib.dump(preprocessor, '../outputs/models/preprocessor.pkl')

['../outputs/models/preprocessor.pkl']

In [48]:
# Transform the features with the fitted preprocessor and persist the results.
# numpy is already imported in the notebook's import cell, so it is not
# re-imported here.
X_processed = preprocessor.transform(X)
# NOTE(review): if the transformer returns a SciPy sparse matrix (likely with
# one-hot encoded columns — confirm), NumPy stores it as a pickled object
# array; readers then need np.load(..., allow_pickle=True)["arr_0"].item().
np.savez_compressed('../data/processed/X_processed.npz', X_processed)
joblib.dump(y, '../data/processed/y.pkl')

print("Preprocessing pipeline has been fitted and saved.")

Preprocessing pipeline has been fitted and saved.
