# Modelling of $CO_2$ Emissions

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
#import dataset
df= pd.read_csv("CO2 Emissions_Canada.csv")

In [3]:
df.head(5)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle Class                     7385 non-null   object 
 3   Engine Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel Type                         7385 non-null   object 
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64  
 11  CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: flo

In [5]:
df.isnull().sum()

Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
dtype: int64

In [6]:
df.corr(numeric_only=True)

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
Engine Size(L),1.0,0.927653,0.831379,0.761526,0.81706,-0.757854,0.851145
Cylinders,0.927653,1.0,0.800702,0.715252,0.780534,-0.719321,0.832644
Fuel Consumption City (L/100 km),0.831379,0.800702,1.0,0.94818,0.99381,-0.927059,0.919592
Fuel Consumption Hwy (L/100 km),0.761526,0.715252,0.94818,1.0,0.977299,-0.890638,0.883536
Fuel Consumption Comb (L/100 km),0.81706,0.780534,0.99381,0.977299,1.0,-0.925576,0.918052
Fuel Consumption Comb (mpg),-0.757854,-0.719321,-0.927059,-0.890638,-0.925576,1.0,-0.907426
CO2 Emissions(g/km),0.851145,0.832644,0.919592,0.883536,0.918052,-0.907426,1.0


In [7]:
df.select_dtypes('object').nunique()

Make               42
Model            2053
Vehicle Class      16
Transmission       27
Fuel Type           5
dtype: int64

In [8]:
df['Vehicle Class']

0              COMPACT
1              COMPACT
2              COMPACT
3          SUV - SMALL
4          SUV - SMALL
             ...      
7380       SUV - SMALL
7381       SUV - SMALL
7382       SUV - SMALL
7383    SUV - STANDARD
7384    SUV - STANDARD
Name: Vehicle Class, Length: 7385, dtype: object

### Preprocessing

In [9]:
# remove duplicate columns
df = df.drop_duplicates()

### Simple Linear Regression

In [101]:
X = df[['Engine Size(L)']]
y = df['CO2 Emissions(g/km)']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [102]:
lr = LinearRegression()
lr.fit(X_train,y_train)

In [103]:
print(f'Root Mean Squared Error on training set is:{np.sqrt(mean_squared_error(y_train,lr.predict(X_train)))}')
print(f'Root Mean Squared Error on test set is:{np.sqrt(mean_squared_error(y_test,lr.predict(X_test)))}')

Root Mean Squared Error on training set is:30.480117495837952
Root Mean Squared Error on test set is:31.886595514802206


In [104]:
print(f'R2 score is {r2_score(lr.predict(X_test),y_test)}')

R2 score is 0.574734445339548


In [105]:
print(f'Co-efficients are {lr.coef_}')
print(f'Intercept is {lr.intercept_}')

Co-efficients are [37.07950694]
Intercept is 133.95452773830056


### Multiple linear Regression

In [57]:
#categorical Variables
cat_var = [ 'Vehicle Class', 'Transmission', 'Fuel Type','Cylinders']

num_var = ['Engine Size(L)', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)']
target = 'CO2 Emissions(g/km)'

Since we have categorical variables in our dataset, we will deal with them using **One Hot Encoding**, we will use column transformer to deal with numerical variables and categorical variables separately

In [58]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [76]:
X = df[num_var+cat_var]
y = df[target]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [77]:
cat_pipeline = Pipeline(steps=[
    ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [78]:
from sklearn.compose import ColumnTransformer
col_trans = ColumnTransformer(transformers=[
    ('cat_pipeline',cat_pipeline,cat_var)
    ],
    remainder='passthrough',
    n_jobs=-1)

In [79]:
lr = LinearRegression()
lr_pipeline = Pipeline(steps=[
    ('col_trans', col_trans),
    ('model', lr)
])

In [80]:
lr_pipeline.fit(X_train,y_train)

In [81]:
y_p = lr_pipeline.predict(X_test)

In [82]:
mean_squared_error(y_test,y_p)

29.34019928222245

In [83]:
r2_score(y_p,y_test)

0.9915117173982055

In [95]:
print(f'Roor Mean Squared Error on training set is:{np.sqrt(mean_squared_error(y_train,lr_pipeline.predict(X_train)))}')
print(f'Root Mean Squared Error on test set is:{np.sqrt(mean_squared_error(y_test,lr_pipeline.predict(X_test)))}')

Roor Mean Squared Error on training set is:5.258361338231659
Root Mean Squared Error on test set is:5.4166594209182515


In [85]:
print(f'R2 score is {r2_score(lr_pipeline.predict(X_test),y_test)}')

R2 score is 0.9915117173982055


https://www.kaggle.com/code/ishadss/regression-models-on-co2-emissions-and-eda#Simple-Linear-Regression

### Ridge Regression

In [86]:
from sklearn.linear_model import Ridge

In [87]:
rr = Ridge()
rr_pipeline = Pipeline(steps=[
    ('col_trans', col_trans),
    ('model', rr)
])

In [88]:
rr_pipeline.fit(X_train,y_train)

In [89]:
y_rp = rr_pipeline.predict(X_test)

In [90]:
mean_squared_error(y_test,y_rp)

29.434481222736277

In [91]:
r2_score(y_rp,y_test)

0.9914721086825573

Our previous model was not overfitting hence there is not much difference in results of Ridge and Linear Regression