In [3]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

In [4]:
df = pd.read_csv(filepath_or_buffer='data/FuelConsumption.csv')
df.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [5]:
msk = df.dtypes == np.object_
msk

MODELYEAR                   False
MAKE                         True
MODEL                        True
VEHICLECLASS                 True
ENGINESIZE                  False
CYLINDERS                   False
TRANSMISSION                 True
FUELTYPE                     True
FUELCONSUMPTION_CITY        False
FUELCONSUMPTION_HWY         False
FUELCONSUMPTION_COMB        False
FUELCONSUMPTION_COMB_MPG    False
CO2EMISSIONS                False
dtype: bool

In [6]:
categorical_columns = df.columns[msk]
categorical_columns

Index(['MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION', 'FUELTYPE'], dtype='object')

In [7]:
df.drop(
    columns=['MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION', 'FUELTYPE'],
    index=1,
    inplace=True
)

In [8]:
df.head()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,2.0,4,9.9,6.7,8.5,33,196
2,2014,1.5,4,6.0,5.8,5.9,48,136
3,2014,3.5,6,12.7,9.1,11.1,25,255
4,2014,3.5,6,12.1,8.7,10.6,27,244
5,2014,3.5,6,11.9,7.7,10.0,28,230


In [9]:
cdf = df[['FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_COMB', 'ENGINESIZE', 'CYLINDERS', 'CO2EMISSIONS']]
cdf.head()

Unnamed: 0,FUELCONSUMPTION_CITY,FUELCONSUMPTION_COMB,ENGINESIZE,CYLINDERS,CO2EMISSIONS
0,9.9,8.5,2.0,4,196
2,6.0,5.9,1.5,4,136
3,12.7,11.1,3.5,6,255
4,12.1,10.6,3.5,6,244
5,11.9,10.0,3.5,6,230


In [17]:
from scipy.stats import skew
from scipy.stats import boxcox

df_box = cdf.copy()

for col in cdf.columns:
    df_box[col], _ = boxcox(cdf[col])


for col in df_box.columns:
    print(f'{col} çarpıklık (skewness): {skew(df_box[col]):.2f}\n')

FUELCONSUMPTION_CITY çarpıklık (skewness): -0.00

FUELCONSUMPTION_COMB çarpıklık (skewness): 0.00

ENGINESIZE çarpıklık (skewness): -0.02

CYLINDERS çarpıklık (skewness): 0.06

CO2EMISSIONS çarpıklık (skewness): -0.00



In [19]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_box, train_size=0.8, random_state = 42)
print(f'Train Set: {train_df.shape}\nTest Set: {test_df.shape}')

Train Set: (852, 5)
Test Set: (214, 5)


In [20]:
from sklearn import linear_model

regression = linear_model.LinearRegression()

train_x = np.asanyarray(train_df[['FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_COMB', 'ENGINESIZE']])
train_y = np.asanyarray(train_df[['CO2EMISSIONS']])

regression.fit(train_x, train_y)

coefficient = regression.coef_[0][0]
intercept = regression.intercept_[0]

print(f'Coefficient: {coefficient:.2f}\n')
print(f'Intercept: {intercept:.2f}\n')

Coefficient: -0.09

Intercept: 3.17

