In [1]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# DATA PREPROCESSING

df = pd.read_csv("CO2Emissions.csv")
df # view data frame

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,9.4,30,219
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,9.9,29,232
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,10.3,27,240
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,9.9,29,232


In [3]:
# Handling missing values
pd.isnull(df).any() # checking which columns have nan values
# data does not have any nan values!!

Make                                False
Model                               False
Vehicle Class                       False
Engine Size(L)                      False
Cylinders                           False
Transmission                        False
Fuel Type                           False
Fuel Consumption City (L/100 km)    False
Fuel Consumption Hwy (L/100 km)     False
Fuel Consumption Comb (L/100 km)    False
Fuel Consumption Comb (mpg)         False
CO2 Emissions(g/km)                 False
dtype: bool

In [4]:
# extracting independent values (engine size, cylinders, transmission, fuel type, fuel consumption)
x = df.loc[:, ['Engine Size(L)', 'Cylinders', 'Transmission', 'Fuel Type', 'Fuel Consumption Comb (L/100 km)']]
x

Unnamed: 0,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption Comb (L/100 km)
0,2.0,4,AS5,Z,8.5
1,2.4,4,M6,Z,9.6
2,1.5,4,AV7,Z,5.9
3,3.5,6,AS6,Z,11.1
4,3.5,6,AS6,Z,10.6
...,...,...,...,...,...
7380,2.0,4,AS8,Z,9.4
7381,2.0,4,AS8,Z,9.9
7382,2.0,4,AS8,Z,10.3
7383,2.0,4,AS8,Z,9.9


In [5]:
# extracting the dependent value (CO2 Emissions)
y = df.loc[:, ['CO2 Emissions(g/km)']]
y

Unnamed: 0,CO2 Emissions(g/km)
0,196
1,221
2,136
3,255
4,244
...,...
7380,219
7381,232
7382,240
7383,232


In [8]:
# encoding categorical values (transmission & fueltype)
categorical_columns = ['Transmission', 'Fuel Type']

# creating column transformer to encode columns
transformed_data = ColumnTransformer(
    transformers = [
        ('cat', OneHotEncoder(), categorical_columns), 
    ],
    remainder = "passthrough"
)

# transforming current x data to fit transformed req
x = transformed_data.fit_transform(x)

print(x)

  (0, 14)	1.0
  (0, 31)	1.0
  (0, 32)	2.0
  (0, 33)	4.0
  (0, 34)	8.5
  (1, 25)	1.0
  (1, 31)	1.0
  (1, 32)	2.4
  (1, 33)	4.0
  (1, 34)	9.6
  (2, 22)	1.0
  (2, 31)	1.0
  (2, 32)	1.5
  (2, 33)	4.0
  (2, 34)	5.9
  (3, 15)	1.0
  (3, 31)	1.0
  (3, 32)	3.5
  (3, 33)	6.0
  (3, 34)	11.1
  (4, 15)	1.0
  (4, 31)	1.0
  (4, 32)	3.5
  (4, 33)	6.0
  (4, 34)	10.6
  :	:
  (7380, 17)	1.0
  (7380, 31)	1.0
  (7380, 32)	2.0
  (7380, 33)	4.0
  (7380, 34)	9.4
  (7381, 17)	1.0
  (7381, 31)	1.0
  (7381, 32)	2.0
  (7381, 33)	4.0
  (7381, 34)	9.9
  (7382, 17)	1.0
  (7382, 31)	1.0
  (7382, 32)	2.0
  (7382, 33)	4.0
  (7382, 34)	10.3
  (7383, 17)	1.0
  (7383, 31)	1.0
  (7383, 32)	2.0
  (7383, 33)	4.0
  (7383, 34)	9.9
  (7384, 17)	1.0
  (7384, 31)	1.0
  (7384, 32)	2.0
  (7384, 33)	4.0
  (7384, 34)	10.7


In [17]:
# splitting datasets into training and testing groups
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

#x_test.toarray() # need to say .toarray because x_test is a sparse matrix. 

In [18]:
st_x = StandardScaler(with_mean=False)

x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.