# Model Evaluation for Regression problems

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## I. Loading the dataset

In [31]:
# data source: https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/FuelConsumptionCo2.csv
# Read the local copy of the CSV and keep the untouched frame under its own
# name, so re-running this cell always starts from the same input.
df_raw = pd.read_csv('data/FuelConsumptionCo2.csv')

# Restrict to numeric columns (LinearRegression cannot consume text columns).
# `select_dtypes` is the public pandas API for this; the private
# `_get_numeric_data()` helper is not part of the supported interface.
df = df_raw.select_dtypes(include='number')

# Name of the column the model is trained to predict.
target = 'CO2EMISSIONS'
# Fail early with a readable message if the target is absent after filtering.
assert target in df.columns, f"target column {target!r} not found among numeric columns"

# Feature matrix: every numeric column except the target.
X = df.drop(columns=[target])
# Label vector: the target column on its own.
y = df[target]


In [32]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
466,2014,2.4,4,11.5,8.2,10.0,28,230
524,2014,1.6,4,9.9,7.5,8.8,32,202
995,2014,3.5,6,14.2,10.3,12.4,23,285
945,2014,2.5,4,9.6,7.4,8.6,33,198
590,2014,2.0,4,10.2,7.8,9.1,31,209


In [39]:
# Print the frame's index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   MODELYEAR                 1067 non-null   int64  
 1   ENGINESIZE                1067 non-null   float64
 2   CYLINDERS                 1067 non-null   int64  
 3   FUELCONSUMPTION_CITY      1067 non-null   float64
 4   FUELCONSUMPTION_HWY       1067 non-null   float64
 5   FUELCONSUMPTION_COMB      1067 non-null   float64
 6   FUELCONSUMPTION_COMB_MPG  1067 non-null   int64  
 7   CO2EMISSIONS              1067 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 66.8 KB


## II. Modeling
### II.1. Train-test split

In [33]:
# Split features (X) and labels (y) with the same row assignment:
# 33% of the rows are held out for testing, and the fixed random_state
# makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Report (rows, columns) of each feature partition as a sanity check.
print('Shape of TRAIN set: ', X_train.shape)
print('Shape of TEST set: ', X_test.shape)

Shape of TRAIN set:  (714, 7)
Shape of TEST set:  (353, 7)


### II.2. Train the model


In [34]:
# Linear regression estimator created with scikit-learn's default settings.
regr = LinearRegression()
# Fit on the training partition only; as the cell's last expression,
# the fitted estimator is echoed as the cell output.
regr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### II.3. Testing the model's performance

In [35]:
# Predict the target for the held-out test features; the metric cells below
# compare these predictions against y_test.
prediction = regr.predict(X=X_test)


### Mean Absolute Error (MAE)

In [36]:
# Average absolute difference between true and predicted values on the test set
# (expressed in the same units as the target).
MAE = mean_absolute_error(y_true=y_test, y_pred=prediction)
print('Mean Absolute Error: ', MAE)

Mean Absolute Error:  13.435273769170282


### Mean Squared Error (MSE)

In [37]:
# Average squared difference between true and predicted values on the test set;
# squaring penalises large errors more heavily than MAE does.
MSE = mean_squared_error(y_true=y_test, y_pred=prediction)
print('MSE: ',MSE)

MSE:  429.54198122304246


### R2 Score

In [38]:
# Coefficient of determination on the test set: 1.0 is a perfect fit,
# 0.0 matches always predicting the mean of y_test.
R2 = r2_score(y_true=y_test, y_pred=prediction)
print('R2: ', R2)

R2:  0.8945203026294815
