# Example

In [1]:
import pandas as pd
import numpy as np
# Load the raw fuel-consumption table and keep only the columns used in this notebook.
df = pd.read_csv("FuelConsumption.csv")
cdf = df[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY','FUELCONSUMPTION_HWY','FUELCONSUMPTION_COMB','CO2EMISSIONS']]

# Seed NumPy's global generator so the random split below -- and therefore every
# coefficient and score computed from it -- is the same on each Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Boolean mask: each row lands in the training set with probability 0.8,
# and the remaining rows form the test set.
msk = np.random.rand(len(df)) < 0.8
train = cdf[msk]
test = cdf[~msk]

## Training the Model

In [3]:
from sklearn import linear_model
import numpy as np
# Training matrices: three predictors and the CO2 target.
# The target is selected with a one-element column list, so it stays 2-D.
x = np.asanyarray(train[['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB']])
y = np.asanyarray(train[['CO2EMISSIONS']])

# fit() returns the fitted estimator, so construction and fitting can be chained.
regr = linear_model.LinearRegression().fit(x, y)

# One coefficient per predictor, in the column order used to build x.
print('Coefficients: ', regr.coef_)

Coefficients:  [[11.84949997  7.34460279  9.14662133]]


## Accuracy Estimation

In [12]:
# Rebind x and y to the TEST split before anything else in this cell uses them.
# Until this point x still holds the training features from the fitting cell, so
# the manual prediction below must come after this assignment.
x = np.asanyarray(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_COMB']])
y = np.asanyarray(test[['CO2EMISSIONS']])

# Predict from the array, matching the array type the model was fitted on.
y_hat = regr.predict(x)

# The same prediction written out by hand: intercept + coef . x^T.
# The result is a row of predictions, hence the transpose in the comparison below.
y_hat_manual = regr.intercept_ + np.dot(regr.coef_, x.T)

# np.mean of the squared residuals is the mean squared error (not a sum of squares).
print("Mean squared error (MSE): %.2f"
      % np.mean((y_hat - y) ** 2))

# LinearRegression.score returns the coefficient of determination R^2:
# 1 is perfect prediction.
print('Variance score: %.2f' % regr.score(x, y))

# The manual and library predictions should agree; compare with a floating-point
# tolerance rather than exact equality.
np.allclose(y_hat_manual.T, y_hat)

Residual sum of squares: 535.68
Variance score: 0.87


True

If $\hat{y}$ is the estimated target output, $y$ the corresponding (correct) target output, and Var is the variance (the square of the standard deviation), then the explained variance is estimated as follows:

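$\texttt{explainedVariance}(y, \hat{y}) = 1 - \frac{Var\{ y - \hat{y}\}}{Var\{y\}}$  
The best possible score is 1.0, lower values are worse. Note that `regr.score(x, y)`, used above, returns the coefficient of determination $R^2 = 1 - \frac{\sum (y - \hat{y})^2}{\sum (y - \bar{y})^2}$, which equals the explained variance only when the residuals $y - \hat{y}$ have zero mean.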

# Practice

Try to use a multiple linear regression with the same dataset but this time use __FUEL CONSUMPTION in CITY__ and 
__FUEL CONSUMPTION in HWY__ instead of FUELCONSUMPTION_COMB. Does it result in better accuracy?

In [13]:
# Preview the first five rows of the selected columns.
cdf.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,CO2EMISSIONS
0,2.0,4,9.9,6.7,8.5,196
1,2.4,4,11.2,7.7,9.6,221
2,1.5,4,6.0,5.8,5.9,136
3,3.5,6,12.7,9.1,11.1,255
4,3.5,6,12.1,8.7,10.6,244


In [14]:
# Practice model: city and highway consumption replace the combined figure,
# giving four predictors for the same CO2 target.
x = np.asanyarray(
    train[['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']]
)
y = np.asanyarray(train[['CO2EMISSIONS']])

# fit() returns the fitted estimator, so construction and fitting can be chained.
# This rebinds regr, replacing the model fitted earlier in the notebook.
regr = linear_model.LinearRegression().fit(x, y)

# One coefficient per predictor, in the column order used to build x.
print('Coefficients: ', regr.coef_)

Coefficients:  [[11.89833499  7.17904014  5.43667063  3.5735984 ]]


In [16]:
# Test-set features and target for the four-predictor model.
x = np.asanyarray(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']])
y = np.asanyarray(test[['CO2EMISSIONS']])

# Predict from the array built above: it matches the array type the model was
# fitted on and avoids selecting the same columns twice.
y_hat = regr.predict(x)

# np.mean of the squared residuals is the mean squared error -- neither a sum of
# squares nor r^2 (r^2 is what the next line reports).
print("Mean squared error (MSE): %.2f"
      % np.mean((y_hat - y) ** 2))

# LinearRegression.score returns the coefficient of determination R^2:
# 1 is perfect prediction.
print('Variance score: %.2f' % regr.score(x, y))

Residual sum of squares (r^2): 534.46
Variance score: 0.87


No, it doesn't result in meaningfully better accuracy: the error is almost unchanged and the variance score is the same. However, it ought to...

In [17]:
# Recompute the test-set predictions of the four-predictor model
# (the same call as in the evaluation cell above).
y_hat= regr.predict(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']])

In [18]:
y[:5] - y_hat[:5] # signed residuals (actual - predicted) for the first 5 test rows

array([[-2.43369137],
       [ 0.56141196],
       [-8.39078158],
       [-8.39078158],
       [21.51106127]])

In [19]:
# Side-by-side table for the first five test rows: actual value, prediction,
# and the absolute error between them (one column each).
compare = np.concatenate(
    [y[:5], y_hat[:5], np.absolute((y - y_hat)[:5])],
    axis=1,
)
compare_df = pd.DataFrame(
    data=compare,
    columns=[
        "Factual, y",
        "Estimation, y_hat",
        "Absolute Difference, |y-y_hat|",
    ],
)
compare_df

Unnamed: 0,"Factual, y","Estimation, y_hat","Absolute Difference, |y-y_hat|"
0,196.0,198.433691,2.433691
1,255.0,254.438588,0.561412
2,359.0,367.390782,8.390782
3,359.0,367.390782,8.390782
4,338.0,316.488939,21.511061
