# Example

In [22]:
import pandas as pd
df = pd.read_csv("FuelConsumption.csv")
cdf = df[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY','FUELCONSUMPTION_HWY','FUELCONSUMPTION_COMB','CO2EMISSIONS']]

msk = np.random.rand(len(df)) < 0.8
train = cdf[msk]
test = cdf[~msk]

## Training the Model

In [59]:
from sklearn import linear_model
import numpy as np
regr = linear_model.LinearRegression()
x = np.asanyarray(train[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_COMB']])
y = np.asanyarray(train[['CO2EMISSIONS']])
regr.fit (x, y)
# The coefficients
print ('Coefficients: ', regr.coef_)


Coefficients:  [[9.98556269 7.9360493  9.65979187]]


array([65.05084194])

## Accuracy Estimation

In [81]:
y_hat= regr.predict(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_COMB']])
y_hat_manual = regr.intercept_ + np.dot(regr.coef_, x.T)
x = np.asanyarray(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_COMB']])
y = np.asanyarray(test[['CO2EMISSIONS']])
print("Residual sum of squares: %.2f"
      % np.mean((y_hat - y) ** 2))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(x, y))
# np.squeeze(np.asarray(y_hat_manual)) == np.squeeze(y_hat.T) # === True
all(y_hat_manual.T == y_hat) # == True, which means two arrays are the same

Residual sum of squares: 436.97
Variance score: 0.88


True

If $\hat{y}$ is the estimated target output, y the corresponding (correct) target output, and Var is Variance, the square of the standard deviation, then the explained variance is estimated as follow:

$\texttt{explainedVariance}(y, \hat{y}) = 1 - \frac{Var\{ y - \hat{y}\}}{Var\{y\}}$  
The best possible score is 1.0, lower values are worse.

# Practice

Try to use a multiple linear regression with the same dataset but this time use __FUEL CONSUMPTION in CITY__ and 
__FUEL CONSUMPTION in HWY__ instead of FUELCONSUMPTION_COMB. Does it result in better accuracy?

In [25]:
cdf.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,CO2EMISSIONS
0,2.0,4,9.9,6.7,8.5,196
1,2.4,4,11.2,7.7,9.6,221
2,1.5,4,6.0,5.8,5.9,136
3,3.5,6,12.7,9.1,11.1,255
4,3.5,6,12.1,8.7,10.6,244


In [48]:
regr = linear_model.LinearRegression()
x = np.asanyarray(train[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']])
y = np.asanyarray(train[['CO2EMISSIONS']])
regr.fit(x, y)
# The coefficients
print('Coefficients: ', regr.coef_)

Coefficients:  [[10.02501632  7.51996346  6.50446557  2.78004577]]


In [57]:
y_hat = regr.predict(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']])
x = np.asanyarray(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']])
y = np.asanyarray(test[['CO2EMISSIONS']])
print("Residual sum of squares: %.2f"
      % np.mean((y_hat - y) ** 2))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(x, y))

Residual sum of squares: 441.10
Variance score: 0.88


No it doesn't result in better accuracy. However, it ought to...

In [28]:
y_hat= regr.predict(test[['ENGINESIZE','CYLINDERS','FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY']])

In [33]:
y[:5] - y_hat[:5] # compare the first 5

array([[  6.23342401],
       [-15.38714675],
       [-13.8487185 ],
       [ 28.70501425],
       [-18.34876793]])

In [106]:
compare = np.concatenate((y[:5], y_hat[:5], np.absolute(y[:5] - y_hat[:5])),
                         axis=1)
compare_df = pd.DataFrame(compare, columns=["Factual, y", "Estimation, y_hat", "Absolute Difference, |y-y_hat|"])
compare_df

Unnamed: 0,"Factual, y","Estimation, y_hat","Absolute Difference, |y-y_hat|"
0,221.0,213.494392,7.505608
1,230.0,244.214526,14.214526
2,232.0,245.180505,13.180505
3,354.0,324.232176,29.767824
4,260.0,277.637135,17.637135
