# Regression Model Evaluation metrics
The different metrics are:
1. R squared (coefficient of determination)
2. Mean Absolute Error (MAE)
3. Mean Squared Error (MSE)

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
from sklearn.datasets import fetch_california_housing
# Fetch the California housing dataset. The displayed result is a dict-like bundle with
# 'data', 'target', 'frame', 'target_names', 'feature_names' and 'DESCR' entries.
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [6]:
# Wrap the feature matrix in a DataFrame whose columns carry the dataset's feature names.
housing_df = pd.DataFrame(
    data=housing['data'],
    columns=housing['feature_names'],
)
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [7]:
# Attach the target values as a column named after the dataset's own target name ('MedHouseVal').
housing_df['MedHouseVal'] = housing['target']
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [8]:
# Store the same target values again under the generic name 'Target'. This duplicates the
# 'MedHouseVal' column, which a later cell drops.
housing_df['Target'] = housing['target']
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,3.422
...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0.847


In [9]:
# 'MedHouseVal' holds the same values as 'Target', so remove it and keep a single target column.
# errors='ignore' keeps this cell idempotent: re-running it after the column is already gone
# returns the frame unchanged instead of raising KeyError.
housing_df = housing_df.drop(columns='MedHouseVal', errors='ignore')
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


#### R squared (R²) score
R² (the coefficient of determination) compares your model's predictions to a baseline that always predicts the mean of the targets: $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$. Values can range from negative infinity (a very poor model) to 1 (a perfect model).
For example, if all your model does is predict the mean of the targets, its $R^2$ value will be 0, and if your model predicts every target exactly, its $R^2$ value will be 1.

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator up front; neither the split nor the model below
# is given an explicit random_state.
np.random.seed(0)

# Features are every column except 'Target'; 'Target' is the value to predict.
X = housing_df.drop(columns='Target')
y = housing_df['Target']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Fit a random forest with default hyperparameters on the training split.
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor()

In [11]:
# Score the fitted model on the held-out test split. For scikit-learn regressors, `score`
# returns the coefficient of determination (R^2).
model.score(X_test, y_test)

0.7977645205563335

In [14]:
# Mean of the test targets; the baseline array `y_test_mean` is filled with this value.
y_test.mean()

2.052795009689917

In [12]:
from sklearn.metrics import r2_score

# Baseline "predictions": one entry per test sample, each equal to the mean test target.
y_test_mean = np.full(shape=len(y_test), fill_value=y_test.mean())

In [13]:
# Peek at the first ten baseline values; every entry is the same constant.
y_test_mean[:10]

array([2.05279501, 2.05279501, 2.05279501, 2.05279501, 2.05279501,
       2.05279501, 2.05279501, 2.05279501, 2.05279501, 2.05279501])

In [15]:
# Predicting the mean of the targets for every sample yields an R^2 of 0.
r2_score(y_true=y_test, y_pred=y_test_mean)

0.0

In [16]:
# Scoring the true values against themselves (a perfect prediction) yields an R^2 of 1.
r2_score(y_true=y_test, y_pred=y_test)

1.0

#### Mean Absolute Error (MAE)
MAE is the average of absolute differences between predictions and actual values. 
It gives you an idea of how wrong your model's predictions are.

In [17]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out features, then average the absolute prediction errors.
y_preds = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_preds)
mae

0.3353786242248063

In [18]:
# Actual target values of the test split, indexed by their original row labels.
y_test

14740    1.369
10101    2.413
20566    2.007
2670     0.725
15709    4.600
         ...  
6655     1.695
3505     2.046
1919     1.286
1450     2.595
4148     1.676
Name: Target, Length: 4128, dtype: float64

In [19]:
# Model predictions for the test split, as a plain NumPy array (no index).
y_preds

array([1.41548  , 2.50677  , 1.46188  , ..., 1.51764  , 2.8108901,
       1.81979  ])

In [20]:
# Side-by-side comparison table: true targets (keeping their row labels) next to the
# model's predictions, which are matched to the rows by position.
df = y_test.to_frame(name='actual').assign(predicted=y_preds)
df

Unnamed: 0,actual,predicted
14740,1.369,1.415480
10101,2.413,2.506770
20566,2.007,1.461880
2670,0.725,0.854710
15709,4.600,4.288763
...,...,...
6655,1.695,1.996390
3505,2.046,2.024840
1919,1.286,1.517640
1450,2.595,2.810890


In [21]:
# Signed prediction error per row: predicted minus actual (negative means under-prediction).
df['difference'] = df['predicted'].sub(df['actual'])
df

Unnamed: 0,actual,predicted,difference
14740,1.369,1.415480,0.046480
10101,2.413,2.506770,0.093770
20566,2.007,1.461880,-0.545120
2670,0.725,0.854710,0.129710
15709,4.600,4.288763,-0.311237
...,...,...,...
6655,1.695,1.996390,0.301390
3505,2.046,2.024840,-0.021160
1919,1.286,1.517640,0.231640
1450,2.595,2.810890,0.215890


In [23]:
# Plain mean of the signed differences: positive and negative errors cancel each other out,
# so this is NOT the MAE (MAE averages the absolute values of the errors).
df['difference'].mean()

0.025396378875968545

In [26]:
# MAE computed by hand: take the absolute value of every error so that negative errors
# count as positive, then average them.
df['difference'].abs().mean()

0.33537862422480685

#### Mean Squared Error (MSE)
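MSE is the mean of the squared errors, where each error is the difference between a predicted value and the corresponding actual value. Because the errors are squared, a few large errors (outliers) raise MSE much more than they raise MAE.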

In [28]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out features, then average the squared prediction errors.
y_preds = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_preds)
mse

0.2637059448425481

In [30]:
# Square every signed error; squaring removes the sign and gives large errors more weight.
df['squared differences'] = np.square(df['difference'])
df

Unnamed: 0,actual,predicted,difference,squared differences
14740,1.369,1.415480,0.046480,0.002160
10101,2.413,2.506770,0.093770,0.008793
20566,2.007,1.461880,-0.545120,0.297156
2670,0.725,0.854710,0.129710,0.016825
15709,4.600,4.288763,-0.311237,0.096868
...,...,...,...,...
6655,1.695,1.996390,0.301390,0.090836
3505,2.046,2.024840,-0.021160,0.000448
1919,1.286,1.517640,0.231640,0.053657
1450,2.595,2.810890,0.215890,0.046609


In [31]:
# MSE computed by hand: multiply each error by itself, then average the results.
squared = df['difference'].mul(df['difference'])
squared.mean()

0.26370594484254795

In [32]:
# Work on an independent (deep) copy so the edits made to df_new below leave df untouched.
df_new = df.copy(deep=True)
df_new

Unnamed: 0,actual,predicted,difference,squared differences
14740,1.369,1.415480,0.046480,0.002160
10101,2.413,2.506770,0.093770,0.008793
20566,2.007,1.461880,-0.545120,0.297156
2670,0.725,0.854710,0.129710,0.016825
15709,4.600,4.288763,-0.311237,0.096868
...,...,...,...,...
6655,1.695,1.996390,0.301390,0.090836
3505,2.046,2.024840,-0.021160,0.000448
1919,1.286,1.517640,0.231640,0.053657
1450,2.595,2.810890,0.215890,0.046609


In [33]:
# Overwrite the first row's squared difference with a large value to imitate an outlier.
# A single positional .iloc[row, col] assignment writes into df_new itself; the chained
# form df_new.iloc[0]['squared differences'] = 16 assigns to a temporary row object and
# is not guaranteed to update df_new (it does nothing under pandas Copy-on-Write).
df_new.iloc[0, df_new.columns.get_loc('squared differences')] = 16

In [34]:
# Show the frame again to confirm the first row's 'squared differences' value changed.
df_new

Unnamed: 0,actual,predicted,difference,squared differences
14740,1.369,1.415480,0.046480,16.000000
10101,2.413,2.506770,0.093770,0.008793
20566,2.007,1.461880,-0.545120,0.297156
2670,0.725,0.854710,0.129710,0.016825
15709,4.600,4.288763,-0.311237,0.096868
...,...,...,...,...
6655,1.695,1.996390,0.301390,0.090836
3505,2.046,2.024840,-0.021160,0.000448
1919,1.286,1.517640,0.231640,0.053657
1450,2.595,2.810890,0.215890,0.046609


In [35]:
# Mean of the squared differences after the first row was overwritten.
df_new['squared differences'].mean()

0.2675813904844083

In [37]:
# Overwrite the squared differences of rows 1, 2 and 3 with large values (30, 15, 66) to
# imitate more outliers. One positional .iloc[rows, col] assignment writes into df_new
# itself; chained df_new.iloc[i]['squared differences'] = ... assigns to a temporary row
# object and is not guaranteed to update df_new (it does nothing under pandas Copy-on-Write).
df_new.iloc[[1, 2, 3], df_new.columns.get_loc('squared differences')] = [30, 15, 66]

In [38]:
# Mean of the squared differences after the first four rows were overwritten with large values.
df_new['squared differences'].mean()

0.2943927341589728