# Regression Metrics

Z. W. Miller - Copyright 2018

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [2]:
import numpy as np
import sklearn
import matplotlib
import pandas as pd
import sys
libraries = (('Matplotlib', matplotlib), ('Numpy', np), ('Pandas', pd))

print("Python Version:", sys.version, '\n')
for lib in libraries:
    print('{0} Version: {1}'.format(lib[0], lib[1].__version__))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3


In [3]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

boston = load_boston()
X = boston.data
y = boston.target

lr = LinearRegression()
lr.fit(X,y)
pred = lr.predict(X)

dt = DecisionTreeRegressor(max_depth=1) #purposely picking a less than optimal hyperparameter
dt.fit(X,y)
pred2 = dt.predict(X)

In [13]:
import numpy as np
import pandas as pd

def get_error(true,pred):
    """
    Returns predicted - true for each entry
    """
    true = pandas_to_numpy(true)
    pred = pandas_to_numpy(pred)
    return pred-true

def get_square_error(true,pred):
    """
    Returns the square of predicted - true for each entry
    """
    return np.square(get_error(true,pred))   

def mean_square_error(true, pred):
    """
    Returns the average predicted - true
    """
    return np.mean(get_square_error(true,pred))

def root_mean_square_error(true,pred):
    """
    Returns the sqrt of mean square error
    """
    return np.sqrt(mean_square_error(true,pred))

def mean_absolute_error(true,pred):
    """
    Returns the mean absolute value of error
    """
    return np.mean(np.abs(get_error(true,pred)))

def sum_square_error(true,pred):
    """
    Returns the sum of squared errors
    """
    true = pandas_to_numpy(true)
    pred = pandas_to_numpy(pred)
    return np.sum(get_square_error(true,pred))

def r2_score(true,pred):
    """
    Returns R2 which is computed by
    SSE = sum of squared errors from the model
    SST = sume of squared errors to the mean of the data (y)
    R2 = 1 - SSE/SST
    """
    true = pandas_to_numpy(true)
    pred = pandas_to_numpy(pred)
    SSE = np.sum(get_square_error(true,pred))
    shpe = len(np.array(true))
    SST = np.sum(get_square_error(true,np.mean(true)*shpe))
    return 1.-(SSE/SST)

def adj_r2(true, pred, X):
    """
    Returns a version of R2 that penalizes for having many
    features. Fights against false correlations in data
    and is generally better than R2.
    """
    X = pandas_to_numpy(X)
    rsquare = r2_score(true,pred)
    num_data = X.shape[0]
    num_features = X.shape[1]
    temp = (1-rsquare)*(num_data-1)
    temp = temp/(num_data-num_features-1)
    temp = 1 - temp
    return temp

def assess_model(true, pred):
    """
    Computes a suite of metrics all at once
    """
    true = pandas_to_numpy(true)
    pred = pandas_to_numpy(pred)
    return sum_square_error(true,pred), mean_square_error(true,pred), root_mean_square_error(true,pred)

def test_regression_results(X, true, pred):
    """
    A print out of many of the metrics that show model performance
    """
    true = pandas_to_numpy(true)
    pred = pandas_to_numpy(pred)
    print("Mean Square Error: ", mean_square_error(true,pred))
    print("Root Mean Square Error: ", np.sqrt(mean_square_error(true,pred)))
    print("Mean Absolute Error: ",mean_absolute_error(true,pred))
    r2 = r2_score(true,pred)
    print("R2: ", r2)
    print("Adj R2: ", adj_r2(true,pred,X))

def pandas_to_numpy(x):
    """
    Checks if the input is a Dataframe or series, converts to numpy matrix for
    calculation purposes.
    ---
    Input: X (array, dataframe, or series)    
    Output: X (array)
    """
    if type(x) == type(pd.DataFrame()) or type(x) == type(pd.Series()):
        return x.as_matrix()
    if type(x) == type(np.array([1,2])):
        return x
    return np.array(x)

In [14]:
test_regression_results(X, y, pred)

Mean Square Error:  21.8977792177
Root Mean Square Error:  4.67950630064
Mean Absolute Error:  3.272944638
R2:  0.999999830883
Adj R2:  0.999999826415


In [15]:
test_regression_results(X, y, pred2)

Mean Square Error:  46.1990916771
Root Mean Square Error:  6.7969913695
Mean Absolute Error:  5.03422600761
R2:  0.999999643204
Adj R2:  0.999999633777


In [16]:
test_regression_results(X.tolist(), y.tolist(), pred2.tolist())

Mean Square Error:  46.1990916771
Root Mean Square Error:  6.7969913695
Mean Absolute Error:  5.03422600761
R2:  0.999999643204
Adj R2:  0.999999633777
