#  <span style="color:blue">Evaluating Regression Models</span>

First things first: set up the imports and environment.

In [None]:
import pandas as pd
import numpy as np
import os
from env import hostname, user, password


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

import wrangle 
import warnings 
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

## <span style="color:blue">1. Load your zillow dataset.</span>

### ACQUIRE

In [None]:
# Acquire the raw Zillow data through the project's wrangle module and preview it.
# NOTE(review): presumably returns a pandas DataFrame (it is used as one below) —
# the data source and query live in wrangle.py, which is not visible here.
df = wrangle.get_zillow_data()
df.head()

In [None]:
df.shape

In [None]:
# Run the wrangle module's prep step on the frame and preview the result.
# NOTE(review): the exact cleaning performed is defined in wrangle.py — confirm there.
df = wrangle.prep_zillow(df)
df.head()

In [None]:
# Apply the wrangle module's second transformation step and preview the result.
# NOTE(review): presumably this is where columns such as sqft / home_value get
# their final names (they are used below) — verify against wrangle.py.
df = wrangle.wrangled_zillow(df)
df.head()

In [None]:
# Apply the wrangle module's dtype step, then print the column dtypes and
# non-null counts to check the result.
df = wrangle.dtype_zillow(df)
df.info()

In [None]:
df.shape

In [None]:
# Split the prepared frame into train / validate / test partitions.
# NOTE(review): split proportions and random seed are set inside
# wrangle.split_zillow — not visible here.
train, validate, test = wrangle.split_zillow(df)

In [None]:
# Report the (rows, columns) shape of each partition, one per line.
for split_name, split_frame in (("train", train), ("validate", validate), ("test", test)):
    print(f"{split_name}: {split_frame.shape}")

## <span style="color:blue">2. Fit a linear regression model (ordinary least squares) and compute yhat, predictions of taxvaluedollarcnt (home_value) using only calculatedfinishedsqft (sqft).</span>

Fit an OLS regression model using the sqft feature to predict home_value. Compute the predicted values (yhat) for the home_value using the trained model.

In [None]:
# Target and single feature, both taken from the training split only.
y=train["home_value"] # target: the value the model predicts
x=train["sqft"] # feature: selected with one column label, so this is a 1-D Series


In [None]:
# Feature matrix / target vector for each partition.
# scikit-learn estimators require a 2-D feature matrix, so the single feature
# is selected with a list of columns (-> one-column DataFrame) instead of a
# bare column label (-> 1-D Series, which LinearRegression.fit rejects).
X_train = train[['sqft']]
y_train = train['home_value']

X_validate = validate[['sqft']]
y_validate = validate['home_value']

X_test = test[['sqft']]
y_test = test['home_value']

In [None]:
X_train.head()

In [None]:
y_train

##### <span style = "color:blue"> Turn into one-dimensional arrays</span>

In [None]:
# Assumes x (feature Series) and y (target Series) are already defined.
# x is 1-D, but scikit-learn requires a 2-D feature matrix, so it is wrapped
# in a single-column DataFrame for both fitting and predicting.
model = LinearRegression().fit(x.to_frame(), y)
predictions = model.predict(x.to_frame())

## Plot linear regression models

In [None]:
# Scatter of the feature (x = sqft) against the target (y = home_value).
# NOTE(review): `plt` is not imported in any cell visible here — this cell needs
# `import matplotlib.pyplot as plt` to have run first; confirm and add it.
plt.scatter(x, y)
plt.xlabel('x = sqft')
plt.ylabel('y = home_value');

The scatterplot shows outlier houses with roughly 20k calculated finished square feet.

In [None]:
# Baseline prediction: the mean of the training target (y is train["home_value"]).
baseline = y.mean()
baseline

In [None]:
# Store the baseline prediction as a constant column so residuals against it
# can be computed row by row later.
train['baseline'] = y.mean()
train.head()

In [None]:
# Same baseline as computed from y above, read directly off the train frame
# (y is train["home_value"], so the value is unchanged).
baseline = train.home_value.mean()
baseline

In [None]:
# Re-assign the constant baseline column from the train frame's own mean;
# this overwrites the column created from y.mean() with the same value.
train['baseline'] = train.home_value.mean()
train.head()

In [None]:
# Visualise the baseline model: every prediction is the mean home value,
# drawn as a dotted horizontal line across the sqft / home_value scatter.
plt.scatter(train.sqft, train.home_value)
plt.axhline(train.home_value.mean(), ls = ':')
plt.xlabel('x = Square Feet')
plt.ylabel('y = Tax Assessed Value')
plt.title('Baseline model');

##### <span style = "color:blue">The scatterplot shows outlier houses of roughly 20k square feet</span>

In [None]:
# Fit OLS on the training partition and predict on the same rows.
# NOTE(review): LinearRegression.fit requires X_train to be 2-D
# (n_samples, n_features) — confirm X_train is a DataFrame, not a Series.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_train)

In [None]:
# Keep the model's in-sample predictions next to the actual values on train.
train['yhat'] = predictions

In [None]:
train.head()

In [None]:
# Visualize the line of best fit from the OLS linear model: actual values as
# a scatter, predictions (train.yhat) drawn as a line over the same x values.
plt.scatter(train.sqft, train.home_value)
plt.plot(train.sqft, train.yhat)
plt.xlabel('x = Square Feet')
plt.ylabel('y = Tax Assessed Value')
plt.title('OLS linear model');

## <span style="color:blue">3. Plot the residuals for the linear regression model that you made.</span>

In [None]:
train.head()

In [None]:
# residual = actual - predicted
# 'residual' is measured against the OLS predictions (yhat);
# 'baseline_residual' is measured against the constant mean prediction.
train['residual'] = train.home_value - train.yhat
train['baseline_residual'] = train.home_value - train.baseline

In [None]:
train.head()

Negative Residual = Prediction too high: the actual observed value is lower than the predicted value. In other words, the prediction is higher than the actual value. This indicates that the model has overestimated the target variable.
<br>
Positive Residual = Prediction too low: the actual observed value is higher than the predicted value, so the model has underestimated the target variable.

In [None]:
# Side-by-side residual plots (sqft vs residual): baseline on the left,
# OLS model on the right, each with a dotted red zero-error reference line.
plt.figure(figsize=(11, 5))

for position, errors, heading in (
    (121, train.baseline_residual, 'Baseline Residuals'),
    (122, train.residual, 'OLS model residuals'),
):
    plt.subplot(position)
    plt.scatter(train.sqft, errors)
    plt.axhline(y=0, ls=':', color='red')
    plt.xlabel('Square Feet')
    plt.ylabel('Residual')
    plt.title(heading)

In [None]:
# Smallest and largest residual for the baseline and for the OLS model;
# used to set axis limits when zooming the residual plots.
baseline_residual_min, baseline_residual_max = (
    train.baseline_residual.min(),
    train.baseline_residual.max(),
)
residual_min, residual_max = train.residual.min(), train.residual.max()

In [None]:
residual_min, residual_max

In [None]:
# Residual plots (sqft vs residual) with low alpha so dense regions stand out.
# The baseline panel is zoomed to 0-5000 sqft on x and to the full range of
# baseline residuals on y.

plt.figure(figsize = (11,5))

plt.subplot(121)
plt.scatter(train.sqft, train.baseline_residual, alpha=.05)
plt.axhline(y = 0, ls = ':', color = 'red')
plt.xlim(0,5000)
# baseline_residual_min / baseline_residual_max are plain numbers computed in
# an earlier cell (not callables, and not methods of a `baseline_residual`
# object), so they are used directly and truncated to whole dollars.
plt.ylim(int(baseline_residual_min), int(baseline_residual_max))
plt.xlabel('Square Feet')
plt.ylabel('Residual')
plt.title('Baseline Residuals')

plt.subplot(122)
plt.scatter(train.sqft, train.residual, alpha=0.05)
plt.axhline(y = 0, ls = ':', color ='red')

plt.xlabel('Square Feet')
plt.ylabel('Residual')
plt.title('OLS model residuals')
print("Negative Residual = Prediction too high")
print("Positive Residual = Prediction too low")

## FINDINGS
- The visible slope in the residual plot implies the model needs more predictors.

In [None]:
# Residual plot (y vs residual): actual home value on x, model error on y,
# with a dotted zero-error reference line spanning the range of actual values.
actual = train.home_value
predicted = train.yhat
residuals = actual - predicted
plt.hlines(0, actual.min(), actual.max(), ls=':')
plt.scatter(actual, residuals)
# Raw string: '\h' is not a valid Python escape sequence, so a plain literal
# triggers an invalid-escape warning; the label text itself is unchanged.
plt.ylabel(r'residual ($y - \hat{y}$)')
plt.xlabel('actual value ($y$)')
plt.title('Actual vs Residual')
plt.show()

## <span style="color:blue">4. Calculate the sum of squared errors, explained sum of squares, total sum of squares, mean squared error, and root mean squared error for your model.</span>

In [None]:
# Square each residual so positive and negative errors cannot cancel out
# when they are summed into SSE below.
train['residual^2'] = train.residual**2
train['baseline_residual^2'] = train.baseline_residual**2

train.head()

<b>
    Calculating SSE
</b>

In [None]:
# Sum of squared errors for the OLS model and for the mean baseline,
# printed to one decimal place.
SSE, SSE_baseline = (
    train['residual^2'].sum(),
    train['baseline_residual^2'].sum(),
)

print('SSE =', format(SSE, ".1f"))
print("SSE Baseline =", format(SSE_baseline, ".1f"))

In [None]:
SSE < SSE_baseline

## <span style="color:blue">5. Calculate the sum of squared errors, mean squared error, and root mean squared error for the baseline model (i.e. a model that always predicts the average home value amount).</span>

<b>
    
Calculating MSE

</b>

In [None]:
len(train)
train.shape[0]

In [None]:
# Mean squared error = SSE divided by the number of training observations,
# for both the OLS model and the baseline.
MSE, MSE_baseline = SSE / len(train), SSE_baseline / len(train)

print("MSE = ", format(MSE, ".1f"))
print("MSE baseline = ", format(MSE_baseline, ".1f"))

<b>
    
Calculating RMSE

</b>

In [None]:
from math import sqrt

# Root mean squared error: the square root of MSE, which puts the error back
# into the same units as the target.
RMSE, RMSE_baseline = sqrt(MSE), sqrt(MSE_baseline)

print("RMSE = ", format(RMSE, ".1f"))
print("RMSE baseline = ", format(RMSE_baseline, ".1f"))

##  <span style="color:blue">6. Write Python code that compares the sum of squared errors for your model against the sum of squared errors for the baseline model and outputs whether or not your model performs better than the baseline model.</span>

In [None]:
# A strictly lower SSE than the baseline counts as "better"; a tie or a
# higher SSE is reported as "worse".
print(
    'OLS Regression Model Performs Better than Baseline'
    if SSE < SSE_baseline
    else 'OLS Regression Model Performs Worse than Baseline'
)

##  <span style="color:blue">7. What is the amount of variance explained in your model?</span>

In [None]:
# R2 computed by hand from the sums of squares.

# The baseline's SSE doubles as the total sum of squares (TSS).
SSE_baseline = train['baseline_residual^2'].sum()
TSS = SSE_baseline

# Unexplained error: squared error left over around the regression line.
SSE = train['residual^2'].sum()

# Explained error: the part of the total variation the model accounts for.
ESS = TSS - SSE

# R2 is the explained share of the total variation.
R2 = ESS / TSS
R2

In [None]:
# Calculate R2 the easy way: let scikit-learn compute it from the actual and
# predicted columns; the value can be compared with the manual R2 above.

from sklearn.metrics import r2_score
r2_score(train.home_value, train.yhat)

##  <span style="color:blue">8. Is your model better than the baseline model?</span>

In [None]:
# Refit the same single-feature model with statsmodels to get its summary
# table (coefficients, R-squared, F-statistic, p-values).
# add_constant adds a column of ones so the model includes an intercept term.
X2 = sm.add_constant(train.sqft)
est = sm.OLS(train.home_value, X2)
est2 = est.fit()
print(est2.summary())

The OLS Regression Model outperformed the baseline model on the training set (every metric above was computed on `train`), using RMSE as the evaluation metric. The RMSE for the OLS Regression Model was 272,306.6, while the baseline model had an RMSE of 313,748.0.

Despite the improvement over the baseline, the OLS Regression Model still does a poor job of accurately predicting home prices. The level of error, with a difference of hundreds of thousands of dollars, would be considered unacceptable for any buyer or seller relying on accurate price estimates.




##  <span style="color:blue">9. Create a file named evaluate.py that contains the following functions.</span>

In [None]:
def plot_residuals(actual, predicted):
    """Scatter the residuals (actual - predicted) against the actual values.

    A dotted horizontal line at zero, spanning the range of ``actual``,
    marks where a perfect prediction would fall.

    Parameters
    ----------
    actual : array-like supporting ``-``, ``.min()`` and ``.max()``
        Observed target values.
    predicted : array-like or scalar
        Model predictions aligned with ``actual``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    residuals = actual - predicted
    plt.hlines(0, actual.min(), actual.max(), ls=':')
    plt.scatter(actual, residuals)
    # Raw string: '\h' is not a valid Python escape sequence, so a plain
    # literal triggers an invalid-escape warning; the text is unchanged.
    plt.ylabel(r'residual ($y - \hat{y}$)')
    plt.xlabel('actual value ($y$)')
    plt.title('Actual vs Residual')
    plt.show()

In [None]:

import math


def residuals(actual, predicted):
    """Return the element-wise errors ``actual - predicted``."""
    return actual - predicted


def SSE(actual, predicted):
    """Return the sum of squared errors between ``actual`` and ``predicted``."""
    return (residuals(actual, predicted) ** 2).sum()


def MSE(actual, predicted):
    """Return the mean squared error: SSE divided by the number of observations.

    ``actual`` must expose ``.shape`` (e.g. a pandas Series or numpy array).
    """
    n = actual.shape[0]
    return SSE(actual, predicted) / n


def RMSE(actual, predicted):
    """Return the root mean squared error (square root of MSE)."""
    # `math` is imported at the top of this block; the notebook otherwise only
    # has `from math import sqrt`, which left `math.sqrt` undefined.
    return math.sqrt(MSE(actual, predicted))


def ESS(actual, predicted):
    """Return the explained sum of squares.

    This is the squared distance of each prediction from the mean of
    ``actual``, summed over all observations.
    """
    return ((predicted - actual.mean()) ** 2).sum()


def TSS(actual):
    """Return the total sum of squares of ``actual`` around its own mean."""
    return ((actual - actual.mean()) ** 2).sum()


def R2_score(actual, predicted):
    """Return the share of total variation explained, computed as ESS / TSS."""
    # The helper is named ESS; the lower-case `ess` was never defined and
    # raised NameError on every call.
    return ESS(actual, predicted) / TSS(actual)


In [None]:
def regression_errors(actual, predicted):
    """Return the standard regression error metrics as a pandas Series.

    Parameters
    ----------
    actual : pandas Series or numpy array
        Observed target values.
    predicted : array-like or scalar
        Model predictions aligned with ``actual``.

    Returns
    -------
    pandas.Series
        Indexed by 'SSE', 'ess', 'TSS', 'MSE' and 'RMSE'.
    """
    return pd.Series({
        'SSE': SSE(actual, predicted),
        # The helper is named ESS (the lower-case `ess` is undefined); the
        # index label keeps its original spelling so the output is unchanged.
        'ess': ESS(actual, predicted),
        'TSS': TSS(actual),
        'MSE': MSE(actual, predicted),
        'RMSE': RMSE(actual, predicted),
    })

def baseline_mean_errors(actual):
    """Return SSE, MSE and RMSE for a model that always predicts the mean.

    The constant prediction is ``actual.mean()``; the result is a plain dict
    keyed by 'SSE', 'MSE' and 'RMSE', in that order.
    """
    mean_prediction = actual.mean()
    metrics = (('SSE', SSE), ('MSE', MSE), ('RMSE', RMSE))
    return {label: metric(actual, mean_prediction) for label, metric in metrics}

def better_than_baseline(actual, predicted):
    """Return True when the model's RMSE is strictly below the mean baseline's.

    The baseline predicts ``actual.mean()`` for every observation.
    """
    return RMSE(actual, predicted) < RMSE(actual, actual.mean())