# Working notebook 3

# **Goals:**

* Discover key attributes that drive and have a high correlation with home value.

* Use those attributes to develop a machine learning model to predict home value.

    * Carefully select features that will prevent data leakage. 


## Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


import wrangle as w
import model as m

# Acquire:

In [None]:
# acquire zillow data
df = w.get_zillow_data()

* Data acquired from the Codeup database on 11/17/22

* It contained  52441 rows and 10 columns before cleaning

* Each row represents a single family household:
    * properties from 2017 with current transactions
    * located in the California counties of 'Los Angeles', 'Orange', or 'Ventura'

* Each column represents a feature of the single-family residence.

# Data Wrangle

In [None]:
df.isnull().sum()

In [None]:
52441 - 50446 

In [None]:
(1995/52441) *100

In [None]:
100 -((1995/52441) *100)

In [None]:
# a total of 1995 rows were removed as outliers; 96.2% of the original data is still retained
df = w.handle_outliers(df)

In [None]:
df.isnull().sum()

In [None]:
50446 

In [None]:
# dropped properties with no bathrooms and no bedrooms 75 rows at still retained 96% of original data
df[(df.bathrooms==0) & (df.bedrooms ==0)]

In [None]:
def no_beds_and_baths(df):
    """Drop rows that report zero bathrooms AND zero bedrooms.

    Only rows where both counts are 0 are removed; a row with a single
    zero count is kept.

    Parameters
    ----------
    df : DataFrame with ``bathrooms`` and ``bedrooms`` columns.

    Returns
    -------
    A filtered DataFrame; the input frame is not modified.
    """
    # Negate the combined mask. `~(baths == 0) & ~(beds == 0)` would mean
    # "neither is zero" and also drop rows where only one count is zero.
    both_zero = (df.bathrooms == 0) & (df.bedrooms == 0)
    return df[~both_zero]

In [None]:
# drop only the rows where BOTH beds and baths are 0
# (`~a & ~b` would also drop rows where just one of the counts is 0)
df = df[~((df.bathrooms == 0) & (df.bedrooms == 0))]

In [None]:
df.shape

In [None]:
50326/52441

In [None]:
df.isnull().sum()

In [None]:
w.process_optional_features(df)

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
# drop nulls: a total of 40 rows; at this point we have retained 95.9% of the original data
df = df.dropna()

In [None]:
df.shape

In [None]:
50326-50286

In [None]:
50286/52441

In [None]:
def process_fancy_features(df):
    """Zero-fill the optional ("fancy") feature columns of ``df`` in place.

    For each of fireplace, deck, pool and garage, empty or whitespace-only
    strings are turned into NaN, and every NaN is then replaced with 0.
    The same DataFrame object that was passed in is returned.
    """
    blank_pattern = r"^\s*$"
    for name in ('fireplace', 'deck', 'pool', 'garage'):
        cleaned = df[name].replace(blank_pattern, np.nan, regex=True)
        # assumption: if the feature was not marked, it does not exist
        df[name] = cleaned.fillna(0)
    return df

In [None]:
def handle_outliers(df, max_bathrooms=6, max_bedrooms=6, max_home_value=1_750_000):
    """Manually handle outliers by capping three columns.

    Keeps only the rows whose ``bathrooms``, ``bedrooms`` and ``home_value``
    are all less than or equal to their cap.

    Parameters
    ----------
    df : DataFrame with ``bathrooms``, ``bedrooms`` and ``home_value`` columns.
    max_bathrooms : largest bathroom count to keep (default 6).
    max_bedrooms : largest bedroom count to keep (default 6).
    max_home_value : largest home value to keep (default 1_750_000).

    Returns
    -------
    A filtered DataFrame; the input frame is not modified.
    """
    # NaN compares as False, so rows missing any of the three values are
    # dropped as well (same as filtering one column after another).
    keep = (
        (df.bathrooms <= max_bathrooms)
        & (df.bedrooms <= max_bedrooms)
        & (df.home_value <= max_home_value)
    )
    return df[keep]

In [None]:
def zillow_prep(df):
    """Run the full cleaning pipeline on the raw zillow frame.

    Steps, in order: remove outliers, drop rows with zero beds and zero
    baths, zero-fill the optional-feature columns, drop remaining nulls.

    Returns the cleaned DataFrame.
    """
    # remove outliers
    df = handle_outliers(df)

    # remove rows where BOTH beds and baths are 0 (`~a & ~b` would also drop
    # rows with a single zero); copy so the column assignments made by
    # process_fancy_features do not write into a filtered slice
    df = df[~((df.bathrooms == 0) & (df.bedrooms == 0))].copy()

    # process nulls in luxury features
    df = process_fancy_features(df)

    # drop nulls
    df = df.dropna()

    return df

In [None]:
# FIPS code 6111 Ventura County, 6059  Orange County, 6037 Los Angeles County
df.county.value_counts()

In [None]:
df.isnull().sum()

In [None]:
def new_features(df):
    """Add ``home_age`` and ``optional_features`` columns to ``df`` in place.

    home_age: 2017 minus ``yearbuilt``, stored as float.
    optional_features: True when any of garage, deck, pool or fireplace
    holds a positive value.

    Returns the same DataFrame object that was passed in.
    """
    # home age in 2017, cast to float
    df['home_age'] = (2017 - df['yearbuilt']).astype('float')

    # Test for "> 0" rather than "== 1": before encoding, these columns hold
    # counts or type ids (e.g. garage 2, deck 66), which "== 1" would miss.
    # Already-encoded 0/1 columns give the same answer either way.
    extras = df[['garage', 'deck', 'pool', 'fireplace']].apply(
        pd.to_numeric, errors='coerce'
    )
    df['optional_features'] = extras.gt(0).any(axis=1)

    return df
    
    

In [None]:
def encode_features(df):
    """Return a copy of ``df`` with presence flags and county dummy columns.

    - fireplace, deck, garage: every positive value becomes 1; zeros and
      missing values are left as they are.
    - optional_features: cast to integer (False -> 0, True -> 1).
    - county: one dummy column per distinct value is appended; the original
      ``county`` column is kept.

    The input frame is not modified.
    """
    # work on a copy so the caller's frame is not half-updated
    df = df.copy()

    # Threshold instead of an explicit value map, so a count that is not in
    # a hand-written list (e.g. 11 garages) is still encoded as 1.
    for name in ('fireplace', 'deck', 'garage'):
        df[name] = df[name].mask(df[name] > 0, 1)

    df['optional_features'] = df['optional_features'].astype(int)

    county_dummies = pd.get_dummies(df['county'], drop_first=False)
    return pd.concat([df, county_dummies], axis=1)

In [None]:
df.head()

In [None]:
df =new_features(df)

In [None]:
df.head()

In [None]:
df=encode_features(df)

In [None]:
df.head(5)

<h1><center>Data Dictionary</center></h1>


|Feature          | Description|
| :---------------: | :---------------------------------- |
| home_value (target) | The total tax assessed value of the parcel  |
| squarefeet:  | Calculated total finished living area of the home |
| bathrooms:   |  Number of bathrooms in home including fractional bathrooms |
| bedrooms: | Number of bedrooms in home  |
| yearbuilt:  |  The Year the principal residence was built   |
| fireplace: | fireplace on property (if any = 1) |
| deck:  | deck on property (if any = 1) |
| pool:  | pool on property (if any = 1) |
| garage: | garage on property (if any = 1) |
| county: | FIPS code for californian counties: 6111 Ventura County, 6059  Orange County, 6037 Los Angeles County |
| home_age: | The age of the home in 2017   |
| optional_features: | If a home has any of the following: fireplace, deck, pool, garage, it is noted as 1 |
| additional features: | Encoded values for categorical data |

# Prepare:

In [None]:
# prepare data 
df = w.zillow_prep(df)

In [None]:
# split data: train, validate and test
train, validate, test = w.split_data(df)

prepare actions:
* After the following steps I retained 95.9% of original data:
    * Outliers were removed
    (to better fit the definition of Single Family Property):
    
        * Beds above 6 
        * Baths above 6 
        * Home values above 1_750_000
        * Rows with both 0 beds and 0 baths 
        
    * For the following features it was assumed null values meant the structure did not exist on property:
        * fireplace (45198)
        * deck (52052)
        * pool (41345)
        * garage (34425)
            
    * The following null values were dropped:
        * home_value (1)
        * squarefeet (82)
        * yearbuilt (116)

* Encoded categorical variables
* Split data into train, validate and test 
    * Approximately: train 56%, validate 24%, test 20%
    * Stratified on 'churn'


In [None]:
df 

In [None]:
train.head()

In [None]:
train.shape, validate.shape, test.shape

# Looking at the data

In [None]:
train.head(10)

# Data Summary

In [None]:
train.describe().T

# Explore:

## Does contract type affect churn?

In [None]:
# Obtain plot for contract type vs churn
# NOTE(review): no module is imported under the alias `e` in the Imports cell — confirm where get_plot_contract comes from
e.get_plot_contract(train)

* **It seems that customers with a two-year contracts churn less than customers with month-to-month contract.**

**I will now conduct a chi-square test to determine if there is an association between contract type and churn.**

* The confidence interval is 95%
* Alpha is set to 0.05 

$H_0$: There is **no** relationship between contract type and churn.

$H_a$: There is a relationship between contract type and churn.

In [None]:
# Obtain chi-square on Contract type
# NOTE(review): no module is imported under the alias `e` in the Imports cell — confirm where get_chi2_contract comes from
e.get_chi2_contract(train)

The p-value is less than alpha. **There is evidence to support that contract type has an association with churn.** I believe that contract type is a driver of churn. Adding an encoded version of this feature to the model will likely increase the model's accuracy.

# Exploration Summary

* A
* B
* C

# Features that will be included in my model

* **A**  has a significant statistical relationship to 
* **B**  has a significant statistical relationship to 
* **C**  has a significant statistical relationship to 


# Features that will be not included in my model

* **D** did not ..
* **Other features** have ..

# Modeling:

## Scaling

# Prepare  data for models

In [None]:
# prepare data for modeling
X_train, y_train, X_validate, y_validate, X_test, y_test = m.model_data_prep(train, validate, test)

In [None]:
train.head()

In [None]:
def model_data_prep(train, validate, test):
    """Scale the three splits and separate features from the target.

    The numeric columns are scaled through scale_data; ``home_value`` and
    ``county`` are then dropped from each scaled split to form X, while y is
    the ``home_value`` column of the corresponding unscaled split.

    Returns X_train, y_train, X_validate, y_validate, X_test, y_test.
    """
    numeric_columns = ['squarefeet', 'bathrooms', 'bedrooms', 'yearbuilt', 'home_age']
    scaled_splits = scale_data(train, validate, test, columns_to_scale=numeric_columns)

    # columns that must not be used as predictors
    non_features = ['home_value', 'county']
    X_train_scaled, X_validate_scaled, X_test_scaled = (
        split.drop(columns=non_features) for split in scaled_splits
    )

    return (X_train_scaled, train.home_value,
            X_validate_scaled, validate.home_value,
            X_test_scaled, test.home_value)

In [None]:
X_train_scaled

In [None]:
y_train

In [None]:
from sklearn.preprocessing import MinMaxScaler

def scale_data(train, 
               validate, 
               test, 
               columns_to_scale=['squarefeet','bathrooms','bedrooms','yearbuilt','home_age']):
    '''
    Return scaled copies of train, validate and test.

    A MinMaxScaler is fitted on the train split only and then applied to the
    columns_to_scale of all three splits. The input frames are left untouched.
    '''
    # the scaler learns its min/max from train only
    scaler = MinMaxScaler()
    scaler.fit(train[columns_to_scale])

    def _scaled_copy(split):
        # copy the split and overwrite the selected columns with scaled values
        scaled = split.copy()
        values = pd.DataFrame(scaler.transform(split[columns_to_scale]),
                              columns=split[columns_to_scale].columns.values)
        # restore the split's own index so the assignment lines up row by row
        scaled[columns_to_scale] = values.set_index([split.index.values])
        return scaled

    return _scaled_copy(train), _scaled_copy(validate), _scaled_copy(test)

In [None]:
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(train, 
               validate, 
               test, 
               columns_to_scale=['squarefeet','bathrooms','bedrooms','yearbuilt','home_age'])

In [None]:
# Setup X and y
X_train_scaled = X_train_scaled.drop(columns=['home_value','county'])
y_train = train.home_value

X_validate_scaled = X_validate_scaled.drop(columns=['home_value','county'])
y_validate = validate.home_value

X_test_scaled = X_test_scaled.drop(columns=['home_value','county'])
y_test = test.home_value

In [None]:
X_train_scaled

# Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:


#scores.loc[len(scores.index)] = [key, f, RMSE_baseline, RMSE, R2, RMSE_val, R2_val, diff]

In [None]:
# set up dataframe for predictions, add actual values
train_pred = pd.DataFrame({
    'actual': train.home_value
}) 
validate_pred = pd.DataFrame({
    'actual': validate.home_value
}) 

## Baseline

In [None]:
# add baseline models
# The baseline values are learned from train only and reused for validate,
# so no information from the validate target leaks into its own baseline.
baseline_mean = train.home_value.mean()
baseline_median = train.home_value.median()

train_pred['baseline_mean'] = baseline_mean
validate_pred['baseline_mean'] = baseline_mean

train_pred['baseline_median'] = baseline_median
validate_pred['baseline_median'] = baseline_median

# OLS

In [None]:

# 1. make the thing
lm = LinearRegression()
# 2. fit the thing
lm.fit(X_train, y_train)
# 3. use the thing (make predictions)

train_pred['OLS_Model'] = lm.predict(X_train)
validate_pred['OLS_Model'] = lm.predict(X_validate)

In [None]:
train_pred

In [None]:
validate_pred

## Using Kbest 7 features

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 7 features
f_selector = SelectKBest(f_regression, k=7)

# find the top 7 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()


In [None]:
X_train[f_feature]

In [None]:
X_train.columns.to_list()

In [None]:
# 1. make the thing
lm = LinearRegression()
# 2. fit the thing
lm.fit(X_train[f_feature], y_train)
# 3. use the thing (make predictions)

train_pred['OLS_Model_f7'] = lm.predict(X_train[f_feature])
validate_pred['OLS_Model_f7'] = lm.predict(X_validate[f_feature])

In [None]:
train_pred

# Using Kbest 4 features

In [None]:
# parameters: f_regression stats test, give me 4 features
f_selector = SelectKBest(f_regression, k=4)

# find the top 4 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()

# 1. make the thing
lm = LinearRegression()
# 2. fit the thing
lm.fit(X_train[f_feature], y_train)
# 3. use the thing (make predictions)

train_pred['OLS_Model_f4'] = lm.predict(X_train[f_feature])
validate_pred['OLS_Model_f4'] = lm.predict(X_validate[f_feature])

In [None]:
X_train[f_feature]

In [None]:
train_pred

# Using Kbest 3 features

In [None]:
# parameters: f_regression stats test, give me 3 features
f_selector = SelectKBest(f_regression, k=3)

# find the top 3 X's correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

# get list of top K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()

# 1. make the thing
lm = LinearRegression()
# 2. fit the thing
lm.fit(X_train[f_feature], y_train)
# 3. use the thing (make predictions)

train_pred['OLS_Model_f3'] = lm.predict(X_train[f_feature])
validate_pred['OLS_Model_f3'] = lm.predict(X_validate[f_feature])

In [None]:
X_train[f_feature]

In [None]:
train_pred

# OLS_ RFE  features = 7  

In [None]:
columns = X_train.columns.to_list()

In [None]:
X_train

In [None]:
from sklearn.feature_selection import RFE
lm = LinearRegression()


# 1. Select the top 7 features with RFE
rfe = RFE(lm, n_features_to_select=7)
rfe.fit(X_train, y_train)
selected = X_train.columns[rfe.support_]
print('selected top 7 features:', selected)
# Build the named DataFrame before fitting, so the model is fitted and later
# used with the same feature names (fitting on the bare array and predicting
# on a DataFrame triggers a feature-name mismatch warning in scikit-learn).
X_train_rfe = pd.DataFrame(rfe.transform(X_train), columns=selected, index=X_train.index)
# 2. Use the transformed x in our model
lm.fit(X_train_rfe, y_train)

train_pred['OLS_rfe7'] = lm.predict(X_train_rfe)

In [None]:
# 3. Make predictions


X_validate_rfe = rfe.transform(X_validate)
#Convert to df
X_validate_rfe = pd.DataFrame(X_validate_rfe, columns = X_validate.columns[rfe.support_], index = X_validate.index)

validate_pred['OLS_rfe7'] = lm.predict(X_validate_rfe)

In [None]:
train_pred

# OLS_RFE 4 features

In [None]:
lm = LinearRegression()


# 1. Select the top 4 features with RFE
rfe = RFE(lm, n_features_to_select=4)
rfe.fit(X_train, y_train)
selected = X_train.columns[rfe.support_]
print('selected top 4 features:', selected)
# Build named DataFrames before fitting, so the model is fitted and used with
# the same feature names (fitting on the bare array and predicting on a
# DataFrame triggers a feature-name mismatch warning in scikit-learn).
X_train_rfe = pd.DataFrame(rfe.transform(X_train), columns=selected, index=X_train.index)
X_validate_rfe = pd.DataFrame(rfe.transform(X_validate), columns=selected, index=X_validate.index)

# 2. Use the transformed x in our model
lm.fit(X_train_rfe, y_train)

# 3. Make predictions
train_pred['OLS_rfe4'] = lm.predict(X_train_rfe)
validate_pred['OLS_rfe4'] = lm.predict(X_validate_rfe)

# OLS_RFE 3 features

In [None]:
lm = LinearRegression()


# 1. Select the top 3 features with RFE
rfe = RFE(lm, n_features_to_select=3)
rfe.fit(X_train, y_train)
selected = X_train.columns[rfe.support_]
print('selected top 3 features:', selected)
# Build named DataFrames before fitting, so the model is fitted and used with
# the same feature names (fitting on the bare array and predicting on a
# DataFrame triggers a feature-name mismatch warning in scikit-learn).
X_train_rfe = pd.DataFrame(rfe.transform(X_train), columns=selected, index=X_train.index)
X_validate_rfe = pd.DataFrame(rfe.transform(X_validate), columns=selected, index=X_validate.index)

# 2. Use the transformed x in our model
lm.fit(X_train_rfe, y_train)

# 3. Make predictions
train_pred['OLS_rfe3'] = lm.predict(X_train_rfe)
validate_pred['OLS_rfe3'] = lm.predict(X_validate_rfe)

# OLS_RFE 2 features

In [None]:
lm = LinearRegression()


# 1. Select the top 2 features with RFE
rfe = RFE(lm, n_features_to_select=2)
rfe.fit(X_train, y_train)
selected = X_train.columns[rfe.support_]
print('selected top 2 features:', selected)
# Build named DataFrames before fitting, so the model is fitted and used with
# the same feature names (fitting on the bare array and predicting on a
# DataFrame triggers a feature-name mismatch warning in scikit-learn).
X_train_rfe = pd.DataFrame(rfe.transform(X_train), columns=selected, index=X_train.index)
X_validate_rfe = pd.DataFrame(rfe.transform(X_validate), columns=selected, index=X_validate.index)

# 2. Use the transformed x in our model
lm.fit(X_train_rfe, y_train)

# 3. Make predictions
train_pred['OLS_rfe2'] = lm.predict(X_train_rfe)
validate_pred['OLS_rfe2'] = lm.predict(X_validate_rfe)

In [None]:
train_pred

# Polynomial

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree 2

In [None]:
# 1. Generate Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    # get_feature_names() is gone from current scikit-learn; use the *_out variant
    columns=poly.get_feature_names_out(X_train.columns),
    index=train.index,
)
X_train_poly.head()

# 2. Use the features
lm = LinearRegression()
lm.fit(X_train_poly, y_train)


train_pred['poly_d2'] = lm.predict(X_train_poly)

X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=poly.get_feature_names_out(X_validate.columns),
    index=validate.index,
)
validate_pred['poly_d2'] = lm.predict(X_validate_poly)


In [None]:
X_validate

# Degree 2 interactions ONLY

In [None]:
# 1. Generate Polynomial Features (interaction terms only)
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    # get_feature_names() is gone from current scikit-learn; use the *_out variant
    columns=poly.get_feature_names_out(X_train.columns),
    index=train.index,
)
X_train_poly.head()

# 2. Use the features
lm = LinearRegression()
lm.fit(X_train_poly, y_train)


train_pred['Ipoly_d2'] = lm.predict(X_train_poly)

X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=poly.get_feature_names_out(X_validate.columns),
    index=validate.index,
)
validate_pred['Ipoly_d2'] = lm.predict(X_validate_poly)

In [None]:
validate_pred

# Degree 3

In [None]:
# 1. Generate Polynomial Features
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    # get_feature_names() is gone from current scikit-learn; use the *_out variant
    columns=poly.get_feature_names_out(X_train.columns),
    index=train.index,
)
X_train_poly.head()

# 2. Use the features
lm = LinearRegression()
lm.fit(X_train_poly, y_train)


train_pred['poly_d3'] = lm.predict(X_train_poly)


X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=poly.get_feature_names_out(X_validate.columns),
    index=validate.index,
)
validate_pred['poly_d3'] = lm.predict(X_validate_poly)

In [None]:
X_validate_poly.head()

# DEGREE 3 Interactions Only

In [None]:
# 1. Generate Polynomial Features (interaction terms only)
# degree must be 3 here: with degree=2 this cell just repeats the
# degree-2 interaction model under the name 'Ipoly_d3'
poly = PolynomialFeatures(degree=3, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    # get_feature_names() is gone from current scikit-learn; use the *_out variant
    columns=poly.get_feature_names_out(X_train.columns),
    index=train.index,
)
X_train_poly.head()

# 2. Use the features
lm = LinearRegression()
lm.fit(X_train_poly, y_train)


train_pred['Ipoly_d3'] = lm.predict(X_train_poly)

X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=poly.get_feature_names_out(X_validate.columns),
    index=validate.index,
)
validate_pred['Ipoly_d3'] = lm.predict(X_validate_poly)

In [None]:
validate_pred

# Degree 4

In [None]:
# 1. Generate Polynomial Features
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    # get_feature_names() is gone from current scikit-learn; use the *_out variant
    columns=poly.get_feature_names_out(X_train.columns),
    index=train.index,
)
X_train_poly.head()

# 2. Use the features
lm = LinearRegression()
lm.fit(X_train_poly, y_train)


train_pred['poly_d4'] = lm.predict(X_train_poly)

X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=poly.get_feature_names_out(X_validate.columns),
    index=validate.index,
)
validate_pred['poly_d4'] = lm.predict(X_validate_poly)

# DEGREE 4 interaction Only

In [None]:
# 1. Generate Polynomial Features (interaction terms only)
# degree must be 4 here: with degree=2 this cell just repeats the
# degree-2 interaction model under the name 'Ipoly_d4'
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=True)
poly.fit(X_train)
X_train_poly = pd.DataFrame(
    poly.transform(X_train),
    # get_feature_names() is gone from current scikit-learn; use the *_out variant
    columns=poly.get_feature_names_out(X_train.columns),
    index=train.index,
)
X_train_poly.head()

# 2. Use the features
lm = LinearRegression()
lm.fit(X_train_poly, y_train)


train_pred['Ipoly_d4'] = lm.predict(X_train_poly)

X_validate_poly = pd.DataFrame(
    poly.transform(X_validate),
    columns=poly.get_feature_names_out(X_validate.columns),
    index=validate.index,
)
validate_pred['Ipoly_d4'] = lm.predict(X_validate_poly)

# Evaluate Models

In [None]:
train_pred

In [None]:
def evaluate_metrics(df, col, actual):
    """Compute regression error metrics for one prediction column.

    Parameters
    ----------
    df : DataFrame holding the predictions.
    col : name of the prediction column in ``df``.
    actual : actual target values, in the same row order as ``df``.

    Returns
    -------
    (MSE, SSE, RMSE, ESS, TSS, R2)

    Raises
    ------
    ValueError if the inputs are empty or differ in length.

    Values are compared by position, not by index label.
    """
    y_true = np.asarray(actual, dtype=float)
    y_pred = np.asarray(df[col], dtype=float)
    if y_true.size == 0 or y_true.size != y_pred.size:
        raise ValueError("actual and df[col] must be non-empty and of equal length")

    target_mean = y_true.mean()
    SSE = float(((y_true - y_pred) ** 2).sum())
    MSE = SSE / y_true.size
    RMSE = MSE ** .5
    ESS = float(((y_pred - target_mean) ** 2).sum())
    # TSS is computed directly from the actual values; ESS + SSE equals it
    # only for a least-squares fit with intercept on its own training data.
    TSS = float(((y_true - target_mean) ** 2).sum())
    # True R^2 (1 - SSE/TSS). Explained variance ignores a constant bias in
    # the predictions and so can overstate the fit.
    R2 = 1 - SSE / TSS if TSS else float('nan')
    return MSE, SSE, RMSE, ESS, TSS, R2

In [None]:
train.tax_value - train.yhat

In [None]:
# baseline prediction: mean of the target column (named home_value in this notebook)
baseline = train.home_value.mean()
train['baseline']=baseline
baseline

In [None]:
RMSE_baseline-RMSE

In [None]:
train.columns

In [None]:
col = train_pred.columns.to_list()

In [None]:
from sklearn.metrics import mean_squared_error,explained_variance_score
# Collect one row of metrics per prediction column of train_pred.
# DataFrame.append was removed in pandas 2.0, so the rows are gathered in a
# list and the frame is built once at the end.
metric_rows = []
for i in col:
    MSE, SSE, RMSE, ESS, TSS, R2 = evaluate_metrics(train_pred, i, y_train)
    metric_rows.append({
        'model': i,
        'MSE': MSE,
        'SSE': SSE,
        'RMSE': RMSE,
        'ESS': ESS,
        'TSS': TSS,
        'R2': R2})

metric_df = pd.DataFrame(metric_rows, columns=['model','MSE','SSE','RMSE','ESS','TSS','R2'])
    

In [None]:
metric_df

In [None]:
metric_df[['model','RMSE','R2']]


In [None]:
metric_df[['model','RMSE','R2']].sort_values(by='R2',ascending=False)

In [None]:
# Collect one row of metrics per prediction column of validate_pred.
# DataFrame.append was removed in pandas 2.0, so the rows are gathered in a
# list and the frame is built once at the end.
col = validate_pred.columns.to_list()
metric_rows = []
for i in col:
    MSE, SSE, RMSE, ESS, TSS, R2 = evaluate_metrics(validate_pred, i, y_validate)
    metric_rows.append({
        'model': i,
        'MSE': MSE,
        'SSE': SSE,
        'RMSE': RMSE,
        'ESS': ESS,
        'TSS': TSS,
        'R2': R2})

metric_val = pd.DataFrame(metric_rows, columns=['model','MSE','SSE','RMSE','ESS','TSS','R2'])
    


In [None]:
metric_val[['model','RMSE','R2']]

In [None]:
validate_pred['poly_d4']

In [None]:
metric_val2 = pd.DataFrame(columns =[])
metric_val2['residual'] = validate_pred['poly_d4'] - validate_pred['actual']
metric_val2['residual^2'] = metric_val2.residual ** 2
SSE = sum(metric_val2['residual^2'])
MSE = SSE/len(metric_val2)
from math import sqrt
RMSE = sqrt(MSE)
metric_val2['RMSE']=RMSE

In [None]:
metric_val2

In [None]:
evs = explained_variance_score(validate_pred.actual, validate_pred.poly_d4)
print('Explained Variance = ', round(evs,3))

In [None]:
evs = explained_variance_score(train_pred.actual, train_pred.poly_d3)
print('Explained Variance = ', round(evs,3))

In [None]:
df['yhat_baseline'] = df['y'].mean()
df['yhat'] = ols_model.predict(df[['x']])

df['residual'] = df['yhat'] - df['y']
df['residual_baseline'] = df['yhat_baseline'] - df['y']

df['residual^2'] = df.residual ** 2

df['residual_baseline^2'] = df.residual_baseline ** 2

In [None]:
def plot_residuals(y, yhat,df):
    '''
    Show a scatter plot of the residuals against the actual target values.
    y: actual values of the target
    yhat: name of the column in df that holds the predicted values
    df: DataFrame containing the prediction column
    '''
    # residual = actual minus predicted
    errors = y - df[yhat]

    # actual values on the x axis, residuals on the y axis
    plt.scatter(x=y, y=errors)

    # title and axis labels
    plt.title('Residual vs Home Value Plot')
    plt.xlabel('Home Value')
    plt.ylabel('Residuals')

    # render the figure
    plt.show()

In [None]:
col = train_pred.columns.to_list()
col

In [None]:
for i in col:  
    print(i)
    plot_residuals(y_train, i, train_pred)





In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def regression_errors(actual, yhat,df):
    """Print MSE, SSE, RMSE, TSS and ESS for one prediction column.

    actual: actual target values, in the same row order as df
    yhat: name of the column in df that holds the predicted values
    df: DataFrame containing the prediction column

    Values are compared by position, not by index label. Nothing is returned.
    """
    y_true = np.asarray(actual, dtype=float)
    y_pred = np.asarray(df[yhat], dtype=float)
    target_mean = y_true.mean()

    residual = y_true - y_pred

    sse = float((residual ** 2).sum())
    mse = sse / len(y_true)
    rmse = mse ** 0.5
    # Total variation is measured around the mean of the ACTUAL values;
    # centring on the prediction mean is only right when the two means agree.
    tss = float(((y_true - target_mean) ** 2).sum())
    ess = float(((y_pred - target_mean) ** 2).sum())
    print(f""" 
    MSE: {round(mse,2)}
    SSE: {round(sse,2)}
    RMSE: {round(rmse,2)}
    TSS: {round(tss,2)}
    ESS: {round(ess,2)}
    """)

In [None]:
for i in col:  
    print(i)
    regression_errors(y_train, i, train_pred)



* metric

In [None]:
# prep data for modeling
# NOTE(review): an earlier cell calls m.model_data_prep(train, validate, test) — confirm which name the model module actually exposes
x_train,y_train,x_validate,y_validate, x_test, y_test = m.model_prep(train,validate,test)

**The ....** 

# Comparing Models

* All ....

# Model on Test data

In [None]:
# NOTE(review): the helper name suggests a logistic (classification) model, while this notebook predicts a continuous home_value — confirm this is the intended final model
m.get_logit_model(x_train,y_train,x_test,y_test, True)

## Modeling Summary

* A
* B

# Conclusion

## Exploration



* A
* B

## Modeling

**The final model performed....**

## Recommendations

* A
* B
* C

## Next Steps

* A
* B
* C