### Import libraries and functions.

In [None]:
# Import my favorite imports.
import pandas as pd
import env
import acquire
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
from scipy.stats import pearsonr, spearmanr
from scipy import stats

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score

import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

## Project Goals:

### Explore the 2017 Zillow data and create a model that can predict the tax-assessed value of single-family properties.

### Acquire data and split.

In [None]:
# Acquire data, clean it up, and split into train, validate, and test datasets.
# `wrangle_zillow` lives in the project-local `acquire` module; the three
# dataframes are unpacked in the order train, validate, test.
train, validate, test = acquire.wrangle_zillow()

### Verify no null values and datatypes.

In [None]:
# Verify that each dataset has no null values and that the datatypes are correct.
# `DataFrame.info()` prints its report and returns None, so it is called as a
# statement once per dataset; packing the three calls into one tuple expression
# would make the cell end by echoing `(None, None, None)`.
# A header line is printed first so the three reports can be told apart.
for split_name, split_df in (("train", train), ("validate", validate), ("test", test)):
    print(f"--- {split_name} ---")
    split_df.info()

### Visualize dataframe.

In [None]:
# Visualize the 'train' dataset.
train.head()

### Scale data and update dataframe with columns for scaled data.

In [None]:
# Min-Max scaler that will be applied to the independent variables.
scaler = MinMaxScaler()

# Features whose scaled versions should be added to each dataset.
# NOTE(review): 'fips' looks like a county code rather than a continuous
# measurement — confirm that Min-Max scaling it is intended.
columns_to_scale = ['sqft', 'bedrooms', 'bathrooms', 'fips']

# `acquire.add_scaled_columns` returns the three datasets with the scaled columns added.
train, validate, test = acquire.add_scaled_columns(
    train, validate, test, scaler, columns_to_scale
)

In [None]:
# Visualize scaled data columns.
train.head()

### Create baseline prediction.

In [None]:
# Set a baseline prediction as the average of the tax values of all the homes.
baseline = train.tax_value.mean()

In [None]:
baseline

### Create variables for independent variables to test against target.

In [None]:
# Name of the column the models will predict.
target = "tax_value"

# For every split, X is the dataframe with the target removed and y is the
# target column on its own (a Series).
X_train, y_train = train.drop(columns=[target]), train[target]
X_validate, y_validate = validate.drop(columns=[target]), validate[target]
X_test, y_test = test.drop(columns=[target]), test[target]

# Print a summary of the training features.
X_train.info()

In [None]:
# Visualize preview of independent variables' dataframe and make sure target is not included.
X_train.head()

In [None]:
# Remove the columns holding the original (unscaled) values of the independent
# variables from each feature dataframe.
unscaled_columns = ['sqft', 'bedrooms', 'bathrooms', 'fips']
X_train_scaled, X_validate_scaled, X_test_scaled = (
    features.drop(columns=unscaled_columns)
    for features in (X_train, X_validate, X_test)
)

In [None]:
# Visualize the dataframe of only scaled values.
X_train_scaled.head()

### Identify top two features that influence the target.

In [None]:
# Call the 'select_kbest' function housed in the 'acquire' module to identify top two features.
acquire.select_kbest(X_train_scaled, y_train, 2)

### Convert the target Series into dataframes so that prediction columns can be added alongside the actual values.

In [None]:
# y_train and y_validate are Series here (they were selected with train[target]
# and validate[target]). Wrapping them in DataFrames lets later cells attach
# extra columns, such as 'tax_value_mean', next to the actual values.
# y_test is not converted in this cell.
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

In [None]:
# Store the baseline (the mean tax value of the train set) as a constant
# prediction column on both target dataframes.
tax_value_mean = baseline
for target_frame in (y_train, y_validate):
    target_frame['tax_value_mean'] = baseline

In [None]:
# Visualize target value with baseline prediction.
y_train.head()

In [None]:
# Baseline RMSE: the square root of the mean squared error between the actual
# tax values and the constant mean prediction, computed for train and validate.
mse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_mean)
mse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_mean)

rmse_train = mse_train ** 0.5
rmse_validate = mse_validate ** 0.5

In [None]:
# Report the baseline RMSE for both splits, rounded to two decimals.
print(
    "RMSE using Mean\nTrain/In-Sample: ",
    round(rmse_train, 2),
    "\nValidate/Out-of-Sample: ",
    round(rmse_validate, 2),
)

### Create a dataframe that will hold the name of each model used and its RMSE values for the train and validate datasets.

In [None]:
# One-row metrics table: the baseline model's name with its train and validate RMSE.
metric_df = pd.DataFrame(
    {
        'model': ['mean_baseline'],
        'RMSE_train': [rmse_train],
        'RMSE_validate': [rmse_validate],
    }
)

In [None]:
metric_df

### Call the 'model_evaluation' function in the 'acquire' module to update 'metric_df' with different models and their RMSE values. All of the models were fit on the scaled values of the independent variables.

In [None]:
# Score the candidate models via the project-local `acquire` module.
# NOTE(review): this reassigns `metric_df`, replacing the one-row baseline
# frame built earlier; presumably `model_evaluation` includes the baseline row
# in what it returns — confirm in acquire.py.
# NOTE(review): the plotting cell later reads `tax_value_pred_lm`,
# `tax_value_pred_glm`, and `tax_value_pred_lars` from `y_validate`; nothing in
# this notebook creates them, so this call presumably adds them in place — confirm.
metric_df = acquire.model_evaluation(X_train_scaled, X_validate_scaled, y_train, y_validate)

In [None]:
# Gap between validate and train RMSE for each model; a positive value means
# the error is larger on the validate set than on the train set.
metric_df['difference'] = metric_df.RMSE_validate - metric_df.RMSE_train

In [None]:
metric_df

### Plotting Actual vs. Predicted Values

In [None]:
# Plot each model's validate-set predictions against the actual tax values.
plt.figure(figsize=(16, 8))
actual_values = y_validate.tax_value

# Reference lines: the constant baseline prediction and the perfect-prediction
# diagonal (both hidden from the legend).
# NOTE(review): the annotation positions are given in data coordinates; x=16 and
# x=10 sit at the far left of an axis measured in tax-value dollars — confirm
# the labels land where intended.
plt.plot(actual_values, y_validate.tax_value_mean, alpha=.5, color="gray", label='_nolegend_')
plt.annotate("Baseline: Predict Using Mean", (16, 230000))
plt.plot(actual_values, actual_values, alpha=.5, color="blue", label='_nolegend_')
plt.annotate("The Ideal Line: Predicted = Actual", (10, 20000), rotation=26)

# One scatter layer per model: (prediction column, marker color, legend label).
model_layers = (
    ("tax_value_pred_lm", "red", "Model: LinearRegression"),
    ("tax_value_pred_glm", "yellow", "Model: TweedieRegressor"),
    ("tax_value_pred_lars", "green", "Model: LassoLars"),
)
for prediction_column, marker_color, legend_label in model_layers:
    plt.scatter(
        actual_values,
        y_validate[prediction_column],
        alpha=.5,
        color=marker_color,
        s=100,
        label=legend_label,
    )

plt.legend()
plt.xlabel("Actual Tax Value")
plt.ylabel("Predicted Tax Value")
plt.title("Where are predictions more extreme? More modest?")
plt.show()

### Going with the OLS model to use on the test set.

In [None]:
# Fit ordinary least squares on the scaled training features (fit returns the
# estimator itself), then predict tax values for the held-out test set.
lm = LinearRegression().fit(X_train_scaled, y_train.tax_value)
y_test_predictions = lm.predict(X_test_scaled)

In [None]:
# Test-set RMSE: the square root of the mean squared error between the actual
# test tax values and the OLS predictions.
mse_test = mean_squared_error(y_test, y_test_predictions)
rmse_test = mse_test ** 0.5

### Compare OLS model with baseline prediction.

In [None]:
# Print the test-set RMSE of the OLS model, then display the train/validate
# metrics without the 'difference' column. `drop` returns a new dataframe, so
# `metric_df` itself keeps the column.
print('OLS RMSE: ', rmse_test)
metric_df.drop(columns='difference')

### Conclusions & Recommendations

* With a baseline RMSE of 197,243 and a model RMSE of 187,977, my model is only a slight improvement over the baseline.
* At this time I do not feel confident in using my model to predict the tax value of a property.

### Next Steps

* With more time I would like to investigate different features and how they influence the target to improve my current OLS model.