In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
from math import sqrt
import pandas_profiling

import env
import acquire
import prep
import explore_final
import scipy.stats as stats
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from statsmodels.formula.api import ols

## Initial Hypothesis

Square footage and home value extremes are leading drivers of log error.

## Acquire and summarize data

The initial DataFrame contained single-family residence homes, with 40+ fields brought in.

After reviewing the data for missing values, multicollinearity, and low correlation, we reduced the data to 14 columns and 54,000 rows. 300 missing values in lotsizesquarefeet were imputed with the column mean so that the column could be kept.

In [None]:
# Query single-family-residence records from the Zillow MySQL database.
# NOTE(review): the query itself lives in prep.prep_df_initial — confirm there.
df = prep.prep_df_initial()

In [None]:
# Preview the first two rows of the initial frame.
df.head(2)

## Data prep and test/train split

Our initial DataFrame was split 70/30 into train and test sets, and a tax_per_sqft field was added. A random state of 123 was used.

In [None]:
# Split df into train and test frames.
# NOTE(review): the tax_per_sqft field is presumably added inside
# prep.get_train_and_test — confirm there.
train, test = prep.get_train_and_test(df)

In [None]:
# Preview the first two rows of the training frame.
train.head(2)

## Initial exploration and feature selection

In [None]:
# Baseline prediction: every row gets the mean log error as its yhat;
# the residual is the actual log error minus that constant baseline.
train['mean_logerror'] = train['logerror'].mean()
train['residual'] = train['logerror'].sub(train['mean_logerror'])

In [None]:
# Preview the training frame with the new mean_logerror and residual columns.
train.head(2)

In [None]:
# Correlation heatmap of the candidate initial features.
subset = df.loc[:, ['bathrooms', 'bedrooms', 'sqft', 'tax_value']]
cor = subset.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Blues, ax=ax)
plt.show()

In [None]:
# Build the feature (X) and target (y) frames used for the baseline work.
# NOTE(review): this splits from the full df rather than from the train/test
# frames created earlier — confirm prep.get_baseline_train_test_split
# reproduces the same split.
X_train, y_train, X_test, y_test = prep.get_baseline_train_test_split(df)

In [None]:
# Preview the first two rows of the baseline feature frame.
X_train.head(2)

## Initial Baseline score

In [None]:
# Baseline MSE: actual log error vs. the constant mean-log-error prediction.
baseline = mean_squared_error(y_true=train['logerror'], y_pred=train['mean_logerror'])
print(baseline)

## MVP

After some basic feature engineering and selection we trained the new model on just sqft and bedrooms. This model performed slightly better than our baseline.

In [None]:
# Fit an OLS model of logerror on sqft and bedrooms, then store its
# in-sample predictions in train['yhat'].
# The formula names the DataFrame columns directly, so neither the fit nor the
# prediction depends on the notebook-level variables x / y, which a later cell
# rebinds. x and y are still assigned for any cell that reads them.
x = train[['sqft', 'bedrooms']]
y = train[['logerror']]
ols_model = ols('logerror ~ sqft + bedrooms', data=train).fit()
train['yhat'] = ols_model.predict(train)

In [None]:
# MSE of the sqft + bedrooms model, then check whether it beats the baseline.
model1 = mean_squared_error(y_true=train['logerror'], y_pred=train['yhat'])
print(model1)
baseline > model1

## Exploration

We began our exploration by clustering on the y variable.

In [None]:
# Wrap y_train in a DataFrame before handing it to the elbow-plot helper
# (presumably the helper expects a 2-D frame — TODO confirm in explore_final).
y_train = pd.DataFrame(y_train)
explore_final.elbow_plot(y_train)

In [None]:
# Cluster on the target (logerror). The helper takes y, X, and num_clusters.
# 6 clusters are requested here — presumably chosen from the elbow plot; TODO confirm.
# Note: this rebinds `train` to the helper's return value.
train = explore_final.target_cluster(y_train,X_train,6)

In [None]:
# Per-cluster column means, ordered from lowest to highest mean log error.
(
    train
    .groupby('cluster')
    .mean()
    .sort_values(by='logerror')
)

### Target clustering

The grouped summary table above shows the clusters sorted from lowest to highest average log error. The summary data shows that the largest/most expensive and the smallest/least expensive houses had the largest absolute log errors.

Unfortunately, this was due to our clusters being very disproportionate in size (see the cluster counts below).

This outcome still gave us a valuable insight: the estimate became less accurate (larger absolute log error) the further a home was from the median price/sqft.

In [None]:
# Number of rows in each cluster, to check how balanced the clusters are.
train['cluster'].value_counts()

### Variable clustering

After using t-tests to explore differences across bedroom and bathroom counts, we found that these features were not providing value, so they were dropped.

The decision was also made to drop location from this model in order to focus only on features related to the houses specifically.

A variable was created for price per sqft and tax value was dropped.

This left us with three variables for the next model: tax_per_sqft, structuretaxvaluedollarcnt, and lotsizesquarefeet

In [None]:
# Reload the prepared frame for the next model.
# NOTE(review): a manual drop of latitude, longitude and tax_value used to
# follow here and was disabled; presumably prep.prep_df now drops those
# columns itself — TODO confirm.
df = prep.prep_df()

In [None]:
# Preview the first two rows, transposed so each column reads as a row.
df.head(2).T

In [None]:
# Re-split the re-prepared df into train/test, then break each split into
# feature (X) and target (y) parts. This rebinds train, test, X_train,
# y_train, X_test and y_test from the earlier cells.
train, test = prep.get_train_and_test(df)
X_train, y_train, X_test, y_test = prep.get_train_test_split(train, test)

In [None]:
# Preview the first two rows of the new feature frame.
X_train.head(2)

In [2]:
# Elbow plot on the feature frame (presumably to pick a cluster count — TODO confirm).
# X_train is defined by the train/test split cell earlier in the notebook; in a
# fresh kernel that cell must be run first or this raises NameError.
explore_final.elbow_plot(X_train)

NameError: name 'X_train' is not defined

In [3]:
# NOTE(review): what bad_dist returns is not visible from this notebook —
# confirm in explore_final. This rebinds the notebook-level name `x`, which an
# earlier cell used for the sqft/bedrooms feature frame.
x = explore_final.bad_dist()