In [22]:
import pandas as pd
import numpy as np
import prepare
import acquire
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings("ignore")

In [10]:
# Build every split in one call to the project's prepare module, using
# 'taxvaluedollarcnt' as the target. The call returns ten objects, unpacked
# in the order below (presumably pandas objects -- confirm in prepare.py).
(
    train,
    y_train,
    y_validate,
    y_test,
    X_train_scaled,
    X_train,
    X_validate,
    X_test,
    X_validate_scaled,
    X_test_scaled,
) = prepare.prep_zillow_data('taxvaluedollarcnt')

In [11]:
# Preview the first five rows of the scaled training features.
X_train_scaled.head(5)

Unnamed: 0,unit_sq_feet,bedroom_count,bathroom_count,lot_size_sq_feet
0,0.078248,0.2,0.3,0.003961
1,0.073514,0.3,0.2,0.002864
2,0.078117,0.4,0.2,0.003505
3,0.065952,0.3,0.2,0.002623
4,0.142688,0.4,0.3,0.00671


# Modeling
We use the variables that we believe strongly correlate with house tax value to create a model that will allow us to predict house tax value.
***

## Baseline
We'll now construct a baseline and gauge its accuracy to determine what accuracy an alternate model would need in order to be viable.

In [14]:
# The baseline predicts the mean training tax value for every house.
mean_tax_value = y_train['tax_dollar_value'].mean()

# assign() returns a new DataFrame, so the original y_train is left untouched
y_train_bl = y_train.assign(baseline_prediction=mean_tax_value)

y_train_bl.head()

Unnamed: 0,tax_dollar_value,baseline_prediction
6531,209099.0,475181.64038
6057,271949.0,475181.64038
2580,250933.0,475181.64038
5490,236264.0,475181.64038
1229,818000.0,475181.64038


In [18]:
# Root mean squared error of the mean-value baseline against the actual
# training tax values; this is the score our models have to beat.
RMSE_bl = np.sqrt(mean_squared_error(y_train.tax_dollar_value, y_train_bl.baseline_prediction))

# Print as a whole number with thousands separators. The old "{:.100}" spec
# requested 100 significant digits and relied on a separate round() call.
print("Baseline Root mean squared error: {:,.0f}".format(RMSE_bl))

Baseline Root mean squared error: 698688.0


Our baseline's RMSE value is 698,688. We'll need to construct at least one model that can attain an RMSE value lower than this.

## Feature Engineering
We'll be using Recursive Feature Elimination to rank each of our features in terms of their viability in predicting house value.

In [20]:
# Rebuild the splits before feature engineering. This repeats the
# prepare.prep_zillow_data call made earlier in the notebook with the same
# 'taxvaluedollarcnt' target and rebinds all ten names.
(
    train,
    y_train,
    y_validate,
    y_test,
    X_train_scaled,
    X_train,
    X_validate,
    X_test,
    X_validate_scaled,
    X_test_scaled,
) = prepare.prep_zillow_data('taxvaluedollarcnt')

Using recursive feature elimination to find the single best feature and rank the remaining features for predicting tax value dollar count.


In [25]:
# Rank every feature with Recursive Feature Elimination (RFE) driven by a
# linear regression, then show the ranking as a DataFrame sorted best-first.

# creating linear regression object
lm = LinearRegression()

# creating RFE object that uses our linear regression object and keeps only the
# single best feature; n_features_to_select is passed by keyword because newer
# scikit-learn releases reject it as a positional argument
rfe = RFE(lm, n_features_to_select=1)

# fitting the rfe object and reducing the data to the selected feature
# NOTE(review): RFE ranks linear models by coefficient size and X_train is the
# unscaled data, so the ranking may reflect feature units -- consider
# X_train_scaled and confirm.
x_rfe = rfe.fit_transform(X_train, y_train)

# fitting our linear regression model to data
lm.fit(X_train, y_train)

# storing array of boolean values that are True for the single selected feature
# and False otherwise
mask = rfe.support_

# creating list of the top feature using boolean mask
rfe_features = X_train.loc[:, mask].columns.tolist()

# creating array of ranking list (1 = best; aligned with X_train's columns)
var_ranks = rfe.ranking_

# creating list of feature names from the same frame RFE was fitted on, so
# names and ranks are guaranteed to line up
var_names = X_train.columns.tolist()

# combine ranks and names into a df
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

# sort the df by rank
rfe_ranks_df.sort_values('Rank')

Unnamed: 0,Var,Rank
2,bathroom_count,1
1,bedroom_count,2
0,unit_sq_feet,3
3,lot_size_sq_feet,4


- RFE shows that bathroom_count is the most useful feature for predicting house value.
- bedroom_count is second
- unit_sq_feet is third
- lot_size_sq_feet is last

## Non-Baseline Models

### Model 1

In [29]:
# Model 1: ordinary least squares regression on the scaled training features.

# creating linear regression object
lm_1 = LinearRegression()

# fitting model to data -- fit lm_1 itself; the original cell called lm.fit,
# which left lm_1 unfitted
lm_1.fit(X_train_scaled, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)