In [None]:
import pandas as pd
import numpy as np
import matt_prepare
import matt_acquire
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, explained_variance_score


In [2]:
train, y_train, y_validate, y_test, X_train_scaled, X_train, X_validate, X_test, X_validate_scaled, X_test_scaled = matt_prepare.prep_zillow_data('taxvaluedollarcnt')

In [3]:
X_train_scaled.head()

Unnamed: 0,unit_sq_feet,bedroom_count,bathroom_count,lot_size_sq_feet
0,0.078248,0.2,0.3,0.003961
1,0.073514,0.3,0.2,0.002864
2,0.078117,0.4,0.2,0.003505
3,0.065952,0.3,0.2,0.002623
4,0.142688,0.4,0.3,0.00671


In [4]:
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.preprocessing import PolynomialFeatures

# Create A Baseline

In [5]:
y_train

Unnamed: 0,taxvaluedollarcnt
6531,209099.0
6057,271949.0
2580,250933.0
5490,236264.0
1229,818000.0
...,...
3441,253006.0
578,454832.0
8645,157915.0
10573,459908.0


In [6]:
y_train['baseline_prediction'] = y_train['taxvaluedollarcnt'].mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
y_train.head()

Unnamed: 0,taxvaluedollarcnt,baseline_prediction
6531,209099.0,475181.64038
6057,271949.0,475181.64038
2580,250933.0,475181.64038
5490,236264.0,475181.64038
1229,818000.0,475181.64038


In [9]:
RMSE_bl = np.sqrt(mean_squared_error(y_train.taxvaluedollarcnt, y_train.baseline_prediction))
print("Baseline (ŷ = ȳ)\n  Root mean squared error: {:.100}".format(RMSE_bl))

Baseline (ŷ = ȳ)
  Root mean squared error: 698687.872686219983734190464019775390625


In [12]:
(y_train.taxvaluedollarcnt).max()

23858374.0

In [13]:
(y_train.taxvaluedollarcnt).min()

11089.0

In [14]:
(y_train.taxvaluedollarcnt).mean()

475181.64037985133

# Feature Engineering

Use K Best to select the 3 best features for predicting tax value dollar count.

In [23]:
train, y_train, y_validate, y_test, X_train_scaled, X_train, X_validate, X_test, X_validate_scaled, X_test_scaled = matt_prepare.prep_zillow_data('taxvaluedollarcnt')

In [24]:
from sklearn.feature_selection import SelectKBest, f_regression


In [25]:
f_selector = SelectKBest(f_regression, k=3)


In [26]:
X_train.shape, y_train.shape

((7266, 4), (7266, 1))

In [29]:
# fitting selector to data
f_selector.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


SelectKBest(k=3, score_func=<function f_regression at 0x7ff24d1f8a70>)

In [30]:
# storing array of boolean values that reflect true if a feature was one of the two selected
# false otherwise
f_support = f_selector.get_support()


In [35]:
# creating list of the top 2 features using boolean mask
f_feature = X_train.loc[:,f_support].columns.tolist()

# printing results
print(str(len(f_feature)), 'k best selected features for predicting tax value dollar count')
print(f_feature)

3 k best selected features for predicting tax value dollar count
['calculatedfinishedsquarefeet', 'bedroomcnt', 'bathroomcnt']


Using recursive feature elimination to find top 3 features for predicting tax value dollar count.


In [37]:
from sklearn.feature_selection import RFE


In [39]:
# creating linear regression object
lm = LinearRegression()

# creating RFE object
# specifying to use our linear regression object and only pick top 3 features
rfe = RFE(lm, 3)

# transforming data using rfe object
x_rfe = rfe.fit_transform(X_train, y_train)

# fitting our linear regression model to data
lm.fit(X_train, y_train)

# storing array of boolean values that reflect true if a feature was one of the three selected
# false otherwise
mask = rfe.support_

# creating list of the top 3 features using boolean mask
rfe_features = X_train.loc[:,mask].columns.tolist()

# displaying results
print(str(len(rfe_features)), 'RFE selected features for predicting tax value dollar amount')
print(rfe_features)

3 RFE selected features for predicting tax value dollar amount
['calculatedfinishedsquarefeet', 'bedroomcnt', 'bathroomcnt']


  y = column_or_1d(y, warn=True)


In [None]:
Creating a table to visual rand

In [40]:
var_ranks = rfe.ranking_
# get the variable names
var_names = X_train_scaled.columns.tolist()
# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
# sort the df by rank
rfe_ranks_df.sort_values('Rank')

Unnamed: 0,Var,Rank
0,unit_sq_feet,1
1,bedroom_count,1
2,bathroom_count,1
3,lot_size_sq_feet,2
