In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans

import scipy.stats as stats

random_state = 42

from math import sqrt
from sklearn.linear_model import LinearRegression, LassoLars 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import sklearn as sk

In [2]:
df = acquire.zillow_data()

Reading from local CSV...


In [3]:
df = prepare.prep_zillow(df)
train, validate, test = prepare.train_validate_test_split(df)
target = 'logerror'

train	 n = 29001
test	 n = 10358
validate n = 12429


In [4]:
outlier_columns = [col for col in df.columns if ((df[col].dtype != 'object') & (col not in [target, 'latitude', 'longitude']))]
train, validate, test = prepare.remove_outliers(train, validate, test, 3, outlier_columns)

train	 n = 15874
test	 n = 5731
validate n = 6750


In [5]:
train, validate, test = prepare.scale_zillow(train, test, validate, target)
scaled_cols = [col for col in train.columns if 'scaled_' in col]
train[scaled_cols].head()

Unnamed: 0,scaled_bathroomcnt,scaled_bedroomcnt,scaled_sqft,scaled_fireplacecnt,scaled_fullbathcnt,scaled_garagecarcnt,scaled_garagetotalsqft,scaled_latitude,scaled_longitude,scaled_lotsizesquarefeet,scaled_poolcnt,scaled_threequarterbathnbr,scaled_yearbuilt,scaled_structuretaxvaluedollarcnt,scaled_taxvaluedollarcnt,scaled_assessmentyear,scaled_age,scaled_taxvalue_per_sqft,scaled_taxvalue_per_bedroom,scaled_taxvalue_per_bathroom
23280,0.2,0.5,0.428773,0.0,0.2,0.0,0.0,0.920835,0.633751,0.865116,0.0,0.0,0.905109,0.419218,0.179412,0.905109,0.094891,0.136174,0.143275,0.172067
55542,0.2,0.5,0.223075,0.0,0.2,0.0,0.0,0.614566,0.375837,0.373451,0.0,0.0,0.59854,0.318709,0.295637,0.59854,0.40146,0.407332,0.238742,0.286719
7420,0.0,0.166667,0.123661,0.0,0.0,0.0,0.0,0.474572,0.563689,0.24082,0.0,0.0,0.277372,0.151239,0.101579,0.277372,0.722628,0.216177,0.166873,0.200407
68942,0.4,0.833333,0.527985,0.0,0.4,0.0,0.0,0.710386,0.483486,0.906679,0.0,0.0,0.525547,0.136226,0.056628,0.525547,0.474453,0.028125,0.025552,0.030687
67157,0.0,0.333333,0.149121,0.0,0.0,0.0,0.0,0.408863,0.602333,0.25107,0.0,0.0,0.532847,0.150186,0.112596,0.532847,0.467153,0.209163,0.120586,0.16415


In [6]:
train, validate, test = prepare.encode_zillow(train, test, validate, target)
enc_cols = [col for col in train.columns if 'enc_' in col]
train[enc_cols].head()

Unnamed: 0,enc_fips_06059,enc_fips_06111,enc_propertycountylandusecode_0101,enc_propertycountylandusecode_0102,enc_propertycountylandusecode_0103,enc_propertycountylandusecode_0104,enc_propertycountylandusecode_0108,enc_propertycountylandusecode_0109,enc_propertycountylandusecode_010G,enc_propertycountylandusecode_010M,enc_propertycountylandusecode_1,enc_propertycountylandusecode_1110,enc_propertycountylandusecode_1111,enc_propertycountylandusecode_122
23280,0,0,0,0,0,0,0,0,0,0,0,0,0,0
55542,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7420,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68942,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67157,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Modeling

In [7]:
def determine_regression_baseline(train, target):
    '''
    This function takes in a train sample and a continuous target variable label and 
    determines whether the mean or median performs better as a baseline prediction. 
    '''
    # create empty dataframe for storing prediction results
    results = pd.DataFrame(index=train.index)
    # assign actual values for the target variable
    results['actual'] = train[target]
    # assign a baseline using mean
    results['baseline_mean'] = train[target].mean()
    # assign a baseline using median
    results['baseline_median']= train[target].median()
    
    # get RMSE values for each potential baseline
    RMSE_baseline_mean = sqrt(sk.metrics.mean_squared_error(results.actual, results.baseline_mean))
    RMSE_baseline_median = sqrt(sk.metrics.mean_squared_error(results.actual, results.baseline_median))
    
    # compare the two RMSE values; drop the lowest performer and assign the highest performer to baseline variable
    if RMSE_baseline_median < RMSE_baseline_mean:
        results = results.drop(columns='baseline_mean')
        results['RMSE_baseline'] = RMSE_baseline_median
        baseline_type = 'median'
    else:
        results = results.drop(columns='baseline_median')
        results['RMSE_baseline'] = RMSE_baseline_mean
        baseline_type = 'mean'
    
    return baseline_type

In [8]:
determine_regression_baseline(train, target)

'mean'

In [9]:
model_info = pd.DataFrame()
model_results = pd.DataFrame()
model_number = 0

def run_baseline(train,
                 validate,
                 target,
                 model_number,
                 model_info,
                 model_results):
    
    baseline_type = determine_regression_baseline(train, target)

    y_train = train[target]
    y_validate = validate[target]

    # identify model number
    model_number = 'baseline'
    #identify model type
    model_type = 'baseline'

    # store info about the model

    # create a dictionary containing model number and model type
    dct = {'model_number': model_number,
           'model_type': model_type}
    # append that dictionary to the model_info dataframe
    model_info = model_info.append(dct, ignore_index=True)


    # establish baseline predictions for train sample
    y_pred = baseline_pred = pd.Series(train[target].mean()).repeat(len(train))

    # get metrics
    dct = {'model_number': model_number, 
           'sample_type': 'train', 
           'metric_type': 'RMSE',
           'score': sqrt(sk.metrics.mean_squared_error(y_train, y_pred))}
    model_results = model_results.append(dct, ignore_index=True)


    # establish baseline predictions for validate sample
    if baseline_type == 'mean':
        y_pred = baseline_pred = pd.Series(validate[target].mean()).repeat(len(validate))
    elif baseline_type == 'median':
        y_pred = baseline_pred = pd.Series(validate[target].median()).repeat(len(validate))

    # get metrics
    dct = {'model_number': model_number, 
           'sample_type': 'validate', 
           'metric_type': 'RMSE',
           'score': sqrt(sk.metrics.mean_squared_error(y_validate, y_pred))}
    model_results = model_results.append(dct, ignore_index=True)
    
    model_number = 0
    
    return model_number, model_info, model_results

In [10]:
model_number, model_info, model_results = run_baseline(train, validate, target, 
                                                       model_number, model_info, model_results)

In [11]:
model_info

Unnamed: 0,model_number,model_type
0,baseline,baseline


In [12]:
model_results

Unnamed: 0,model_number,sample_type,metric_type,score
0,baseline,train,RMSE,0.066873
1,baseline,validate,RMSE,0.065904


In [13]:
# recreating the best performing model from the previous zillow regression project:

features = []
degree = 4

# establish model number
model_number += 1

# establish model type
model_type = 'polynomial regression'

# store info about the model
# create a dictionary containing the features and hyperparameters used in this model instance
dct = {'model_number': model_number,
'model_type': model_type,
'features': features,
'degree': degree}
# append that dictionary to the model_info dataframe
model_info = model_info.append(dct, ignore_index=True)

#split the samples into x and y
x_train = train[features]
y_train = train[target]

x_validate = validate[features]
y_validate = validate[target]

# create a polynomial features object
pf = PolynomialFeatures(degree=degree)

# fit and transform the data
x_train_poly = pf.fit_transform(x_train)
x_validate_poly = pf.fit_transform(x_validate)

# create the model object and fit to the training sample
linreg = LinearRegression().fit(x_train_poly, y_train)

# make predictions for the training sample
y_pred = linreg.predict(x_train_poly)
sample_type = 'train'

# store information about model performance
# create dictionaries for each metric type for the train sample and append those dictionaries to the model_results dataframe
dct = {'model_number': model_number, 
    'sample_type': sample_type, 
    'metric_type': 'RMSE',
    'score': sqrt(sk.metrics.mean_squared_error(y_train, y_pred))}
model_results = model_results.append(dct, ignore_index=True)

# make predictions for the validate sample
y_pred = linreg.predict(x_validate_poly)
sample_type = 'validate'

# store information about model performance
# create dictionaries for each metric type for the train sample and append those dictionaries to the model_results dataframe
dct = {'model_number': model_number, 
    'sample_type': sample_type, 
    'metric_type': 'RMSE',
    'score': sqrt(sk.metrics.mean_squared_error(y_validate, y_pred))}
model_results = model_results.append(dct, ignore_index=True)


In [14]:
model_info

Unnamed: 0,model_number,model_type,degree,features
0,baseline,baseline,,
1,1,polynomial regression,4.0,"[scaled_bedroomcnt, scaled_bathroomcnt, scaled..."


In [15]:
model_results

Unnamed: 0,model_number,sample_type,metric_type,score
0,baseline,train,RMSE,0.066873
1,baseline,validate,RMSE,0.065904
2,1,train,RMSE,0.065639
3,1,validate,RMSE,0.068267
