In [1]:
# tab data
import pandas as pd
import numpy as np

# data and prep
from wrangle import wrangle_zillow
from preprocess import scale_data

# data viz
import matplotlib.pyplot as plt

# stats and modeling needs
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
train, validate, test = wrangle_zillow()

In [3]:
# get only oc for each
train = train[train.county == 'Orange']
# get only oc for each
validate = validate[validate.county == 'Orange']
# get only oc for each
test = test[test.county == 'Orange']

In [4]:
num_cols = ['square_feet', 'beds', 'baths']

In [5]:
# xtrain, ytrain
X_train = train[num_cols]
y_train = train[['tax_value']]

# xvalidate, yvalidate
X_validate = validate[num_cols]
y_validate = validate[['tax_value']]

# xtest, ytest
X_test = test[num_cols]
y_test = test[['tax_value']]

In [6]:
# scaling
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(X_train, num_cols, 'minmax', X_validate=X_validate, X_test=X_test)

In [7]:
# preprocess
def preprocess_zillow():
    '''
    Actions: scales data ready for modeling
    '''
    # get data
    train, validate, test = wrangle_zillow()
    
    # get only oc for each
    train = train[train.county == 'Orange']
    # get only oc for each
    validate = validate[validate.county == 'Orange']
    # get only oc for each
    test = test[test.county == 'Orange']
    
    # set list of num columns
    num_cols = ['square_feet', 'beds', 'baths']
    
    # xtrain, ytrain
    X_train = train[num_cols]
    y_train = train[['tax_value']]

    # xvalidate, yvalidate
    X_validate = validate[num_cols]
    y_validate = validate[['tax_value']]

    # xtest, ytest
    X_test = test[num_cols]
    y_test = test[['tax_value']]
    
    # scaling
    X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(X_train, num_cols, 'minmax', X_validate=X_validate, X_test=X_test)
    
    # exit function and return
    return X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test

In [8]:
X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test= preprocess_zillow()

In [9]:
y='tax_value'
# baseline using median
y_train['baseline_mean'] = round(y_train[y].mean(), 2)
y_validate['baseline_mean'] = round(y_validate.loc[:,y].mean(), 2)

In [10]:
# RMSE of mean baseline predictions
rmse_train = mean_squared_error(y_train.loc[:,y], y_train['baseline_mean'], squared=False)
rmse_validate = mean_squared_error(y_validate.loc[:,y], y_validate['baseline_mean'], squared=False)

# checking scores
rmse_train, rmse_validate, rmse_train - rmse_validate

(259532.8366898489, 255477.65091251553, 4055.1857773333613)

In [11]:
def get_baseline():
    '''
    Actions: caluculate baseline and add it to train df, output the rmse and rmse difference
    '''
    y='tax_value'
    # baseline using mean
    y_train['baseline_mean'] = round(y_train[y].mean(), 2)
    y_validate['baseline_mean'] = round(y_validate.loc[:,y].mean(), 2)
    
    # RMSE of mean baseline predictions
    rmse_train = mean_squared_error(y_train.loc[:,y], y_train['baseline_mean'], squared=False)
    rmse_validate = mean_squared_error(y_validate.loc[:,y], y_validate['baseline_mean'], squared=False)

    # getting difference
    rmse_diff = rmse_train - rmse_validate
    
    # printing scores
    print(f'''Baseline Model
    Baseline: {round(y_train[y].mean(), 2)}
    RMSE on Train: {round(rmse_train, 2)}
    RMSE on Validate: {round(rmse_validate, 2)}
    RMSE Difference: {round(rmse_diff, 2)}''')
    
    return

In [12]:
get_baseline()

Baseline Model
    Baseline: 451120.36
    RMSE on Train: 259532.84
    RMSE on Validate: 255477.65
    RMSE Difference: 4055.19


# Modeling

In [13]:
def basic_linear_modeling(X_train_scaled, y_train, x_validate_scaled, y_validate, model):
    '''
    Arguments:
    Actions: runs through specified linear model pipeline and returns the predictions for train and val
    '''
    # fit the model
    model.fit(X_train_scaled, y_train)
   
    # return predictions
    return model.predict(X_train_scaled), model.predict(X_validate_scaled)

In [14]:
def get_lassolars_model():
    '''
    Actions: Returns rmse metric for ols model
    '''
    
    # assigning model to y_train
    y_train['lars_alpha1_preds'], y_validate['lars_alpha1_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LinearRegression())
    
    # RMSE of mean baseline predictions
    rmse_train = mean_squared_error(y_train['tax_value'], y_train['lars_alpha1_preds'], squared=False)
    rmse_validate = mean_squared_error(y_validate['tax_value'], y_validate['lars_alpha1_preds'], squared=False)

    # getting difference
    rmse_diff = rmse_train - rmse_validate
    
    # printing scores
    print(f'''LassoLars Alpha-1 Model
    RMSE on Train: {round(rmse_train, 4)}
    RMSE on Validate: {round(rmse_validate, 4)}
    RMSE Difference: {round(rmse_diff, 4)}''')
    
    return

In [15]:
get_lassolars_model()

LassoLars Alpha-1 Model
    RMSE on Train: 226149.8616
    RMSE on Validate: 1068134691.6044
    RMSE Difference: -1067908541.7427


In [16]:
def get_model(model):
    '''
    
    
    '''
    
    y_train['lars_alpha1_preds'], y_validate['lars_alpha1_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=1))
    
    rmse_train = mean_squared_error(y_train[target], round(y_train[col], 4), squared=False)
    rmse_validate = mean_squared_error(y_validate[target], round(y_validate[col], 4), squared=False)
        rmse_diff = rmse_train - rmse_validate
        model_rmse.append({
            'Model': col,
            'Train RMSE': rmse_train,
            'Validate RMSE': rmse_validate,
            'RMSE Difference': rmse_diff
        })

IndentationError: unexpected indent (4074839256.py, line 11)

In [28]:
def get_ols_model():
    '''
    Action: prints metrics for ols model for view
    '''
    # assigns ols predictions to y datasets
    y_train['ols_preds'], y_validate['ols_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LinearRegression())
    
    # RMSE of ols model predictions
    rmse_train = mean_squared_error(y_train['tax_value'], y_train['ols_preds'], squared=False)
    rmse_validate = mean_squared_error(y_validate['tax_value'], y_validate['ols_preds'], squared=False)

    # getting difference
    rmse_diff = rmse_train - rmse_validate
    
    # printing scores
    print(f'''OLS Model
    RMSE on Train: {round(rmse_train, 4)}
    RMSE on Validate: {round(rmse_validate, 4)}
    RMSE Difference: {round(rmse_diff, 4)}''')
    
    return

In [29]:
get_ols_model()

OLS Model
    RMSE on Train: 226149.8616
    RMSE on Validate: 1068134691.6044
    RMSE Difference: -1067908541.7427


In [None]:
y_train['ols_preds'], y_validate['ols_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LinearRegression())

In [18]:
y_train['lars_alpha_hundredth_preds'], y_validate['lars_alpha_tenth_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=.01))
y_train['lars_alpha_tenth_preds'], y_validate['lars_alpha_tenth_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=.1))
y_train['lars_alpha_half_preds'], y_validate['lars_alpha_half_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=.5))
y_train['lars_alpha1_preds'], y_validate['lars_alpha1_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=1))

In [None]:
y_train['glm_poi_preds'], y_validate['glm_poi_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=TweedieRegressor(power=1))

In [None]:
# make the polyni mial features
pf = PolynomialFeatures(degree=2)

# fit_transform
X_train_degree2 = pf.fit_transform(X_train_scaled)

# transform for validate
X_validate_degree2 = pf.transform(X_validate_scaled)


In [None]:
# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_degree2, y_train.tax_value)

# predict train
y_train['poly_preds'] = lm2.predict(X_train_degree2)


# predict validate
y_validate['poly_preds'] = lm2.predict(X_validate_degree2)


In [None]:
def get_poly_model():
    '''
    Action: Returns metrics for polynomial model
    '''
    
    # make the polyni mial features
    pf = PolynomialFeatures(degree=2)

    # fit_transform
    X_train_degree2 = pf.fit_transform(X_train_scaled)

    # transform for validate
    X_validate_degree2 = pf.transform(X_validate_scaled)

    # create the model object
    lm2 = LinearRegression(normalize=True)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm2.fit(X_train_degree2, y_train.tax_value)

    # predict train
    y_train['poly_preds'] = lm2.predict(X_train_degree2)

    # predict validate
    y_validate['poly_preds'] = lm2.predict(X_validate_degree2)

    # RMSE of predictions
    rmse_train = mean_squared_error(y_train['tax_value'], y_train['poly_preds'], squared=False)
    rmse_validate = mean_squared_error(y_validate['tax_value'], y_validate['poly_preds'], squared=False)

    # getting difference
    rmse_diff = rmse_train - rmse_validate
    
    # printing scores
    print(f'''Polynomial 2-degree Model
    RMSE on Train: {round(rmse_train, 4)}
    RMSE on Validate: {round(rmse_validate, 4)}
    RMSE Difference: {round(rmse_diff, 4)}''')
    
    return

In [None]:
get_poly_model()

In [None]:
y_train.columns

In [19]:
model_rmse = []
target = 'tax_value'
for col in y_train:
    if col in ['glm_poi_preds','lars_alpha_hundredth_preds', 'tax_value']:
        pass
    else:
        rmse_train = mean_squared_error(y_train[target], round(y_train[col], 4), squared=False)
        rmse_validate = mean_squared_error(y_validate[target], round(y_validate[col], 4), squared=False)
        rmse_diff = rmse_train - rmse_validate
        model_rmse.append({
            'Model': col,
            'Train RMSE': rmse_train,
            'Validate RMSE': rmse_validate,
            'RMSE Difference': rmse_diff
        })

In [None]:
train = 1000000
for i in model_rmse:
    if i['Train RMSE'] < train:
        train = i['Train RMSE']

# getting the lowest train RMSE
for i in model_rmse:
    if i['Train RMSE'] == train:
        print(i)

In [20]:
model_rmse

[{'Model': 'baseline_mean',
  'Train RMSE': 259532.8366898489,
  'Validate RMSE': 255477.65091251553,
  'RMSE Difference': 4055.1857773333613},
 {'Model': 'lars_alpha1_preds',
  'Train RMSE': 226149.9063306188,
  'Validate RMSE': 1067348113.100551,
  'RMSE Difference': -1067121963.1942204},
 {'Model': 'lars_alpha_tenth_preds',
  'Train RMSE': 226149.86207921835,
  'Validate RMSE': 1068056033.7539779,
  'RMSE Difference': -1067829883.8918986},
 {'Model': 'lars_alpha_half_preds',
  'Train RMSE': 226149.8728073113,
  'Validate RMSE': 1067741402.3524348,
  'RMSE Difference': -1067515252.4796275}]

In [30]:
def get_poly_test():
    '''
    Action: Returns metrics for polynomial model ran on test
    '''
    
    # make the polyni mial features
    pf = PolynomialFeatures(degree=2)

    # fit_transform
    X_train_degree2 = pf.fit_transform(X_train_scaled)

    # transform for validate
    X_test_degree2 = pf.transform(X_test_scaled)

    # create the model object
    lm2 = LinearRegression(normalize=True)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm2.fit(X_train_degree2, y_train.tax_value)

    # predict train
    y_train['poly_preds'] = lm2.predict(X_train_degree2)

    # predict validate
    y_test['poly_preds'] = lm2.predict(X_test_degree2)

    # RMSE of predictions
    rmse_train = mean_squared_error(y_train['tax_value'], y_train['poly_preds'], squared=False)
    rmse_test = mean_squared_error(y_test['tax_value'], y_test['poly_preds'], squared=False)

    # getting difference
    rmse_diff = rmse_train - rmse_test
    
    # printing scores
    print(f'''Polynomial 2-degree Model
    RMSE on Train: {round(rmse_train, 4)}
    RMSE on Test: {round(rmse_test, 4)}
    RMSE Difference: {round(rmse_diff, 4)}''')
    
    return