In [1]:
# tab data
import pandas as pd
import numpy as np

# data and prep
from wrangle import wrangle_zillow
from preprocess import scale_data

# data viz
import matplotlib.pyplot as plt

# stats and modeling needs
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
train, validate, test = wrangle_zillow()

In [3]:
# get only oc for each
train = train[train.county == 'Los Angeles']
# get only oc for each
validate = validate[validate.county == 'Los Angeles']
# get only oc for each
test = test[test.county == 'Los Angeles']

In [4]:
num_cols = ['square_feet', 'baths', 'beds']

In [5]:
# xtrain, ytrain
X_train = train[num_cols]
y_train = train[['tax_value']]

# xvalidate, yvalidate
X_validate = validate[num_cols]
y_validate = validate[['tax_value']]

# xtest, ytest
X_test = test[num_cols]
y_test = test[['tax_value']]

In [6]:
# scaling
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(X_train, num_cols, 'minmax', X_validate=X_validate, X_test=X_test)

In [7]:
y='tax_value'
# baseline using median
y_train['baseline_mean'] = round(y_train[y].mean(), 2)
y_validate['baseline_mean'] = round(y_validate.loc[:,y].mean(), 2)

In [8]:
# RMSE of mean baseline predictions
rmse_train = mean_squared_error(y_train.loc[:,y], y_train['baseline_mean'], squared=False)
rmse_validate = mean_squared_error(y_validate.loc[:,y], y_validate['baseline_mean'], squared=False)

# checking scores
rmse_train, rmse_validate, rmse_train - rmse_validate

(253136.35562037252, 251369.25292817858, 1767.1026921939338)

In [9]:
def basic_linear_modeling(X_train, y_train, x_validate, y_validate, model):
    '''
    Arguments:
    Actions:
    Returns:
    Modules:
    '''
    # fit the model
    model.fit(X_train, y_train)
   
    # return predictions
    return model.predict(X_train), model.predict(X_validate)
    

In [10]:
y_train['ols_preds'], y_validate['ols_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LinearRegression())

In [11]:
y_train['lars_alpha_hundredth_preds'], y_validate['lars_alpha_tenth_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=.01))
y_train['lars_alpha_tenth_preds'], y_validate['lars_alpha_tenth_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=.1))
y_train['lars_alpha_half_preds'], y_validate['lars_alpha_half_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=.5))
y_train['lars_alpha1_preds'], y_validate['lars_alpha1_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=LassoLars(alpha=1))

In [12]:
y_train['glm_poi_preds'], y_validate['glm_poi_preds'] = basic_linear_modeling(X_train_scaled, y_train[['tax_value']], X_validate_scaled, y_validate[['tax_value']], model=TweedieRegressor(power=1))

In [13]:
# make the polyni mial features
pf = PolynomialFeatures(degree=2)

# fit_transform
X_train_degree2 = pf.fit_transform(X_train_scaled)

# transform for validate
X_validate_degree2 = pf.transform(X_validate_scaled)


In [14]:
# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_degree2, y_train.tax_value)

# predict train
y_train['poly_preds'] = lm2.predict(X_train_degree2)


# predict validate
y_validate['poly_preds'] = lm2.predict(X_validate_degree2)


In [15]:
y_train.columns

Index(['tax_value', 'baseline_mean', 'ols_preds', 'lars_alpha_hundredth_preds',
       'lars_alpha_tenth_preds', 'lars_alpha_half_preds', 'lars_alpha1_preds',
       'glm_poi_preds', 'poly_preds'],
      dtype='object')

In [16]:
model_rmse = []
target = 'tax_value'
for col in y_train:
    if col in ['glm_poi_preds','lars_alpha_hundredth_preds', 'tax_value']:
        pass
    else:
        rmse_train = mean_squared_error(y_train[target], round(y_train[col], 4), squared=False)
        rmse_validate = mean_squared_error(y_validate[target], round(y_validate[col], 4), squared=False)
        rmse_diff = rmse_train - rmse_validate
        model_rmse.append({
            'Model': col,
            'Train RMSE': rmse_train,
            'Validate RMSE': rmse_validate,
            'RMSE Difference': rmse_diff
        })

In [17]:
train = 1000000
for i in model_rmse:
    if i['Train RMSE'] < train:
        train = i['Train RMSE']

# getting the lowest train RMSE
for i in model_rmse:
    if i['Train RMSE'] == train:
        print(i)

{'Model': 'poly_preds', 'Train RMSE': 234080.58026768963, 'Validate RMSE': 232929.97477777212, 'RMSE Difference': 1150.605489917507}


In [18]:
model_rmse

[{'Model': 'baseline_mean',
  'Train RMSE': 253136.35562037252,
  'Validate RMSE': 251369.25292817858,
  'RMSE Difference': 1767.1026921939338},
 {'Model': 'ols_preds',
  'Train RMSE': 234749.75867283187,
  'Validate RMSE': 903359752.2224324,
  'RMSE Difference': -903125002.4637595},
 {'Model': 'lars_alpha_tenth_preds',
  'Train RMSE': 234749.76037928433,
  'Validate RMSE': 903199591.1216372,
  'RMSE Difference': -902964841.3612579},
 {'Model': 'lars_alpha_half_preds',
  'Train RMSE': 234749.80134020446,
  'Validate RMSE': 902558946.7185745,
  'RMSE Difference': -902324196.9172343},
 {'Model': 'lars_alpha1_preds',
  'Train RMSE': 234749.92934045708,
  'Validate RMSE': 901758141.2149996,
  'RMSE Difference': -901523391.2856591},
 {'Model': 'poly_preds',
  'Train RMSE': 234080.58026768963,
  'Validate RMSE': 232929.97477777212,
  'RMSE Difference': 1150.605489917507}]