# Working Notebook 2

# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns


import matplotlib.pyplot as plt
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import QuantileTransformer, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LassoLars, TweedieRegressor

import env

# Acquire data

In [None]:
def get_zillow_data():
        ''' Acquire 2017 Zillow property data from the 'zillow' database.

            Runs one SQL query against properties_2017 (left-joined to
            propertylandusetype and predictions_2017) and returns these columns:
            calculatedfinishedsquarefeet, bathroomcnt, bedroomcnt, taxvaluedollarcnt,
            yearbuilt, fireplacecnt, decktypeid, poolcnt, garagecarcnt, fips.

            Rows are limited to land-use descriptions containing 'Single' and to
            transaction dates that start with '2017'.

            Returns the query result as a pandas DataFrame.
        '''

        # sql query for acquisition
        # NOTE(review): the doubled %% is presumably needed so the driver sees a literal
        # % wildcard instead of a format placeholder -- confirm against the connection used.
        sql_query = """
        SELECT calculatedfinishedsquarefeet,bathroomcnt,bedroomcnt,taxvaluedollarcnt,yearbuilt, fireplacecnt,
        decktypeid, poolcnt, garagecarcnt,fips

        FROM properties_2017
        LEFT JOIN propertylandusetype USING(propertylandusetypeid)
        LEFT JOIN predictions_2017 USING(parcelid)
        WHERE (propertylandusetype.propertylandusedesc LIKE ('%%Single%%')) 
            AND (predictions_2017.transactiondate like '2017%%');
        """
        # Acquisition: the connection comes from the project-local `env` module
        df = pd.read_sql(sql_query, env.get_connection('zillow'))
        return df

In [None]:
# had to address % by using %% so the query could be read by the notebook

In [None]:
# import data
df = get_zillow_data()

In [None]:
# 52441 observations
df.shape

# Data Wrangle

In [None]:
# info
df.info()

In [None]:
df.isnull().sum().sort_values()

<div class="alert alert-info"> 
roomcnt                          0
    
fips                             0
    
regionidcounty                   0
    
latitude                         0
    
propertycountylandusecode        0
    
longitude                        0
    
regionidzip                     26
    
yearbuilt                      116
    
fullbathcnt                    137
    
calculatedbathnbr              137

I could maybe add one of these to explorations
    

In [None]:
df.dtypes

In [None]:
df.describe().T

<div class="alert alert-info"> There seem to be homes with a minimum of 0 bathrooms/bedrooms and maximums above what might be considered a single-family dwelling.</div>

In [None]:
# rename columns
df = df.rename(columns={'bedroomcnt': 'bedrooms','bathroomcnt': 'bathrooms',
            'calculatedfinishedsquarefeet': 'squarefeet','taxvaluedollarcnt': 'home_value',
                        'lotsizesquarefeet':'lot','fireplacecnt': 'fireplace','decktypeid':'deck','poolcnt':'pool',
                       'garagecarcnt':'garage','fips':'county'})

In [None]:
def process_fancy_features(df, feature):
    """Fill the missing entries of one optional-feature column with 0, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    feature : str
        Name of the column to clean (e.g. 'pool').

    Returns
    -------
    int
        Number of nulls still present in ``df[feature]`` after filling
        (0 when the fill succeeded).
    """
    # Blank / whitespace-only strings count as missing.
    df[feature] = df[feature].replace(r"^\s*$", np.nan, regex=True)
    # The earlier draft tested `if df[feature]:`, which raises ValueError because
    # the truth value of a Series is ambiguous, and it threw away the result of
    # fillna(0).  A missing entry is taken to mean "feature not present".
    df[feature] = df[feature].fillna(0)
    return df[feature].isnull().sum()

In [None]:
df.garage 

In [None]:
df.isnull().sum()

In [None]:
fancy_features = ['fireplace','deck','pool','garage']

In [None]:
def process_fancy_features(df, feature):
    """Replace blanks and nulls in ``df[feature]`` with 0 and return ``df``.

    The column is overwritten on the frame that was passed in.
    """
    # whitespace-only strings become NaN first, so one fill covers both cases;
    # an unmarked feature is assumed not to exist
    cleaned = df[feature].replace(r"^\s*$", np.nan, regex=True)
    df[feature] = cleaned.fillna(0)
    return df

In [None]:
def process_fancy_features(df, feature=None):
    """Fill blanks and nulls in the optional-feature columns with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; the columns are overwritten in place.
    feature : str or iterable of str, optional
        Column(s) to clean.  When omitted, all four optional-feature columns
        ('fireplace', 'deck', 'pool', 'garage') are processed.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    if feature is None:
        targets = ['fireplace', 'deck', 'pool', 'garage']
    elif isinstance(feature, str):
        targets = [feature]
    else:
        targets = list(feature)

    for col in targets:
        # whitespace-only strings are treated as missing
        df[col] = df[col].replace(r"^\s*$", np.nan, regex=True)
        # assumption: if the feature was not marked, it does not exist
        df[col] = df[col].fillna(0)
    return df

In [None]:
for i in fancy_features:
    process_fancy_features(df, i)

These values were filled in with 0: since the presence of the feature was not annotated, it is assumed it does not exist.

fireplace     45198

deck          52052

pool          41345

garage        34426

In [None]:
# still have 82 nulls in square feet and 116 nulls in yearbuilt and 1 null in home_value
df.isnull().sum()

In [None]:
df.describe()

## outliers

In [None]:
def handle_outliers(df, max_bathrooms=6, max_bedrooms=6, max_home_value=1_750_000):
    """Drop rows whose bathrooms, bedrooms or home_value exceed the given caps.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'bathrooms', 'bedrooms' and 'home_value' columns.
    max_bathrooms, max_bedrooms : number
        Inclusive upper limits (default 6 each).
    max_home_value : number
        Inclusive upper limit on home_value (default 1,750,000).

    Returns
    -------
    pandas.DataFrame
        The rows within all three limits.  Rows with a null in any of the
        three columns are dropped too, because a comparison against NaN is False.
    """
    within_limits = (
        (df.bathrooms <= max_bathrooms)
        & (df.bedrooms <= max_bedrooms)
        & (df.home_value <= max_home_value)
    )
    return df[within_limits]


In [None]:
df.shape

In [None]:
(51918-1632)/(52441)

In [None]:
1632/ 52441

In [None]:
# removed homes above 1_750_000 as outliers: a total of 1632 rows, about 3% of the data; still retain .958 of the original data
df[df.home_value > 1_750_000]

In [None]:
pd.options.display.float_format = '{:,.3f}'.format
df. home_value.describe()

In [None]:
52112/52441

In [None]:
52112-52441

In [None]:
df = handle_outliers(df)

# nulls and 0

In [None]:
df.isnull().sum()

In [None]:
df[df.squarefeet.isnull()]

In [None]:
# dropped properties with no bathrooms or no bedrooms: 153 rows, still retaining .990 of the original data
df= df[~(df.bathrooms==0) & ~(df.bedrooms ==0)]

In [None]:
df.shape

In [None]:
51959 - 52112

In [None]:
51959/52441

In [None]:
df[df['squarefeet'].isnull()]

In [None]:
df.squarefeet.mean()

In [None]:
df.isnull().sum()

In [None]:
# dropped null values in yearbuilt 89 and 1 in home_value still retained .99 of original data.
# total dropped 90, 
df = df.dropna()

In [None]:
df.shape

In [None]:
51918/52441

### fireplace

In [None]:
df.fireplace.value_counts()

In [None]:
df.fireplace = df.fireplace.replace({2:1, 3:1, 4:1, 5:1})
df.fireplace.value_counts()

### garage

In [None]:
df.garage.value_counts()

In [None]:
df.garage = df.garage.replace({2:1, 3:1, 4:1, 5:1, 6:1, 7:1, 8:1, 9:1, 10:1, 13:1,14:1})
df.garage.value_counts()

### pool

In [None]:
df.pool.value_counts()

In [None]:
df.deck.value_counts()

In [None]:
df.deck= df.deck.replace({66:1})
df.deck.value_counts()

In [None]:
def encode_features(df):
    """Binarize the count-style feature columns and one-hot encode the categoricals.

    'fireplace', 'deck' and 'garage' are collapsed to presence flags: any value
    greater than 0 becomes 1 and everything else is left as is.  This replaces the
    hand-written replace maps, which silently skipped values they did not list.
    'county' and 'fancy_features' are then expanded with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'fireplace', 'deck', 'garage', 'county' and 'fancy_features'.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame; the input frame is not modified.
    """
    # work on a copy so the caller's frame is not half-modified as a side effect
    encoded = df.copy()
    for col in ('fireplace', 'deck', 'garage'):
        encoded[col] = encoded[col].mask(encoded[col] > 0, 1)
    return pd.get_dummies(encoded, columns=['county', 'fancy_features'], drop_first=False)
    
    
    

# New Column

In [None]:
def new_features(df, current_year=2017):
    """Add the engineered columns 'home_age' and 'fancy_features' to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'yearbuilt', 'garage', 'deck', 'pool' and 'fireplace'.
        The new columns are added in place.
    current_year : int
        Reference year for the age calculation (default 2017).

    Returns
    -------
    pandas.DataFrame
        The same frame, with:
        * home_age       -- ``current_year - yearbuilt`` as float
        * fancy_features -- True when any of garage/deck/pool/fireplace equals 1
    """
    df['home_age'] = (current_year - df['yearbuilt']).astype('float')

    # Named 'fancy_features' (not 'optional_features') because every other cell
    # that plots or encodes this flag refers to it by that name.
    flag_columns = ['garage', 'deck', 'pool', 'fireplace']
    df['fancy_features'] = (df[flag_columns] == 1).any(axis=1)

    return df
    
    
    

In [None]:
#Creating new column for home age using year_built, casting as float
df['home_age'] = 2017- df['yearbuilt']
df["home_age"] = df["home_age"].astype('float')

In [None]:
df['fancy_features'] = (df.garage==1)|(df.deck == 1)|(df.pool == 1)|(df.fireplace == 1)

## FIPS

In [None]:
# Relabeling FIPS data
#df['fips'] = df.fips.replace({6037:'Los Angeles',
 #                  6059:'Orange',
  #                 6111:'Ventura'})

# split data for exploration

In [None]:
def split_data(df):
    '''
    Partition a DataFrame into train, validate and test subsets.

    20% of the rows are held out as test; the remaining 80% is split again,
    30% to validate and 70% to train (roughly 56% / 24% / 20% overall).
    Both splits use random_state=123.

    Returns train, validate, test
    '''
    # carve off the test set first
    remainder, test = train_test_split(df, test_size=.2, random_state=123)

    # then divide what is left into train and validate
    train, validate = train_test_split(remainder, test_size=.3, random_state=123)

    return train, validate, test

In [None]:
train, validate, test = split_data(df)

# Explore

In [None]:
train.info()

In [None]:
train.describe().T

In [None]:
train.head()

In [None]:
train.columns

In [None]:
#sns.pairplot(train['squarefeet', 'bathrooms', 'bedrooms', 'home_value', 'yearbuilt',
  #      'county', 'home_age']
 #      )

In [None]:
# correlation between variables
zillow_corr = train.corr(method='spearman')
zillow_corr

In [None]:
# pass my correlation matrix to Seaborn's heatmap 
kwargs = {'alpha':.9,
          'linewidth':3, 
          'linestyle':'-',
          'linecolor':'black'}
sns.heatmap(zillow_corr, cmap='Purples', annot=True, 
            mask=np.triu(zillow_corr), **kwargs)

In [None]:
train.columns.to_list()

In [None]:
def plot_variable_pair(df, target='home_value'):
    """Draw one regression plot (seaborn lmplot) of each feature against ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the listed feature columns and ``target``.
    target : str
        Column used for the y axis (default 'home_value').  It is skipped as
        an x variable, since plotting it against itself carries no information.
    """
    columns = ['squarefeet',
               'bathrooms',
               'bedrooms',
               'home_value',
               'yearbuilt',
               'fireplace',
               'deck',
               'pool',
               'garage',
               'home_age',
               'county',
               'fancy_features']
    for col in columns:
        if col == target:
            continue
        sns.lmplot(data=df, x=col, y=target, line_kws={'color':'red'})
        plt.show()

In [None]:
plot_variable_pair(train)

In [None]:
def plot_categorical_and_continuous_vars(df, cat_vars, cont_vars):
    """For every (categorical, continuous) pair, show a box, violin and bar plot side by side."""
    plotters = (sns.boxplot, sns.violinplot, sns.barplot)
    for cat_col in cat_vars:
        for cont_col in cont_vars:
            fig, axes = plt.subplots(1, 3, figsize=(16, 6))
            fig.suptitle(f'{cat_col} vs. {cont_col}')
            # one panel per plot type, left to right
            for draw, panel in zip(plotters, axes):
                draw(data=df, x=cat_col, y=cont_col, ax=panel)
            plt.show()

In [None]:
train.columns

In [None]:
# set categories
cat_vars =['pool','garage','deck','fireplace', 'bathrooms', 'bedrooms','county','fancy_features']
cont_vars =['home_value','home_age','squarefeet']

In [None]:
plot_categorical_and_continuous_vars(train, cat_vars, cont_vars)

<div class="alert alert-info">
Home_Value increases with features:
    
    * pool
    
    * garage
    
    * deck
    
    * fireplace
    
    * Square feet
    
    * bathrooms
    
    * Bedrooms
    
    * fancy_feature

Questions:
       * Answer the following initial question
       
        * What does the average home look like
        
        * Do popular builts have a higher home value than the average built.
        
        * Do properties with more bathrooms have a higher home value? bedrooms? squarefeet?
        
        * What are the average attributes of home whose value that are in the bottom quantile?
        
        * Are one stories home more valuable than two story homes.
        
        * Does having things like pool, deck, fireplace, garage increase home value?
        
        *( maybe look into roomcnt, regionidcounty, propertycountylandusecode, regionzip,fullbathcnt, yearbuilt, calculatedbathbr)

# What does the average home look like?

In [None]:
columns = train.columns.to_list()

In [None]:
for i in columns:
    mean = train[i].mean()
    print (f'{i} mean = {mean}')

The average home_value is 433,444
The average home is 1835 **squarefeet**, has between 2 and 2.5 **bathrooms**, and 3 **bedrooms**.
About 1 in 3 homes has a **garage** on average,
1 in 5 homes has a **pool** on average,
1 in 7 has a **fireplace**,
and less than 1% have a **deck**.


# What is the most popular built?

In [None]:
for i in columns:
    mode = train[i].mode()
    print (f'{i} mode = {mode}')

The most popular build is a home with 1,120 squarefeet, 2 bathrooms, and 3 bedrooms.

# Do popular builts have a higher home value than others? More house vs minimal house

In [None]:
house = train[ (train.bathrooms ==2)&(train.bedrooms ==3)]
house.shape

In [None]:
more_house = train[ (train.bathrooms>2)&(train.bedrooms >3)]

In [None]:
minimal_house = train[(train.bathrooms<2)&(train.bedrooms <3)]

In [None]:
more_house.home_value.median(), minimal_house.home_value.median(), house.home_value.median()

In [None]:
more_house.home_value.mean(), minimal_house.home_value.mean(), house.home_value.mean()

# Homes with the ideal amount of bathrooms?

In [None]:
train.bathrooms.median()

In [None]:
house = train[train.bathrooms == 2]
more_house = train[ train.bathrooms>2]

minimal_house = train[train.bathrooms<2]



In [None]:
more_house.home_value.median(), minimal_house.home_value.median(), house.home_value.median()

In [None]:
more_house.home_value.mean(), minimal_house.home_value.mean(), house.home_value.mean()

#  What are the average attributes of home whose value that are in the bottom quantile?

In [None]:
bottom_q = train[train.home_value < train.home_value.quantile(.25)]

In [None]:
bottom_q.bedrooms.mean(), bottom_q.bathrooms.median()

In [None]:
bottom_q.bathrooms.mean(), bottom_q.bathrooms.median()

In [None]:
bottom_q.home_value.mean()

In [None]:
def show_cat_vs_cont(df, cat_vars=cat_vars, cont_vars=cont_vars):
    """Draw one row of bar plots per continuous variable, one panel per categorical variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat_vars`` and ``cont_vars``.
    cat_vars, cont_vars : list of str
        Column names; the defaults are the notebook-level lists as they were
        when this cell was run.
    """
    print('Categorical vs Continuous Variables:')
    palettes = ['flare', 'Blues_r', 'PuRd_r', 'Accent']
    # Size the grid from the number of categorical variables: a fixed 1x4 grid
    # raises as soon as a fifth panel is requested.
    n_panels = max(len(cat_vars), 1)
    for j, cont in enumerate(cont_vars):
        plt.figure(figsize=(5 * n_panels, 4))
        plt.suptitle(cont)
        for i, cat in enumerate(cat_vars):
            plt.subplot(1, n_panels, i+1)
            # cycle the palettes so more than four continuous variables still work
            sns.barplot(data=df, x=cat, y=cont, palette=palettes[j % len(palettes)])
            plt.title(cat + ' vs ' + cont)
        plt.show()

In [None]:
# show_cat_vs_cont(train, cat_vars,cont_vars)

# Does having an extra feature in a home raise home value?

In [None]:
#train['fancy_features'] = (train.garage==1)|(train.deck == 1)|(train.pool == 1)|(train.fireplace == 1)

In [None]:
sns.boxplot(data=train, x='fancy_features', y='home_value');

# Is fips a driver of home value?

In [None]:
sns.boxplot(data=train, x='county', y='home_value');

In [None]:
train.head()

In [None]:
plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(train.corr(method='spearman')[['home_value']].sort_values(by='home_value', ascending=False), vmin=-1, vmax=1, annot=True, cmap='Purples')
heatmap.set_title('Features Correlating with Home Price', fontdict={'fontsize':18}, pad=16);

<div class="alert alert-info">The highest correlations with home_value seem to be squarefeet, bathrooms, fancy_features and yearbuilt.</div>

In [None]:
columns = train.columns.to_list()
columns

In [None]:
#for i in columns:    
 #   sns.displot(train[i])

# Stats

In [None]:
above_mean_bathrooms = train.bathrooms[train.bathrooms > train.bathrooms.mean()]
overall_mean = train.bathrooms.mean()

In [None]:
alpha=0.05
t, p = stats.ttest_1samp(above_mean_bathrooms, overall_mean)

print(t, p/2)

In [None]:
# Pearson's r between each continuous variable and home_value
α = 0.05
for i in cont_vars:
    if i == 'home_value':
        # the target is perfectly correlated with itself; nothing to test
        continue
    corr, p = stats.pearsonr(train[i], train.home_value)
    print('_____________________________________________________')
    print('HYPOTHESIS')
    # pearsonr tests for a linear relationship, so the hypotheses are phrased that way
    print(f'H0: There is no linear correlation between home_value and {i}')
    print(f'Ha: There is a linear correlation between home_value and {i}')
    print(f'{i} correlation {corr}, p-val {p}')
    if p < α:
        print('We reject the null hypothesis.')
        print(f'There is a correlation between home_value and {i}')
    else:
        print('We fail to reject the null hypothesis.')
        print(f'There is no evidence of a correlation between home_value and {i}')

In [None]:
# Pearson's r between each categorical / discrete variable and home_value
α = 0.05
for i in cat_vars:
    corr, p = stats.pearsonr(train[i], train.home_value)
    print('_____________________________________________________')
    print('HYPOTHESIS')
    # pearsonr tests for a linear relationship, so the hypotheses are phrased that way;
    # the target column is called home_value in this notebook, not tax_value
    print(f'H0: There is no linear correlation between home_value and {i}')
    print(f'Ha: There is a linear correlation between home_value and {i}')
    print(f'{i} correlation {corr}, p-val {p}')
    if p < α:
        print('We reject the null hypothesis.')
        print(f'There is a correlation between home_value and {i}')
    else:
        print('We fail to reject the null hypothesis.')
        print(f'There is no evidence of a correlation between home_value and {i}')

# Model

In [None]:
columns.remove('home_value')

In [None]:
columns

In [None]:
# Swap the numeric FIPS codes for county names, then one-hot encode
# county and fancy_features in one expression.
train = pd.get_dummies(
    train.assign(county=train.county.replace({6037: 'Los Angeles',
                                              6059: 'Orange',
                                              6111: 'Ventura'})),
    columns=['county', 'fancy_features'],
    drop_first=False,
)

In [None]:
# Swap the numeric FIPS codes for county names, then one-hot encode
# county and fancy_features in one expression.
validate = pd.get_dummies(
    validate.assign(county=validate.county.replace({6037: 'Los Angeles',
                                                    6059: 'Orange',
                                                    6111: 'Ventura'})),
    columns=['county', 'fancy_features'],
    drop_first=False,
)

In [None]:
# Swap the numeric FIPS codes for county names, then one-hot encode
# county and fancy_features in one expression.
test = pd.get_dummies(
    test.assign(county=test.county.replace({6037: 'Los Angeles',
                                            6059: 'Orange',
                                            6111: 'Ventura'})),
    columns=['county', 'fancy_features'],
    drop_first=False,
)

In [None]:
train.shape

In [None]:
validate.shape

In [None]:
validate.columns

In [None]:
test.shape

In [None]:
train.columns.to_list()

# VIF

In [None]:
# Features checked for multicollinearity.  Cast to float so the dummy columns
# (boolean on recent pandas) do not turn X.values into a non-numeric array.
X = train[['squarefeet', 'home_age', 'county_Los Angeles',
           'fancy_features_True']].astype(float)

# VIF dataframe: one row per feature
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                   for i in range(len(X.columns))]

vif_data

### Scale Data

In [None]:
def scale_data(train, 
               validate, 
               test, 
               columns_to_scale=['squarefeet','bathrooms','bedrooms','yearbuilt','home_age'],
               random_state=123):
    '''
    Scale the chosen columns of train, validate and test with a QuantileTransformer
    (normal output distribution) that is fitted on train only.

    train, validate, test: DataFrames that each contain every column in columns_to_scale
    columns_to_scale: names of the columns to transform; all other columns are copied unchanged
    random_state: seed handed to QuantileTransformer so repeated runs give the same result

    Returns train_scaled, validate_scaled, test_scaled (copies; the inputs are not modified).
    '''
    # local copy so the shared default list can never be altered
    columns_to_scale = list(columns_to_scale)

    # fit on train only, so nothing about validate/test leaks into the scaler
    scaler = QuantileTransformer(output_distribution='normal', random_state=random_state)
    scaler.fit(train[columns_to_scale])

    scaled_frames = []
    for frame in (train, validate, test):
        scaled = frame.copy()
        # assigning the transformed array directly keeps each frame's own index,
        # so no temporary DataFrame / set_index round trip is needed
        scaled[columns_to_scale] = scaler.transform(frame[columns_to_scale])
        scaled_frames.append(scaled)

    train_scaled, validate_scaled, test_scaled = scaled_frames
    return train_scaled, validate_scaled, test_scaled

In [None]:
X_train_scaled, X_validate_scaled, X_test_scaled = scale_data(train,validate,test)

In [None]:
X_train_scaled

# Set up X and y target

In [None]:
sns.histplot(train.home_value)

plt.show()

In [None]:
 columns_to_scale=['squarefeet','bathrooms','bedrooms','yearbuilt','home_age']

In [None]:
for i in columns_to_scale:
    plt.figure(figsize=(13, 6))
    plt.subplot(121)
    plt.hist(data=train, x= i,bins=20)
    plt.title(f'Original {i}')
    
    plt.subplot(122)
    plt.hist(x= i, data=X_train_scaled,bins=20)
    plt.title(f'Quantile Transformation Normal {i}')

    plt.show();

In [None]:
# Setup X and y
X_train_scaled = X_train_scaled.drop(columns='home_value')
y_train = train.home_value

X_validate_scaled = X_validate_scaled.drop(columns='home_value')
y_validate = validate.home_value

X_test_scaled = X_test_scaled.drop(columns='home_value')
y_test = test.home_value

In [None]:
X_test_scaled.dtypes

# Baseline

In [None]:
# (mean_squared_error is already imported in the imports cell at the top)
# y_train and y_validate must be DataFrames so prediction columns can be appended.
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

# 1. Baseline prediction: mean home value of the training set
HV_pred_mean = y_train.home_value.mean()
y_train['HV_pred_mean'] = HV_pred_mean
y_validate['HV_pred_mean'] = HV_pred_mean

# 2. Baseline prediction: median home value of the training set
HV_pred_median = y_train.home_value.median()
y_train['HV_pred_median'] = HV_pred_median
y_validate['HV_pred_median'] = HV_pred_median

# 3. RMSE of the mean baseline
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_pred_mean)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# 4. RMSE of the median baseline
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_pred_median)**(1/2)
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_pred_median)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))


In [None]:
y_train

In [None]:
# plot to visualize the actual home values against the two constant baseline predictions
plt.hist(y_train.home_value, color='blue', alpha=.5, label='Actual home value')
plt.hist(y_train.HV_pred_mean, bins=1, color='red', alpha=.5, rwidth=100, label="Predicted home value - Mean")
# legend label fixed: this series is the median home value, not "Final Grades"
plt.hist(y_train.HV_pred_median, bins=1, color='black', alpha=.5, rwidth=100, label="Predicted home value - Median")
plt.xlabel("home_value")

plt.legend()
plt.show()

# OLS Model

In [None]:
# create the model object
# `normalize=True` is dropped: the argument was removed from LinearRegression in
# scikit-learn 1.2, the features are already scaled, and for plain OLS it does
# not change the predictions.
lm = LinearRegression()

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm.fit(X_train_scaled, y_train.home_value)

# predict train
y_train['HV_pred_lm'] = lm.predict(X_train_scaled)

In [None]:
# evaluate: rmse
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_pred_lm)**(1/2)
# predict validate
y_validate['HV_pred_lm'] = lm.predict(X_validate_scaled)

In [None]:
# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


# Lasso Lars

In [None]:
# create the model object
lars = LassoLars(alpha=1.0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train_scaled, y_train.home_value)

# predict train
y_train['HV_pred_lars'] = lars.predict(X_train_scaled)

In [None]:
# evaluate: rmse
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_pred_lars)**(1/2)

# predict validate
y_validate['HV_pred_lars'] = lars.predict(X_validate_scaled)

In [None]:
# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_pred_lars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

# TweedieRegressor (GLM)

In [None]:
# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train_scaled, y_train.home_value)

# predict train
y_train['HV_pred_glm'] = glm.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_pred_glm)**(1/2)

# predict validate
y_validate['HV_pred_glm'] = glm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_pred_glm)**(1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

# Polynomial Regression

In [None]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)

# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train_scaled)

# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_validate_scaled)
X_test_degree2 = pf.transform(X_test_scaled)

In [None]:
# create the model object
# `normalize=True` is dropped: the argument was removed from LinearRegression in
# scikit-learn 1.2 and does not change plain OLS predictions.
lm2 = LinearRegression()

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_degree2, y_train.home_value)

# predict train
y_train['HV_pred_lm2'] = lm2.predict(X_train_degree2)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_pred_lm2)**(1/2)

# predict validate
y_validate['HV_pred_lm2'] = lm2.predict(X_validate_degree2)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

# Evaluate

In [None]:
def calculate_mse(y_predicted):
    """Mean squared error of ``y_predicted`` against the notebook-level ``y_train.home_value``."""
    actual = y_train.home_value
    return mean_squared_error(actual, y_predicted)



pd.options.display.float_format = '{:,.3f}'.format

def calculate_RMSE(y_predicted):
    """Root mean squared error of ``y_predicted`` against the notebook-level ``y_train.home_value``."""
    mse = mean_squared_error(y_train.home_value, y_predicted)
    return mse**.5


In [None]:
y_train.apply(calculate_mse).sort_values()

In [None]:
y_train.apply(calculate_RMSE).sort_values()

In [None]:
y_train

BEST MODELS are  LM2 and then GLM then LM:

HV_pred_lm2      264,401.437

HV_pred_glm      272,993.029

HV_pred_lm       280,034.483


In [None]:
y_validate

In [None]:
def regression_errors(y, yhat):
    '''
    regression_errors takes the actual target values y and the predicted values yhat
    and returns SSE, ESS, TSS, MSE, RMSE as labelled strings.
    y: actual values of target
    yhat: predicted value of target
    
    Return (tuple of five strings, in this order):
        * SSE Sum of squared errors         sum((y - yhat)^2)
        * ESS Explained sum of squares      sum((yhat - mean(y))^2)
        * TSS Total sum of squares          sum((y - mean(y))^2)
        * MSE Mean squared error            SSE / n
        * RMSE Root mean squared error      sqrt(MSE)
        
    '''
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(yhat, dtype=float)
    y_mean = actual.mean()

    # calculations
    SSE = float(((actual - predicted)**2).sum())
    MSE = SSE / len(actual)
    RMSE = MSE**.5
    ESS = float(((predicted - y_mean)**2).sum())
    # TSS is computed from its definition.  ESS + SSE equals TSS only for an
    # in-sample OLS fit with an intercept, which is not the case for validation
    # data or for the non-OLS models compared in this notebook.
    TSS = float(((actual - y_mean)**2).sum())
    
    return f'SSE = {SSE}', f'ESS = {ESS}', f'TSS = {TSS}', f'MSE = {MSE}', f'RMSE = {RMSE}'

In [None]:
for i in y_validate.columns.to_list():
    print(i)
    print (regression_errors(y_validate.home_value, y_validate[i]))

## Best model is HV_pred_lm2, then HV_pred_glm

# ADD MORE Models

In [None]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=3)

# fit and transform X_train_scaled
X_train_degree3 = pf.fit_transform(X_train_scaled)

# transform X_validate_scaled & X_test_scaled
X_validate_degree3 = pf.transform(X_validate_scaled)
X_test_degree3 = pf.transform(X_test_scaled)

In [None]:
# create the model object
# `normalize=True` is dropped: the argument was removed from LinearRegression in
# scikit-learn 1.2 and does not change plain OLS predictions.
lm4 = LinearRegression()

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm4.fit(X_train_degree3, y_train.home_value)

# predict train
y_train['HV_degree3'] = lm4.predict(X_train_degree3)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.home_value, y_train.HV_degree3)**(1/2)

# predict validate
y_validate['HV_degree3'] = lm4.predict(X_validate_degree3)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.home_value, y_validate.HV_degree3)**(1/2)

# label fixed: this model uses degree-3 polynomial features
print("RMSE for Polynomial Model, degrees=3\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

# Multiple Regression + RFE 6 features

In [None]:
lm3 = LinearRegression()


# 1. Transform our X
rfe = RFE(lm3, n_features_to_select=6)
rfe.fit(X_train_scaled, y_train.home_value)
print('selected top 6 features:', X_train_scaled.columns[rfe.support_])


In [None]:
X_train_rfe = rfe.transform(X_train_scaled)
# 2. Use the transformed x in our model
lm3.fit(X_train_rfe, y_train.home_value)


In [None]:
X_train_scaled.shape

In [None]:
X_train_rfe.shape

In [None]:
# 3. Make predictions

X_validate_rfe = rfe.transform(X_validate_scaled)
y_train['multiple_rfe'] = lm3.predict(X_train_rfe)

In [None]:
X_validate_rfe = pd.DataFrame(X_validate_rfe, columns = X_validate_scaled.columns[rfe.support_], index = X_validate_scaled.index)

In [None]:
y_validate['multiple_rfe'] = lm3.predict(X_validate_rfe)

In [None]:
y_validate

### Select Features for model

In [None]:
def select_Kbest(X, y, k=2):   
    '''
    Pick the k strongest features according to SelectKBest with f_regression.

    X: a dataframe of numerical independent features
    y: a pandas Series holding the target variable
    k: how many features to keep (default 2)
    
    returns: a string of the form "TOP {k} features: [...]" listing the selected column names
    '''
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # the boolean support mask picks the winning columns out of X
    chosen = X.columns[selector.get_support()].to_list()
    return f'TOP {k} features: {chosen}'

In [None]:
select_Kbest(X_train_scaled,y_train.home_value, k=5)

### Simple model

In [None]:
# Collect the validate-set predictions of every model next to the actual values.
# The target column is called home_value in this notebook (there is no tax_value column).
predictions = pd.DataFrame({
    'actual': validate.home_value
}) 

In [None]:
# X must be 2-d (hence the double brackets); y must be the single target column,
# because y_train is now a DataFrame that also holds earlier models' predictions.

# 1. make the thing
lm = LinearRegression()
# 2. fit the thing
lm.fit(X_train_scaled[['bathrooms']], y_train.home_value)
# 3. use the thing (make predictions) on the scaled validate features
predictions['simple_lm'] = lm.predict(X_validate_scaled[['bathrooms']])

In [None]:
# look into the model's makeup
lm.coef_, lm.intercept_

In [None]:
print(f'tax_value = {lm.coef_}*bathrooms + {lm.intercept_}')

In [None]:
predictions

### Multiple + RFE

In [None]:
def rfe(X, y, k=2):
    ''' 
    Run recursive feature elimination with a LinearRegression estimator.

    X: a dataframe of numerical independent features
    y: a pandas Series holding the target variable
    k: how many features to keep (default 2)
    
    returns: the column labels of X that RFE selected
    '''
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(X, y)
    # the boolean support mask picks the surviving columns out of X
    return X.columns[selector.get_support()]

In [None]:
# pass only the target column: y_train also holds earlier models' predictions
rfe(X_train_scaled, y_train.home_value, k=2)

In [None]:


lm = LinearRegression()
k = 2

### 1. Transform our X
# fit against the target column only: y_train also holds earlier models' predictions
rfe = RFE(lm, n_features_to_select=k)
rfe.fit(X_train_scaled, y_train.home_value)
print(f'selected top {k} features:', X_train_scaled.columns[rfe.support_])
X_train_rfe = rfe.transform(X_train_scaled)

In [None]:
# 1. Transform our X
X_train_rfe = rfe.transform(X_train_scaled)
# 2. Use the transformed x in our model (target column only)
lm.fit(X_train_rfe, y_train.home_value)

# 3. Make predictions on the scaled validate features, the same feature space
#    the selector was fitted on
X_validate_rfe = rfe.transform(X_validate_scaled)
predictions['multiple_rfe'] = lm.predict(X_validate_rfe)

predictions.head()

In [None]:
# look into models make up
lm.coef_, lm.intercept_

In [None]:
print(f'tax_value = {lm.coef_[0]:.2f} x squarefeet + {lm.coef_[1]:.2f} x bedroom + {lm.intercept_:.2f}')

### Poly Degree2

In [None]:
# 1. Generate Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
poly.fit(X_train_scaled)
# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out (added in 1.0); use whichever this version provides.
poly_feature_names = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
X_train_poly = pd.DataFrame(
    poly.transform(X_train_scaled),
    columns=poly_feature_names(X_train_scaled.columns),
    index=train.index,
)
X_train_poly.head()

In [None]:
# 2. Use the features
lm = LinearRegression()
# fit against the target column only: y_train also holds earlier models' predictions
lm.fit(X_train_poly, y_train.home_value)

X_validate_poly = poly.transform(X_validate_scaled)
predictions['polynomial degree 2'] = lm.predict(X_validate_poly)
predictions

# Poly interactions_only 

In [None]:
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly.fit(X_train_scaled)
# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out (added in 1.0); use whichever this version provides.
poly_feature_names = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
# transform the same scaled frame the transformer was fitted on
# (there is no X_train variable in this notebook)
X_train_poly = pd.DataFrame(
    poly.transform(X_train_scaled),
    columns=poly_feature_names(X_train_scaled.columns),
    index=train.index,
)
lm = LinearRegression()
# fit against the target column only: y_train also holds earlier models' predictions
lm.fit(X_train_poly, y_train.home_value)

X_validate_poly = poly.transform(X_validate_scaled)
predictions['polynomial only interaction'] = lm.predict(X_validate_poly)

pd.Series(lm.coef_, index=poly_feature_names(X_train_scaled.columns)).sort_values()

In [None]:
predictions

## Lasso Lars

In [None]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our scaled training data (target column only; there is no
# X_train / X_validate variable in this notebook)
lars.fit(X_train_scaled, y_train.home_value)

# predict validate
X_validate_pred_lars = lars.predict(X_validate_scaled)

# Add lassolars predictions to our predictions DataFrame
predictions['lasso_lars'] = X_validate_pred_lars

In [None]:
predictions

# GLM

In [None]:
# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data (target column only: y_train also holds
# earlier models' predictions)
glm.fit(X_train_scaled, y_train.home_value)

# predict validate
X_validate_predict_glm = glm.predict(X_validate_scaled)

# Add GLM predictions to our predictions DataFrame
predictions['glm'] = X_validate_predict_glm

### Baseline

In [None]:
# add a baseline model
# the target column is home_value (there is no tax_value column in train)
predictions['baseline'] = train.home_value.mean()

In [None]:
predictions

# Evaluate

In [None]:
def calculate_mse(y_predicted):
    """Mean squared error of ``y_predicted`` against the notebook-level ``predictions.actual``."""
    actual = predictions.actual
    return mean_squared_error(actual, y_predicted)

predictions.apply(calculate_mse).sort_values()

In [None]:
pd.options.display.float_format = '{:,.3f}'.format
def calculate_RMSE(y_predicted):
    """Root mean squared error of ``y_predicted`` against the notebook-level ``predictions.actual``."""
    # fixes the `predictions.acual` typo and adds the square root that was missing
    return mean_squared_error(predictions.actual, y_predicted)**.5
predictions.apply(calculate_RMSE).sort_values()

First iteration of models: Polynomial degree 2 was best, then GLM, then lasso_lars, then baseline.