In [1]:
import pandas as pd
import numpy as np
from scripts import *
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
import seaborn as sns
from scipy import stats
import warnings
from shapely.geometry import Point,Polygon
import geopandas as gpd
import descartes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')
%matplotlib inline

SyntaxError: invalid syntax (scripts.py, line 65)

In [None]:
# importing main data
# NOTE(review): the absolute paths below only resolve on the original author's machine;
# consider a configurable data directory so the notebook can be re-run elsewhere.
training_data = pd.read_csv('/Users/brendanferris/Desktop/kings_county_housing/kc_house_data_train.csv')
# Supplementary CSV, read relative to the working directory; it is passed to no_bedrooms() in a later cell.
missing_vals = pd.read_csv('bedroomsfilled.csv')
# FIXME(review): this reads the same file as training_data, so holdout_data is a copy of the
# training set rather than separate holdout data -- confirm the intended holdout file name.
holdout_data = pd.read_csv('/Users/brendanferris/Desktop/kings_county_housing/kc_house_data_train.csv')

# Housing Data: first look.

In [None]:
# Plot every priced house as a marker on top of the street map.
# NOTE(review): street_map and geo_df are not created in the visible cells before this
# one -- confirm where they come from so the notebook survives Restart & Run All.
label_font = {'fontsize': 20}

fig, ax = plt.subplots(figsize=(15, 25))
street_map.plot(ax=ax, alpha=0.8, color='grey')

# Keep only rows with a positive sale price before plotting.
priced_homes = geo_df[geo_df['price'] > 0]
priced_homes.plot(ax=ax, markersize=2, color='blue', marker='v', aspect=1.5)

ax.set_title('Houses in Kings County, WA', fontdict=label_font)
ax.set_ylabel('Latitude', fontdict=label_font)
ax.set_xlabel('Longitude', fontdict=label_font);

In [None]:
# Preview the first four rows of the freshly loaded training data.
training_data.head(4)

# Adding Additional Zipcode Information

Extra ZIP code–level information was collected from [unitedstateszipcodes.org](https://www.unitedstateszipcodes.org) to provide further relevant features for the model.

In [None]:
# import additional zipcode data (indexed by zipcode, the key used for the merge below)
zipcode_data = pd.read_excel('/Users/brendanferris/Desktop/kings_county_housing/EDA/extradata.xlsx', index_col='zipcode')

# clean_data is not defined in this notebook (presumably provided by `from scripts import *`).
# Its return value is discarded, so it presumably cleans each frame in place -- TODO confirm.
clean_data(training_data)
clean_data(missing_vals)

#replaced missing values with correct info from zillow.
training_data = no_bedrooms(training_data, missing_vals)

#merge the original data with additional zipcode data.
# NOTE(review): how='right' keeps every zipcode present in zipcode_data, including any
# that has no matching house (such rows would carry NaN in the house columns) --
# confirm that 'left' is not what was intended.
training_data = pd.merge(training_data, zipcode_data, how='right', on='zipcode')

#replace 33 bedrooms with correct value
# DataFrame.set_value was deprecated in pandas 0.21 and removed in pandas 1.0;
# .at is the label-based scalar setter that replaces it.
# NOTE(review): 8597 is a hard-coded row label in the merged frame -- it silently
# targets a different row if the input data or merge order changes.
training_data.at[8597, 'bedrooms'] = 3
training_data.head(3)

In [None]:
# Housing units that are not occupied: total units minus occupied units.
training_data['available_housing'] = training_data['total_housing_units'].sub(
    training_data['occupied_housing_units']
)

In [None]:
# Display the merged frame (pandas truncates the view) to inspect the columns after adding available_housing.
training_data

# Min/Max Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale with MinMaxScaler (default output range is [0, 1]).
scaled_feat = [
    'pop_density',
    'sqft_living15',
    'sqft_above',
    'sqft_basement',
    'bedrooms',
    'bathrooms',
    'median_household_income',
    'median_home_value',
    'sqft_lot',
]

# Fit on the selected columns and overwrite them with their scaled values.
# NOTE(review): the scaler is fitted on the whole frame before the train/test split
# further down, so the test rows contribute to the fitted min/max.
scaler = MinMaxScaler()
training_data[scaled_feat] = scaler.fit_transform(training_data[scaled_feat])

# Creating Dummies For Categorical Variables

In [None]:
def _encode(column, prefix):
    """One-hot encode `training_data[column]`, dropping the first level as the baseline."""
    return pd.get_dummies(training_data[column], prefix=prefix, drop_first=True)


# Indicator columns for each categorical feature.
waterfront_dummies = _encode('waterfront', 'waterf')
view_dummies = _encode('view', 'has_view')
condition_dummies = _encode('condition', 'condition')
grade_dummies = _encode('grade', 'grade')
month_dummies = _encode('sale_month', 'month')
zip_dummies = _encode('zipcode', 'zip')

# Remove the encoded source columns (plus 'postoffice' and 'id') and append the indicators.
dropped_cols = ['postoffice', 'id', 'zipcode', 'sale_month', 'waterfront', 'view', 'condition', 'grade']
training_data = pd.concat(
    [
        training_data.drop(columns=dropped_cols),
        waterfront_dummies,
        view_dummies,
        condition_dummies,
        grade_dummies,
        month_dummies,
        zip_dummies,
    ],
    axis=1,
)
training_data.head(5)

# Removing Collinear Features

In [None]:
# Columns removed under the "Removing Colinear Features" step; the next cell re-checks the result.
collinear_cols = [
    'zip_98003',
    'owned_household_with_mortgage',
    'houses_owned_outright',
    'land_area',
    'occupied_housing_units',
    'total_housing_units',
    'sqft_living',
    'condition_3',
]

# Dropped in place, so re-running this cell raises KeyError once the columns are gone.
training_data.drop(columns=collinear_cols, inplace=True)

In [None]:
# Re-check the remaining features after the drop above. multicolinear_features is not
# defined in this notebook (presumably provided by `from scripts import *`); per the
# author's note it produces a graph that is expected to be empty at this point.
multicolinear_features(training_data) #graph should be empty

# Sklearn Linear Model

In [None]:
# Target is the sale price; every other remaining column is a predictor.
y = training_data['price']
x = training_data.drop(columns='price')

# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)
x_train.shape, x_test.shape

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator itself).
lr = LinearRegression().fit(x_train, y_train)

# Predict on the held-out split and report the root mean squared error.
y_hat = lr.predict(x_test)
MSE = mean_squared_error(y_test, y_hat)
print('RMSE', np.round(np.sqrt(MSE), 5))

In [None]:
# Mean score over 30-fold cross-validation on the training split. No `scoring` is passed,
# so cross_val_score uses the estimator's default scorer -- R^2 for LinearRegression --
# which means this number is not comparable to the RMSE printed in the previous cell.
np.mean(cross_val_score(estimator=lr, X=x_train, y=y_train, cv=30))

In [None]:
# Residuals on the test split (actual minus predicted price), shown as a histogram.
# The trailing semicolon suppresses the textual repr of plt.hist's return value.
residuals = (y_test - y_hat)
plt.hist(residuals);

In [None]:
# Residual plot with a LOWESS trend line: seaborn regresses y (actual test prices) on
# x (predicted prices) and plots the residuals of that fit.
# x and y are keyword-only in seaborn >= 0.12 (the first positional slot is `data`),
# so they must be passed by name; the positional form raises TypeError there.
sns.residplot(x=y_hat, y=y_test, lowess=True, color='g')

# OLS Linear Model

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Build a formula of the form "price~col1+col2+..." from every non-target column.
outcome = 'price'
predictors = training_data.drop(columns='price')
pred_sum = "+".join(predictors.columns)
formula = f"{outcome}~{pred_sum}"

# Fit the formula with statsmodels OLS and display the regression summary.
model = ols(formula=formula, data=training_data).fit()
model.summary()