In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
%matplotlib inline

<img src="../images/KingCounty.jpeg">

### In this notebook I will be mapping data from IPUMS' (Integrated Public Use Microdata Series) National Historical Geographic Information System (NHGIS) onto our housing data. From IPUMS I retrieved a CSV file with median household income by ZIP code for the United States in 2015.

In [2]:
# Load the IPUMS NHGIS extract (2015 median household income by ZIP code)
zip_income_csv = '../data/2015_zip.csv'
dfgo = pd.read_csv(zip_income_csv)

In [3]:
# Preview the raw frame. In the saved output, row 0 holds the real column names
# (GISJOIN, NAME_E, ADNKE001) and row 1 holds their human-readable descriptions.
dfgo.head()

Unnamed: 0,nhgis0003_ds215_20155_2015_zcta,Unnamed: 1,Unnamed: 2
0,GISJOIN,NAME_E,ADNKE001
1,GIS Join Match Code,Estimates: Area Name,Estimates: Median household income in the past...
2,G00601,601,10816
3,G00602,602,16079
4,G00603,603,16804


In [4]:
# Promote the first row of dfgo to column headers; every later row becomes the data
headers = dfgo.iloc[0]
new_df = pd.DataFrame(data=dfgo.iloc[1:].to_numpy(), columns=headers)

In [5]:
# Check the promoted headers. In the saved output, row 0 still contains the
# column descriptions rather than data, so it has to be removed later.
new_df.head()

Unnamed: 0,GISJOIN,NAME_E,ADNKE001
0,GIS Join Match Code,Estimates: Area Name,Estimates: Median household income in the past...
1,G00601,601,10816
2,G00602,602,16079
3,G00603,603,16804
4,G00606,606,12512


In [6]:
# The GISJOIN match code is not needed for the income-by-ZIP lookup, so remove it
new_df = new_df.drop(columns='GISJOIN')

In [7]:
# Give the NHGIS columns readable names
column_names = {'NAME_E': 'zipcode', 'ADNKE001': 'median_income'}
dfzip = new_df.rename(columns=column_names)

In [8]:
# Inspect dtypes and null counts. In the saved output both columns are still
# object dtype and median_income has fewer non-null entries than zipcode.
dfzip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33121 entries, 0 to 33120
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   zipcode        33121 non-null  object
 1   median_income  31068 non-null  object
dtypes: object(2)
memory usage: 517.6+ KB


In [9]:
# Remove the leftover column-description row (index label 0) so only data rows remain
dfzip = dfzip.drop(index=0)

In [10]:
# Convert both columns from object (string) dtype to numeric dtypes
numeric_cols = ["zipcode", "median_income"]
dfzip[numeric_cols] = dfzip[numeric_cols].apply(pd.to_numeric)

In [11]:
# Verify the conversion. In the saved output zipcode is int64 and median_income
# is float64, and median_income still contains missing values.
dfzip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33120 entries, 1 to 33120
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   zipcode        33120 non-null  int64  
 1   median_income  31067 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 776.2 KB


### Now I need to isolate the King County ZIP codes and map the median income onto our housing data frame, so I am going to import that data frame.

In [12]:
#import our dataframe (the first CSV column is used as the index)
# NOTE(review): the saved run of this cell raised FileNotFoundError for this
# relative path, so none of the cells that use df2 were executed. Confirm where
# 'cleaned_df-2' lives relative to this notebook and whether the file name needs
# an extension -- TODO before relying on any result further down.
df2 = pd.read_csv('cleaned_df-2', index_col = 0)

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_df-2'

In [None]:
#find range of King County Zip Codes
# Displays (smallest, largest) zipcode present in df2.
# NOTE(review): presumably the source of the 98001/98199 bounds hard-coded in the
# filter on dfzip -- confirm once this cell has actually been run.
df2.zipcode.min(), df2.zipcode.max()

In [None]:
# Keep only the ZIP codes in the range used here for King County
# (98001 through 98199; between() includes both end points)
dfzip = dfzip[dfzip.zipcode.between(98001, 98199)]

In [None]:
# Check how many ZIP code rows remain after the range filter and how many of
# them have a non-null median_income.
dfzip.info()

In [None]:
# Build a {zipcode: median_income} dictionary so it can be mapped onto the housing data frame
income_by_zip = dfzip.set_index('zipcode')['median_income']
zipcodemap = income_by_zip.to_dict()

In [None]:
# Look up each house's zipcode in zipcodemap and store the result as a new column
median_income_lookup = df2.zipcode.map(zipcodemap)
df2['median_by_zip'] = median_income_lookup

In [None]:
# Preview df2 to check that the new median_by_zip column was added.
df2.head()

In [None]:
# Pull the new column out into its own single-column data frame so it can be saved and shared with the group
mediandf = df2.median_by_zip.to_frame()

In [None]:
# Preview the single-column frame before it is written to disk.
mediandf.head()

In [None]:
# Write the median-income column (with its index) to a CSV next to this notebook
median_csv_path = 'mediandf.csv'
mediandf.to_csv(median_csv_path)

### Investigating how the model performs with the added feature of median household income by ZIP code.

In [None]:
# Build the feature frame by removing, in a single step, the columns that must
# not be used as predictors: 'price' (it is the target y) plus 'id', 'season',
# 'price_per_sqft' and 'zipcode'.
# Dropping everything from df2 in one call keeps this cell safe to re-run; the
# previous chain of `dfdrop = dfdrop.drop(...)` cells raised KeyError on a second
# run because the column was already gone.
dfdrop = df2.drop(columns=['price', 'id', 'season', 'price_per_sqft', 'zipcode'])

In [None]:
# List the remaining columns, their dtypes and null counts; these columns become
# the feature matrix X.
dfdrop.info()

In [None]:
# Features: every remaining column of dfdrop (a 2-dimensional frame); target: the price column
X = dfdrop
y = df2['price']
# Hold out 25% of the rows for testing (75/25 split); random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)

In [None]:
# Fill missing values with the column mean (SimpleImputer's default strategy)
imputer = SimpleImputer()
# Learn the means from the training data only and fill the training set with
# them in one step, so nothing about the test set influences the imputation
X_train = imputer.fit_transform(X_train)
# Re-use the means learned from the training data to fill the test set
X_test = imputer.transform(X_test)

In [None]:
# Fit an ordinary least squares model on the imputed training data
baselinelinreg = LinearRegression().fit(X_train, y_train)
# Show the fitted intercept, then the coefficient of each feature column
print(baselinelinreg.intercept_)
print(baselinelinreg.coef_)

In [None]:
# Predict prices for the training rows and for the held-out test rows
train_preds, test_preds = (
    baselinelinreg.predict(features) for features in (X_train, X_test)
)

In [None]:
# Root mean squared error (RMSE) of the training and test predictions; the
# square root is taken by raising the mean squared error to the 0.5 power
train_score = mean_squared_error(y_train, train_preds) ** 0.5
test_score = mean_squared_error(y_test, test_preds) ** 0.5
# Label fixed from the misspelled 'Basline' so both lines read consistently
print('Baseline Train score:', train_score)
print('Baseline Test score:', test_score)

In [None]:
# Compare train and test RMSE side by side
fig, ax = plt.subplots()
ax.bar(['Train', 'Test'], [train_score, test_score])
ax.set_ylabel('Error')
ax.set_title('BaselineLinReg Performance Bar');

Adding median household income by ZIP code helps some, so we will use it!