In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

df = pd.read_csv("housing.csv")
df.head()

Unnamed: 0,address,bed_bath,more_info,price,status,zip_code
0,"360 E Randolph St # 601-602, Chicago, IL","3 bds · 4 ba · 2,700 sqft",http://www.zillow.com/homedetails/360-E-Randol...,"$1,299,000",Condo For Sale,60601
1,"8 E Randolph St UNIT 1006, Chicago, IL",1 bd · 1 ba · 850 sqft,http://www.zillow.com/homedetails/8-E-Randolph...,"$324,900",Condo For Sale,60601
2,,,,,,60601
3,"340 E Randolph St APT 704, Chicago, IL","2 bds · 3 ba · 1,902 sqft",http://www.zillow.com/homedetails/340-E-Randol...,"$1,099,000",Condo For Sale,60601
4,"420 E Waterside Dr UNIT 310, Chicago, IL","2 bds · 3 ba · 1,500 sqft",http://www.zillow.com/homedetails/420-E-Waters...,"$567,770",Condo For Sale,60601


Things that need to happen with this data:
1) bedrooms in one column
2) bathrooms in another
3) size in sqft in another
4) price turned into int
5) Dummy variables for the statuses (minus 1 for modeling)
6) NaN values stripped out
7) the categories of status could be consolidated (e.g. foreclosed and foreclosure)
8) -- for 0 in the bed_bath column needs to be turned into 0

In [2]:
df.shape

(7468, 6)

In [3]:
df.dtypes

address      object
bed_bath     object
more_info    object
price        object
status       object
zip_code      int64
dtype: object

In [4]:
df.columns

Index(['address', 'bed_bath', 'more_info', 'price', 'status', 'zip_code'], dtype='object')

In [5]:
df["status"].unique()

array(['Condo For Sale', nan, 'New Construction', 'Pre-Foreclosure',
       'Pre-Foreclosure (Auction)', 'For Sale by Owner', 'Auction',
       'Foreclosed', 'Lot/Land For Sale', 'Make Me Move®',
       'House For Sale', 'Townhouse For Sale', 'Coming Soon',
       'Foreclosure', 'Apartment For Sale', 'Co-op For Sale'], dtype=object)

In [6]:
df["status"].value_counts()

Condo For Sale               1943
House For Sale               1336
Pre-Foreclosure              1182
Foreclosed                    530
Apartment For Sale            476
Pre-Foreclosure (Auction)     395
Auction                       328
Lot/Land For Sale             250
Foreclosure                   232
For Sale by Owner             166
New Construction              143
Townhouse For Sale            109
Make Me Move®                  69
Co-op For Sale                 20
Coming Soon                     9
Name: status, dtype: int64

In [7]:
df.isnull().sum()

address       280
bed_bath      280
more_info     280
price        2542
status        280
zip_code        0
dtype: int64

From the above it seems we should drop the empty values--280 lack a small amount of data and those represent a small subset, so those are good to drop. A large amount lack the price, which is what we are targeting (although I guess maybe those could be pulled into a separate data frame to use as an experiment to use the model to predict what they would have been)

### Data cleaning below

In [8]:
#drops rows with NaN values
df = df.dropna()
#gets rid of $ and ,, in the prices
df["price"] = df["price"].str.replace("$", "")
df["price"] = df["price"].str.replace(",", "")
df["price"] = df["price"].str.replace("M", "")
df["price"] = df["price"].str.replace("K", "")
df["price"] = df["price"].str.replace("+", "")
#change values to floats from strings
df["price"] = df["price"].apply(float)
#drop the 134 rows that have a funky format in bed_bath
#(found using the following code: df.loc[df["bed_bath"].str.contains("lot") == True].shape)
df = df.loc[df["bed_bath"].str.contains("lot") == False]
df.head()

Unnamed: 0,address,bed_bath,more_info,price,status,zip_code
0,"360 E Randolph St # 601-602, Chicago, IL","3 bds · 4 ba · 2,700 sqft",http://www.zillow.com/homedetails/360-E-Randol...,1299000.0,Condo For Sale,60601
1,"8 E Randolph St UNIT 1006, Chicago, IL",1 bd · 1 ba · 850 sqft,http://www.zillow.com/homedetails/8-E-Randolph...,324900.0,Condo For Sale,60601
3,"340 E Randolph St APT 704, Chicago, IL","2 bds · 3 ba · 1,902 sqft",http://www.zillow.com/homedetails/340-E-Randol...,1099000.0,Condo For Sale,60601
4,"420 E Waterside Dr UNIT 310, Chicago, IL","2 bds · 3 ba · 1,500 sqft",http://www.zillow.com/homedetails/420-E-Waters...,567770.0,Condo For Sale,60601
5,"155 N Harbor Dr # 1212-13, Chicago, IL","3 bds · 3 ba · 2,367 sqft",http://www.zillow.com/homedetails/155-N-Harbor...,1350000.0,Condo For Sale,60601


In [9]:
df.shape

(4792, 6)

In [10]:
#turn "--" into 0 in bed_bath and turn "Studio" into "0 bds"
#AF note to self: with this and the other replace, I know there's got to be a way to do this in fewer lines
#I thought you could input two same-length lists into .replace, but it isn't working for me
#Something to figure out when I have more time
df["bed_bath"] = df["bed_bath"].str.replace("--", "0")
df["bed_bath"] = df["bed_bath"].str.replace("Studio", "0 bds")

In [11]:
#figuring out how to split the "bed_bath" strings
df["bed_bath"].str.split(" · ").head()

0    [3 bds, 4 ba, 2,700 sqft]
1       [1 bd, 1 ba, 850 sqft]
3    [2 bds, 3 ba, 1,902 sqft]
4    [2 bds, 3 ba, 1,500 sqft]
5    [3 bds, 3 ba, 2,367 sqft]
Name: bed_bath, dtype: object

In [12]:
#assigning the "bed_bath" splits to a new df (that I can join to the original df later)
df_bbsplit = pd.DataFrame(df["bed_bath"].str.split(" · ").tolist(), columns = ["beds", "baths", "sqft"])
df_bbsplit.head()

Unnamed: 0,beds,baths,sqft
0,3 bds,4 ba,"2,700 sqft"
1,1 bd,1 ba,850 sqft
2,2 bds,3 ba,"1,902 sqft"
3,2 bds,3 ba,"1,500 sqft"
4,3 bds,3 ba,"2,367 sqft"


In [13]:
#clean the data in the new df columns to change to ints
df_bbsplit["sqft"] = df_bbsplit["sqft"].str.replace(",",'')
df_bbsplit["sqft"] = df_bbsplit["sqft"].str.replace("sqft","")
df_bbsplit["sqft"] = df_bbsplit["sqft"].str.replace("+","")
df_bbsplit["beds"] = df_bbsplit["beds"].str.replace(" bds","")
df_bbsplit["beds"] = df_bbsplit["beds"].str.replace(" bd","")
df_bbsplit["baths"] = df_bbsplit["baths"].str.replace(" ba","")

In [14]:
df_bbsplit.head()

Unnamed: 0,beds,baths,sqft
0,3,4,2700
1,1,1,850
2,2,3,1902
3,2,3,1500
4,3,3,2367


In [15]:
df_bbsplit.shape

(4792, 3)

In [16]:
df_bbsplit.isnull().sum()

beds     0
baths    0
sqft     0
dtype: int64

In [17]:
#convert to ints
df_bbsplit["sqft"] = pd.to_numeric(df_bbsplit["sqft"])
df_bbsplit["beds"] = pd.to_numeric(df_bbsplit["beds"])
df_bbsplit["baths"] = pd.to_numeric(df_bbsplit["baths"])

In [18]:
df_bbsplit.dtypes

beds       int64
baths    float64
sqft       int64
dtype: object

In [19]:
df_bbsplit.shape

(4792, 3)

In [20]:
df_bbsplit.isnull().sum()

beds     0
baths    0
sqft     0
dtype: int64

In [21]:
df_bbsplit.head()

Unnamed: 0,beds,baths,sqft
0,3,4.0,2700
1,1,1.0,850
2,2,3.0,1902
3,2,3.0,1500
4,3,3.0,2367


In [22]:
df_bbsplit.tail()

Unnamed: 0,beds,baths,sqft
4787,4,3.0,1775
4788,4,2.0,910
4789,8,3.0,0
4790,3,1.0,1191
4791,4,2.0,1181


In [23]:
#operationalize target--figure out which values are high and which are low
df["price"].describe()

count    4.792000e+03
mean     4.668757e+05
std      8.031682e+05
min      1.070000e+00
25%      1.299000e+05
50%      2.790000e+05
75%      5.007250e+05
max      1.300000e+07
Name: price, dtype: float64

In [24]:
print(df["price"].median())
print(df["price"].mean())

279000.0
466875.6995972455


In [25]:
#create operationalized target-- >= to median (273500.0) is high, < is low
# go through values in value
# if high make value 1
# if low make value 0
# assign these values to a new column
# can I write this in one line of code without creating a function and using apply??
#instead use conditional statement with true/false
# df["price"] >= 273500.0 this will return true

df["is_high"] = df["price"].apply(lambda x: 1 if x >=273500.0 else 0)

df.head()

Unnamed: 0,address,bed_bath,more_info,price,status,zip_code,is_high
0,"360 E Randolph St # 601-602, Chicago, IL","3 bds · 4 ba · 2,700 sqft",http://www.zillow.com/homedetails/360-E-Randol...,1299000.0,Condo For Sale,60601,1
1,"8 E Randolph St UNIT 1006, Chicago, IL",1 bd · 1 ba · 850 sqft,http://www.zillow.com/homedetails/8-E-Randolph...,324900.0,Condo For Sale,60601,1
3,"340 E Randolph St APT 704, Chicago, IL","2 bds · 3 ba · 1,902 sqft",http://www.zillow.com/homedetails/340-E-Randol...,1099000.0,Condo For Sale,60601,1
4,"420 E Waterside Dr UNIT 310, Chicago, IL","2 bds · 3 ba · 1,500 sqft",http://www.zillow.com/homedetails/420-E-Waters...,567770.0,Condo For Sale,60601,1
5,"155 N Harbor Dr # 1212-13, Chicago, IL","3 bds · 3 ba · 2,367 sqft",http://www.zillow.com/homedetails/155-N-Harbor...,1350000.0,Condo For Sale,60601,1


In [26]:
df["is_high"].shape

(4792,)

In [27]:
#get dummies for status using the built-in n -1 thing.
df_status_dummies = pd.get_dummies(df["status"], drop_first=True)
df_status_dummies.head()

Unnamed: 0,Auction,Co-op For Sale,Coming Soon,Condo For Sale,For Sale by Owner,Foreclosure,House For Sale,Lot/Land For Sale,Make Me Move®,New Construction,Townhouse For Sale
0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0


In [28]:
#fixes index problem with status dummies
#index was using the numbers from the original dataset before dropped rows
df_status_dummies.index = range(len(df_status_dummies))

In [29]:
df_status_dummies.shape

(4792, 11)

In [30]:
df_status_dummies.tail()

Unnamed: 0,Auction,Co-op For Sale,Coming Soon,Condo For Sale,For Sale by Owner,Foreclosure,House For Sale,Lot/Land For Sale,Make Me Move®,New Construction,Townhouse For Sale
4787,0,0,0,0,0,0,1,0,0,0,0
4788,0,0,0,0,0,0,1,0,0,0,0
4789,0,0,0,0,0,0,0,0,0,0,0
4790,0,0,0,0,0,0,1,0,0,0,0
4791,0,0,0,0,0,0,1,0,0,0,0


In [31]:
#for some unknown reason I have to turn this into a df without the column
df_zip = df[["zip_code"]]
df_zip.head()

Unnamed: 0,zip_code
0,60601
1,60601
3,60601
4,60601
5,60601


In [32]:
#fix index to conform with bbsplit
df_zip.index = range(len(df_zip))

In [33]:
df_zip.tail()

Unnamed: 0,zip_code
4787,60827
4788,60827
4789,60827
4790,60827
4791,60827


In [34]:
df_zip.shape

(4792, 1)

In [35]:
#create version of original df with only relevant columns to join others to
#only column from original df that is relevant is "zip_code"
#first join zip to bb_split df
housing_data_intm = pd.merge(df_zip, df_bbsplit, how="inner", left_index=True, right_index=True)
housing_data_intm.head()
#note I can also [probably do this with .concat, but the merge seems to be working for now]

Unnamed: 0,zip_code,beds,baths,sqft
0,60601,3,4.0,2700
1,60601,1,1.0,850
2,60601,2,3.0,1902
3,60601,2,3.0,1500
4,60601,3,3.0,2367


In [36]:
housing_data_intm.tail()

Unnamed: 0,zip_code,beds,baths,sqft
4787,60827,4,3.0,1775
4788,60827,4,2.0,910
4789,60827,8,3.0,0
4790,60827,3,1.0,1191
4791,60827,4,2.0,1181


In [37]:
housing_data_intm.shape

(4792, 4)

In [38]:
#then join on dummies
housing_data = pd.merge(housing_data_intm, df_status_dummies, how="inner", left_index=True, right_index=True)

In [39]:
housing_data.shape

(4792, 15)

In [40]:
housing_data.head()

Unnamed: 0,zip_code,beds,baths,sqft,Auction,Co-op For Sale,Coming Soon,Condo For Sale,For Sale by Owner,Foreclosure,House For Sale,Lot/Land For Sale,Make Me Move®,New Construction,Townhouse For Sale
0,60601,3,4.0,2700,0,0,0,1,0,0,0,0,0,0,0
1,60601,1,1.0,850,0,0,0,1,0,0,0,0,0,0,0
2,60601,2,3.0,1902,0,0,0,1,0,0,0,0,0,0,0
3,60601,2,3.0,1500,0,0,0,1,0,0,0,0,0,0,0
4,60601,3,3.0,2367,0,0,0,1,0,0,0,0,0,0,0


In [41]:
housing_data.tail()

Unnamed: 0,zip_code,beds,baths,sqft,Auction,Co-op For Sale,Coming Soon,Condo For Sale,For Sale by Owner,Foreclosure,House For Sale,Lot/Land For Sale,Make Me Move®,New Construction,Townhouse For Sale
4787,60827,4,3.0,1775,0,0,0,0,0,0,1,0,0,0,0
4788,60827,4,2.0,910,0,0,0,0,0,0,1,0,0,0,0
4789,60827,8,3.0,0,0,0,0,0,0,0,0,0,0,0,0
4790,60827,3,1.0,1191,0,0,0,0,0,0,1,0,0,0,0
4791,60827,4,2.0,1181,0,0,0,0,0,0,1,0,0,0,0


In [42]:
housing_data.isnull().sum()

zip_code              0
beds                  0
baths                 0
sqft                  0
Auction               0
Co-op For Sale        0
Coming Soon           0
Condo For Sale        0
For Sale by Owner     0
Foreclosure           0
House For Sale        0
Lot/Land For Sale     0
Make Me Move®         0
New Construction      0
Townhouse For Sale    0
dtype: int64

In [43]:
#Anna note to self: there's some other weird thing to investigate here
#not all the status categories are showing up in the dummies
#maybe they all got dropped out with the dropna?
#Yes this seems to be the case per below
df["status"].value_counts()

Condo For Sale        1943
House For Sale        1335
Apartment For Sale     476
Auction                207
Foreclosure            193
For Sale by Owner      164
New Construction       143
Lot/Land For Sale      125
Townhouse For Sale     109
Make Me Move®           68
Co-op For Sale          20
Coming Soon              9
Name: status, dtype: int64