In [3]:
# Import modules
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
# The `plt` alias must point at the pyplot interface; `import matplotlib as plt`
# binds the top-level package instead, which has no plotting functions such as
# plt.subplots() or plt.show().
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# Initialise plotly's offline notebook mode so iplot() can render figures inline.
init_notebook_mode(connected=True)

In [2]:
# Load data: read the listings CSV (path is relative to the working directory) into df.
df = pd.read_csv('data/house_data.csv')

In [4]:
# Inspect the raw data: column dtypes and non-null counts, then the last rows.
# info() already reports the number of entries and columns; a bare `df.shape`
# in the middle of a cell is evaluated and thrown away, so it is omitted.
df.info()
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85509 entries, 0 to 85508
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        85509 non-null  object 
 1   Address      85509 non-null  object 
 2   Bedrooms     64999 non-null  object 
 3   Bathrooms    69439 non-null  object 
 4   Size         73698 non-null  object 
 5   Sale Status  69960 non-null  object 
 6   URL          85509 non-null  object 
 7   Raw Price    85509 non-null  float64
dtypes: float64(1), object(7)
memory usage: 5.2+ MB


Unnamed: 0,Price,Address,Bedrooms,Bathrooms,Size,Sale Status,URL,Raw Price
85504,"$79,000,000","2 Park Pl, New York, NY 10007",,1 ba,"9,680 sqft",Condo for sale,https://www.zillow.com/homedetails/2-Park-Pl-N...,79000000.0
85505,"$90,000,000","432 Park Ave #82, New York, NY 10022",6 bds,8 ba,"8,054 sqft",Condo for sale,https://www.zillow.com/homedetails/432-Park-Av...,90000000.0
85506,"$95,000,000","1441 Angelo Dr, Los Angeles, CA 90210",,,,Lot / Land for sale,https://www.zillow.com/homedetails/1441-Angelo...,95000000.0
85507,"$99,000,000","908 Bel Air Rd, Los Angeles, CA 90077",9 bds,20 ba,"34,000 sqft",House for sale,https://www.zillow.com/homedetails/908-Bel-Air...,99000000.0
85508,"$110,000,000","30 Beverly Park Ter, Beverly Hills, CA 90210",8 bds,12 ba,-- sqft,House for sale,https://www.zillow.com/homedetails/30-Beverly-...,110000000.0


In [5]:
# Drop incomplete rows, placeholder values and the Price / URL columns
def metric_deletion(x):
    """Return a cleaned copy of the listings frame; the input is left untouched.

    Steps, in order:
      1. drop every row that has a missing value in any column;
      2. drop rows whose Bedrooms, Bathrooms or Size hold the placeholder
         text '-- bds', '-- ba' or '-- sqft';
      3. drop the 'URL' and 'Price' columns.

    The original index labels of the surviving rows are kept.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with at least the columns Bedrooms, Bathrooms, Size, URL, Price.

    Returns
    -------
    pandas.DataFrame
        A new frame; `x` itself is not modified, so the cell can be re-run
        against the same raw data.
    """
    # Non-inplace dropna: the caller's frame must not lose rows as a side effect.
    complete = x.dropna(axis='rows')
    has_real_values = (
        (complete['Bedrooms'] != '-- bds')
        & (complete['Bathrooms'] != '-- ba')
        & (complete['Size'] != '-- sqft')
    )
    # drop() returns a fresh frame, avoiding in-place edits on a filtered slice.
    return complete.loc[has_real_values].drop(columns=['URL', 'Price'])
# Build the cleaned frame df1 from df and preview its first rows.
df1 = metric_deletion(df)
df1.head()

Unnamed: 0,Address,Bedrooms,Bathrooms,Size,Sale Status,Raw Price
5,"3515 W Thompson Rd, Indianapolis, IN 46217",2 bds,1 ba,814 sqft,House for sale,1.0
53,"3713 Hillside Ave, Indianapolis, IN 46218",2 bds,1 ba,"1,728 sqft",House for sale,775.0
65,"1337 W Livingston St APT 1, Allentown, PA 18102",3 bds,1 ba,"1,000 sqft",House for sale,1050.0
70,"1788 Westwood Dr, Troy, MI 48083",3 bds,2 ba,"1,418 sqft",House for sale,1600.0
72,"390 Rosado Springs St, Henderson, NV 89014",2 bds,2 ba,"1,060 sqft",Townhouse for sale,1700.0


In [6]:
# Convert Bathrooms and Bedrooms to float by stripping the ' ba' / ' bds' suffix.
# astype(str) comes first so the cell can be run again: on a second run the
# columns are already float and would otherwise have no .str accessor.
# regex=False makes the suffix a literal match regardless of pandas version.
df1['Bathrooms'] = df1['Bathrooms'].astype(str).str.replace(' ba', '', regex=False).astype(float)
df1['Bedrooms'] = df1['Bedrooms'].astype(str).str.replace(' bds', '', regex=False).astype(float)

In [7]:
# Converting Size to float
def filt_size(s):
    """Parse a size such as '1,728 sqft' into a float.

    Thousands separators (',') and the ' sqft' suffix are removed before the
    conversion. Values that are already numeric are accepted too, so applying
    the function to an already-converted column is harmless.

    Parameters
    ----------
    s : str or number
        Raw size text, or a number.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If what remains after stripping is not a number (e.g. '-- sqft').
    """
    # str() lets floats/ints through; they have no .replace of their own.
    cleaned = str(s).replace(',', '').replace(' sqft', '')
    return float(cleaned)
# Apply filt_size to every value so Size holds floats instead of text.
df1.Size = df1.Size.apply(filt_size)

In [8]:
# Split Address into Street, City, State and ZipCode, then build df2 without
# the Address column.
# Street is the first comma-separated part and City the second; State and
# ZipCode are the first and second space-separated tokens of the last part.
df1['Address'] = df1['Address'].astype('string')


def _state_zip_tokens(address):
    """Return the space-separated tokens of the last ', '-separated address part."""
    return address.split(', ')[-1].split(' ')


df1['Street'] = df1['Address'].apply(lambda address: address.split(', ')[0])
df1['City'] = df1['Address'].apply(lambda address: address.split(', ')[1])
df1['State'] = df1['Address'].apply(lambda address: _state_zip_tokens(address)[0])
df1['ZipCode'] = df1['Address'].apply(lambda address: _state_zip_tokens(address)[1])
df2 = df1.drop(columns=['Address'])


In [9]:
# Replace the index with a fresh default one and discard the old labels.
df2 = df2.reset_index(drop=True)

In [10]:
# Some rows had abnormal State / ZipCode values; the exact address was looked
# up by hand and the correct values are written in here.
# NOTE(review): .loc slices include both end labels, so this touches labels
# 28709, 28710 and 28711 (three rows, while the original note mentions two)
# - confirm the intended range.
bad_rows = slice(28709, 28711)
df2.loc[bad_rows, 'State'] = 'AZ'
df2.loc[bad_rows, 'ZipCode'] = '85260'

In [11]:
# Convert the address-derived columns to pandas' string dtype for further cleaning.
# astype() returns a new frame, so the result has to be bound back to df2;
# otherwise the conversion is only displayed and df2 keeps its old dtypes.
address_cols = ['Street', 'City', 'State', 'ZipCode']
df2 = df2.astype({column: 'string' for column in address_cols})
df2[address_cols]

Unnamed: 0,Street,City,State,ZipCode
0,3515 W Thompson Rd,Indianapolis,IN,46217
1,3713 Hillside Ave,Indianapolis,IN,46218
2,1337 W Livingston St APT 1,Allentown,PA,18102
3,1788 Westwood Dr,Troy,MI,48083
4,390 Rosado Springs St,Henderson,NV,89014
...,...,...,...,...
45394,111 W 57th St PENTHOUSE 72,New York,NY,10019
45395,0 Del Valle Rd,Livermore,CA,94550
45396,1060 Brooklawn Dr,Los Angeles,CA,90077
45397,432 Park Ave #82,New York,NY,10022


In [12]:
# Show the row whose ZipCode is 'N9V' - the listing is in Canada (ON).
is_n9v = df2['ZipCode'] == 'N9V'
df2[is_n9v]

Unnamed: 0,Bedrooms,Bathrooms,Size,Sale Status,Raw Price,Street,City,State,ZipCode
38379,4.0,4.0,2800.0,House for sale,865000.0,349 Benson Ct,Amherstburg,ON,N9V


In [13]:
# Drop the Canadian listing by value instead of by a hard-coded position.
# A positional drop removes whatever row happens to sit at that position, so
# running the cell twice (or on refreshed data) would silently delete a valid
# row; selecting on the ZipCode value removes exactly the offending row and is
# a no-op on re-run.
df2 = df2.drop(index=df2[df2['ZipCode'] == 'N9V'].index)

In [14]:
# With the non-numeric zip code gone, ZipCode can be cast to integers.
# NOTE(review): an integer cannot keep leading zeros ('02101' becomes 2101) -
# confirm that is acceptable for the later zip-code based steps.
df2['ZipCode'] = df2['ZipCode'].astype(int)


In [15]:
# Convert Sale Status into a short house Type label and rename the column.
# NOTE(review): labels are paired with statuses purely by position - the i-th
# distinct status found in the column gets the i-th label below. A change in
# row order or an extra status would silently mislabel rows (statuses beyond
# the sixth map to NaN); an explicit status -> label dict would be safer.
house_status = list(df2['Sale Status'].unique())
house_type = ['House', 'Townhouse', 'Multifamily', 'Condo', 'Others', 'Apartment']
type_by_status = dict(zip(house_status, house_type))
df2['Sale Status'] = df2['Sale Status'].map(type_by_status)
df2 = df2.rename(columns={"Sale Status": "Type"})


In [16]:
# The Street column is not expected to be useful, so df3 is built without it.
df3 = df2.drop(columns=['Street'])

In [17]:
# Rename 'Raw Price' to 'Price' and add perSqFt = Price / Size.
df3 = df3.rename(columns={"Raw Price": "Price"})
df3['perSqFt'] = df3['Price'].div(df3['Size'])

In [18]:
# A row was dropped earlier, which leaves a gap in the index labels;
# reset the index again (drop=True discards the old labels instead of keeping them as a column).
df3 = df3.reset_index(drop=True)


In [19]:
# Assign Regions
# Each state code (plus DC) appears in exactly one list: 'WI' used to be listed
# twice in midwest and 'VT' was missing from every region.
west = ['CA', 'NV', 'AK', 'WA', 'OR', 'ID', 'MT', 'WY', 'UT', 'CO', 'AZ', 'NM', 'HI']
midwest = ['ND', 'WI', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'IL', 'IN', 'OH', 'MI']
north = ['PA', 'NY', 'NH', 'VT', 'MA', 'CT', 'ME', 'DC', 'NJ', 'RI']
south = ['TX', 'OK', 'AR', 'LA', 'MS', 'AL', 'TN', 'KY', 'WV', 'VA', 'MD', 'DE', 'NC', 'SC', 'GA', 'FL']
full_state_list = west + midwest + north + south

# Creating function to assign regions
def find_region(state):
    """Map a two-letter state code to its region name.

    Parameters
    ----------
    state : str
        State code such as 'CA'.

    Returns
    -------
    str
        'West', 'North', 'South' or 'MidWest'. A code that is in none of the
        region lists is returned unchanged.
    """
    regions = (
        ('West', west),
        ('North', north),
        ('South', south),
        ('MidWest', midwest),
    )
    for region_name, members in regions:
        if state in members:
            return region_name
    # Unknown codes fall through as-is (kept for backward compatibility).
    return state

# Add a Region column by looking up each row's State with find_region.
df3['Region'] = df3['State'].apply(find_region)

In [35]:
# Show df3 as it stands after the Region column has been added.
df3

Unnamed: 0,Bedrooms,Bathrooms,Size,Type,Price,City,State,ZipCode,perSqFt,Region
0,2.00,1.00,814.00,House,1.00,Indianapolis,IN,46217,0.00,MidWest
1,2.00,1.00,1728.00,House,775.00,Indianapolis,IN,46218,0.45,MidWest
2,3.00,1.00,1000.00,House,1050.00,Allentown,PA,18102,1.05,North
3,3.00,2.00,1418.00,House,1600.00,Troy,MI,48083,1.13,MidWest
4,2.00,2.00,1060.00,Townhouse,1700.00,Henderson,NV,89014,1.60,West
...,...,...,...,...,...,...,...,...,...,...
45393,4.00,6.00,7130.00,Condo,66000000.00,New York,NY,10019,9256.66,North
45394,4.00,2.00,2500.00,House,68000000.00,Livermore,CA,94550,27200.00,West
45395,13.00,17.00,15011.00,House,75000000.00,Los Angeles,CA,90077,4996.34,West
45396,6.00,8.00,8054.00,Condo,90000000.00,New York,NY,10022,11174.57,North


In [39]:
# Check which states do not have enough rows to support a meaningful analysis.
def too_little_data(table, min_rows=30, states=None):
    """Return the state codes that have fewer than `min_rows` rows in `table`.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a 'State' column.
    min_rows : int, default 30
        Minimum number of rows a state needs in order not to be reported.
    states : iterable of str, optional
        State codes to check; defaults to the notebook-level `full_state_list`.

    Returns
    -------
    list of str
        The under-represented codes, in the order they appear in `states`.
        States that do not occur in `table` at all count as 0 rows and are
        therefore reported as well.
    """
    if states is None:
        states = full_state_list
    # Count every state once instead of filtering the whole table per state.
    rows_per_state = table['State'].value_counts()
    return [state for state in states if rows_per_state.get(state, 0) < min_rows]
# List the states from full_state_list that have fewer than 30 rows in df3.
too_little_data(df3)


['WY', 'HI', 'NH', 'WV']

In [41]:
# Keep only the rows whose State is not among the under-represented states.
sparse_states = too_little_data(df3)
keep_row = ~df3['State'].isin(sparse_states)
df4 = df3[keep_row]
df4

Unnamed: 0,Bedrooms,Bathrooms,Size,Type,Price,City,State,ZipCode,perSqFt,Region
0,2.00,1.00,814.00,House,1.00,Indianapolis,IN,46217,0.00,MidWest
1,2.00,1.00,1728.00,House,775.00,Indianapolis,IN,46218,0.45,MidWest
2,3.00,1.00,1000.00,House,1050.00,Allentown,PA,18102,1.05,North
3,3.00,2.00,1418.00,House,1600.00,Troy,MI,48083,1.13,MidWest
4,2.00,2.00,1060.00,Townhouse,1700.00,Henderson,NV,89014,1.60,West
...,...,...,...,...,...,...,...,...,...,...
45393,4.00,6.00,7130.00,Condo,66000000.00,New York,NY,10019,9256.66,North
45394,4.00,2.00,2500.00,House,68000000.00,Livermore,CA,94550,27200.00,West
45395,13.00,17.00,15011.00,House,75000000.00,Los Angeles,CA,90077,4996.34,West
45396,6.00,8.00,8054.00,Condo,90000000.00,New York,NY,10022,11174.57,North


In [42]:
# Render floats with two decimals, then show perSqFt aggregated per State
# (pivot_table's default aggregation is the mean).
pd.set_option('display.float_format', '{:.2f}'.format)
df4.pivot_table(values=['perSqFt'], index=['State'])


Unnamed: 0_level_0,perSqFt
State,Unnamed: 1_level_1
AK,175.69
AL,136.81
AR,142.48
AZ,578.91
CA,498.29
CO,336.63
CT,212.42
DC,574.93
DE,147.46
FL,231.73


In [None]:
# Disabled choropleth draft: the code sits inside a bare string literal, so
# none of it runs.
# NOTE(review): possible reasons it misbehaves, to be confirmed - it passes one
# entry per listing (df3['State'] / df3['Price']) where a choropleth presumably
# wants one value per state, and it plots Price while the colour bar is titled
# 'perSqFt'.
'''
NOT WORKING CORRECTLY


data = dict(type = 'choropleth',colorscale='Portland',locations=df3['State'],locationmode='USA-states', z=df3['Price'],text=df3['Price'], colorbar={'title':'perSqFt'})
choromap = go.Figure(data = [data],layout = dict(geo = {'scope':'usa'}))

iplot(choromap,validate=False)
'''

In [None]:
# for GeoPandas later use
# df2 = df2[df2['ZipCode'].between(10000,99999,inclusive='both')]