In [22]:
#Data cleanup here

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import csv
import json
import requests
import seaborn as sns
import plotly.plotly as py
import time

In [3]:
# Pull csv using URL of dataset uploaded to repo

url = 'https://raw.githubusercontent.com/calderon0423/Commitment-Issues/master/Dataset/'

all_homes = pd.read_csv(f'{url}State_MedianRentalPrice_AllHomes.csv')
del all_homes['Unnamed: 0']
all_homes.head()

Unnamed: 0,RegionName,SizeRank,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,...,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
0,California,1,,,2400.0,2400.0,2400.0,2495.0,2500.0,2595.0,...,2750.0,2800,2800.0,2800.0,2800.0,2850,2800.0,2800.0,2800.0,2800.0
1,Texas,2,,,,,,,,,...,1550.0,1575,1595.0,1595.0,1600.0,1595,1590.0,1550.0,1550.0,1550.0
2,New York,3,,,,,,,,,...,3200.0,3495,3299.0,3295.0,3200.0,3395,3250.0,2950.0,3000.0,3200.0
3,Florida,4,,,,,,,,,...,1850.0,1850,1850.0,1845.0,1800.0,1800,1800.0,1800.0,1795.0,1790.0
4,Illinois,5,1575.0,1500.0,1500.0,1500.0,1500.0,1500.0,1550.0,1520.0,...,1600.0,1600,1600.0,1642.5,1647.0,1650,1625.0,1600.0,1595.0,1590.0


In [4]:
# Convert the data to long form
all_homes_long = pd.melt(all_homes, id_vars=['RegionName', 'SizeRank'])

#Rename columns
all_homes_long.columns = ['RegionName', 'SizeRank', 'Time', 'Value']

missing_months = all_homes_long.groupby(['RegionName', 'Time'])['Value'].count()
missing_months = missing_months.reset_index()
missing_months

Unnamed: 0,RegionName,Time,Value
0,Alabama,2010-01,0
1,Alabama,2010-02,0
2,Alabama,2010-03,0
3,Alabama,2010-04,0
4,Alabama,2010-05,1
...,...,...,...
6115,Wyoming,2019-08,1
6116,Wyoming,2019-09,1
6117,Wyoming,2019-10,1
6118,Wyoming,2019-11,1


In [5]:
#Filter for years 2016 and 2017
all_homes_long = all_homes_long.loc[all_homes_long['Time'].isin(['2016-01', '2016-02', '2016-03', '2016-04', 
                                                                 '2016-05', '2016-06', '2016-07', '2016-08', 
                                                                 '2016-09', '2016-10', '2016-11', '2016-12',
                                                                 '2017-01', '2017-02', '2017-03', '2017-04', 
                                                                 '2017-05', '2017-06', '2017-07', '2017-08', 
                                                                 '2017-09', '2017-10', '2017-11', '2017-12'])]

#Drop NAs after filtering so NAs in previous years don't affect
all_homes_long = all_homes_long.dropna()

#View counts for each state to exclude states with less than 24 months of data
all_homes_long.RegionName.value_counts()

Kentucky                24
Minnesota               24
South Dakota            24
Alabama                 24
New Hampshire           24
Pennsylvania            24
Illinois                24
Hawaii                  24
Iowa                    24
Washington              24
Louisiana               24
Indiana                 24
Alaska                  24
Oklahoma                24
Georgia                 24
Massachusetts           24
Utah                    24
Arkansas                24
Idaho                   24
Arizona                 24
South Carolina          24
Oregon                  24
Michigan                24
New Mexico              24
Texas                   24
West Virginia           24
Delaware                24
Montana                 24
New Jersey              24
Mississippi             24
Tennessee               24
Maine                   24
Wyoming                 24
Virginia                24
Missouri                24
Nevada                  24
California              24
N

In [6]:
#Exclude Nebraska and Puerto Rico as they have less than 24 months of data
all_homes_long = all_homes_long.loc[~all_homes_long.RegionName.isin(['Puerto Rico', 'Nebraska'])]

all_homes_long


Unnamed: 0,RegionName,SizeRank,Time,Value
3672,California,1,2016-01,2300.0
3673,Texas,2,2016-01,1400.0
3674,New York,3,2016-01,3200.0
3675,Florida,4,2016-01,1700.0
3676,Illinois,5,2016-01,1550.0
...,...,...,...,...
4890,South Dakota,46,2017-12,1097.5
4891,Alaska,47,2017-12,1595.0
4892,North Dakota,48,2017-12,1170.0
4893,District of Columbia,49,2017-12,2595.0


In [7]:
#Convert Time to proper date-time
all_homes_long['Time'] = pd.to_datetime(all_homes_long['Time']) 
all_homes_long['Year'] = pd.DatetimeIndex(all_homes_long['Time']).year
all_homes_long['Month'] = pd.DatetimeIndex(all_homes_long['Time']).month
all_homes_long.to_csv('all_homes_long.csv')

all_homes_long


Unnamed: 0,RegionName,SizeRank,Time,Value,Year,Month
3672,California,1,2016-01-01,2300.0,2016,1
3673,Texas,2,2016-01-01,1400.0,2016,1
3674,New York,3,2016-01-01,3200.0,2016,1
3675,Florida,4,2016-01-01,1700.0,2016,1
3676,Illinois,5,2016-01-01,1550.0,2016,1
...,...,...,...,...,...,...
4890,South Dakota,46,2017-12-01,1097.5,2017,12
4891,Alaska,47,2017-12-01,1595.0,2017,12
4892,North Dakota,48,2017-12-01,1170.0,2017,12
4893,District of Columbia,49,2017-12-01,2595.0,2017,12


In [92]:
%matplotlib notebook

#Plot the change in rent from Jan 2016 to Dec 2017 for each state
sns.lineplot(x="Time", y="Value", data=all_homes_long, hue='RegionName', legend='full')
plt.xticks(rotation=45)

# Move the legend to an empty part of the plot
plt.legend(loc='upper center', ncol=4)
plt.ylim(500, 8000)
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [9]:
#Create datasets 
merged_df_2016 = all_homes_long.loc[(all_homes_long['Time'] >= '2016-01-01') & (all_homes_long['Time'] <= '2016-12-01')]

merged_df_2017 = all_homes_long.loc[(all_homes_long['Time'] >= '2017-01-01') & (all_homes_long['Time'] <= '2017-12-01')]


In [21]:
merged_df_2016_annual = merged_df_2016.groupby(['RegionName']).mean().reset_index()

for col in merged_df_2016_annual.columns:
    merged_df_2016_annual[col] = merged_df_2016_annual[col].astype(str)

scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = merged_df_2016_annual['RegionName'],
        z = merged_df_2016_annual['Value'].astype(float),
        locationmode = 'USA-states',
        text = merged_df_2016_annual['Value'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            )
        ),
        colorbar = dict(
            title = "USD"
        )
    ) ]

layout = dict(
        title = '2016 Zillow Rental Prices by State',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
        ),
    )

fig = dict(data=data, layout=layout)

url = py.plot(fig, filename='d3-cloropleth-map')


AttributeError: module 'plotly.plotly' has no attribute 'plot'

In [94]:
average_median_value = all_homes_long.groupby(['Year', 'RegionName'])['Value'].mean()
average_median_value = average_median_value.reset_index()
average_median_value.head()

Unnamed: 0,Year,RegionName,Value
0,2016,Alabama,974.166667
1,2016,Alaska,1643.958333
2,2016,Arizona,1251.666667
3,2016,Arkansas,1019.166667
4,2016,California,2373.75


In [45]:
average_median_value.loc[average_median_value['Year'] == 2016].nlargest(3, 'Value')

Unnamed: 0,Year,RegionName,Value
31,2016,New York,3079.166667
8,2016,District of Columbia,2535.416667
21,2016,Massachusetts,2464.125


In [71]:
average_median_value.loc[average_median_value['Year'] == 2017].nlargest(3, 'Value')

Unnamed: 0,Year,RegionName,Value
80,2017,New York,3189.0
57,2017,District of Columbia,2606.363636
53,2017,California,2603.181818


In [77]:
average_median_value.loc[average_median_value['Year'] == 2016].nsmallest(3, 'Value')

Unnamed: 0,Year,RegionName,Value
25,2016,Missouri,923.75
0,2016,Alabama,974.166667
14,2016,Indiana,990.166667


In [78]:
average_median_value.loc[average_median_value['Year'] == 2017].nsmallest(3, 'Value')

Unnamed: 0,Year,RegionName,Value
74,2017,Missouri,944.090909
95,2017,West Virginia,990.227273
49,2017,Alabama,997.727273


In [None]:
#Enter any hubs we want to filter by
hubs = ['New York', 'California', 'District of Columbia', 'Missouri', 'West Virginia', 'Alabama']