In [138]:
# Dependencies
import pandas as pd
import numpy as np
import requests
import json
import geopandas as gpd
import matplotlib.pyplot as plt
import hvplot.pandas

# Import the API key
from config2 import api_key
from config2 import weather_api_key

In [99]:
#Create Dataframe with Zillow Indicator Data
zillow_indicators = pd.read_csv("Zillow Indicators.csv")
zillow_indicators.head()

Unnamed: 0,indicator_id,indicator,category
0,MRAM,"Mean Days to Pending (Raw, All Homes, Monthly)",Inventory and sales
1,SSSW,"Median Sale Price (Smooth, SFR only, Weekly View)",Inventory and sales
2,LSSM,"Median List Price (Smooth, SFR Only, Monthly)",Inventory and sales
3,CSAW,"Share of Listings With a Price Cut (Smooth, Al...",Inventory and sales
4,ISSM,"For-Sale Inventory (Smooth, SFR only, Monthly)",Inventory and sales


In [100]:
#Create Dataframe with Zillow Region Data
zillow_regions = pd.read_csv("Zillow Regions.csv")
zillow_regions.head()

Unnamed: 0,region_id,region_type,region
0,1286,county,Orange County;CA;Los Angeles-Long Beach-Anahei...
1,3175,county,Philadelphia County;PA;Philadelphia-Camden-Wil...
2,3017,county,Sacramento County;CA;Sacramento-Roseville-Fols...
3,401,county,"Bronx County;NY;New York-Newark-Jersey City, N..."
4,3165,county,Hillsborough County;FL;Tampa-St. Petersburg-Cl...


In [101]:
#Search regions Dataframe for Chicago
p = "Chicago"
chicago_regions = zillow_regions[zillow_regions['region'].str.contains(p)]
chicago_regions.head()

Unnamed: 0,region_id,region_type,region
425,10215,city,"Aurora;IL;Chicago-Naperville-Elgin, IL-IN-WI;K..."
607,41587,city,"Waukegan;IL;Chicago-Naperville-Elgin, IL-IN-WI..."
646,44597,city,"Cicero;IL;Chicago-Naperville-Elgin, IL-IN-WI;C..."
654,41460,city,"Valparaiso;IN;Chicago-Naperville-Elgin, IL-IN-..."
666,50764,city,"Arlington Heights;IL;Chicago-Naperville-Elgin,..."


In [102]:
#find all the region types
chicago_regions["region_type"].unique()

array(['city', 'county', 'neigh', 'zip', 'metro'], dtype=object)

In [103]:
#create the new dataframes
city_regions = chicago_regions.loc[chicago_regions["region_type"] == "city"]
county_regions = chicago_regions.loc[chicago_regions["region_type"] == "county"]
neigh_regions = chicago_regions.loc[chicago_regions["region_type"] == "neigh"]
zip_regions = chicago_regions.loc[chicago_regions["region_type"] == "zip"]
metro_regions = chicago_regions.loc[chicago_regions["region_type"] == "metro"]

City Dataframe Cleaning

In [104]:
city_regions.head()

Unnamed: 0,region_id,region_type,region
425,10215,city,"Aurora;IL;Chicago-Naperville-Elgin, IL-IN-WI;K..."
607,41587,city,"Waukegan;IL;Chicago-Naperville-Elgin, IL-IN-WI..."
646,44597,city,"Cicero;IL;Chicago-Naperville-Elgin, IL-IN-WI;C..."
654,41460,city,"Valparaiso;IN;Chicago-Naperville-Elgin, IL-IN-..."
666,50764,city,"Arlington Heights;IL;Chicago-Naperville-Elgin,..."


In [105]:
#clean the data by splitting region
split_region = city_regions['region'].str.split(';', n=4, expand=True)
split_region.columns = ['city', 'state', 'state_region', 'county']


In [106]:
# Concatenating the split columns with the original DataFrame
city_regions = pd.concat([city_regions, split_region], axis=1)

# Dropping the original 'region' column
city_regions.drop(columns=['region'], inplace=True)


In [107]:
city_regions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 370 entries, 425 to 89052
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   region_id     370 non-null    int64 
 1   region_type   370 non-null    object
 2   city          370 non-null    object
 3   state         370 non-null    object
 4   state_region  370 non-null    object
 5   county        370 non-null    object
dtypes: int64(1), object(5)
memory usage: 20.2+ KB


In [108]:
city_regions.head()

Unnamed: 0,region_id,region_type,city,state,state_region,county
425,10215,city,Aurora,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Kane County
607,41587,city,Waukegan,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Lake County
646,44597,city,Cicero,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County
654,41460,city,Valparaiso,IN,"Chicago-Naperville-Elgin, IL-IN-WI",Porter County
666,50764,city,Arlington Heights,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County


County Dataframe Cleaning

In [109]:
county_regions.head()

Unnamed: 0,region_id,region_type,region
1453,139,county,"Cook County;IL;Chicago-Naperville-Elgin, IL-IN-WI"
6716,1682,county,"DuPage County;IL;Chicago-Naperville-Elgin, IL-..."
13058,220,county,"Lake County;IL;Chicago-Naperville-Elgin, IL-IN-WI"
14819,330,county,"Will County;IL;Chicago-Naperville-Elgin, IL-IN-WI"
18197,197,county,"Kane County;IL;Chicago-Naperville-Elgin, IL-IN-WI"


In [110]:
#clean the data by splitting region
split_region = county_regions['region'].str.split(';', n=3, expand=True)
split_region.columns = ['county', 'state', 'state_region']

In [111]:
# Concatenating the split columns with the original DataFrame
county_regions = pd.concat([county_regions, split_region], axis=1)

# Dropping the original 'region' column
county_regions.drop(columns=['region'], inplace=True)

In [112]:
#check that all columns are full
county_regions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 1453 to 36340
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   region_id     14 non-null     int64 
 1   region_type   14 non-null     object
 2   county        14 non-null     object
 3   state         14 non-null     object
 4   state_region  14 non-null     object
dtypes: int64(1), object(4)
memory usage: 672.0+ bytes


In [113]:
#check the dataframe
county_regions.head()

Unnamed: 0,region_id,region_type,county,state,state_region
1453,139,county,Cook County,IL,"Chicago-Naperville-Elgin, IL-IN-WI"
6716,1682,county,DuPage County,IL,"Chicago-Naperville-Elgin, IL-IN-WI"
13058,220,county,Lake County,IL,"Chicago-Naperville-Elgin, IL-IN-WI"
14819,330,county,Will County,IL,"Chicago-Naperville-Elgin, IL-IN-WI"
18197,197,county,Kane County,IL,"Chicago-Naperville-Elgin, IL-IN-WI"


Neighborhood Dataframe Cleaning

In [114]:
#investigate data frame
neigh_regions.head()

Unnamed: 0,region_id,region_type,region
3705,403295,neigh,Lake View East; IL; Chicago-Naperville-Elgin; ...
5918,403288,neigh,Old Irving Park; IL; Chicago-Naperville-Elgin;...
7216,403286,neigh,Mayfair; IL; Chicago-Naperville-Elgin; Cook Co...
7245,403271,neigh,North Mayfair; IL; Chicago-Naperville-Elgin; C...
8372,403306,neigh,Sheffield Neighbors; IL; Chicago-Naperville-El...


In [115]:
#split the "region" column
split_region = neigh_regions['region'].str.split(';', n=5, expand=True)
split_region.columns = ['neighborhood', 'state', 'state_region', 'county', 'city']

In [116]:
# Concatenating the split columns with the original DataFrame
neigh_regions = pd.concat([neigh_regions, split_region], axis=1)

# Dropping the original 'region' column
neigh_regions.drop(columns=['region'], inplace=True)

In [117]:
#check the dataframe
neigh_regions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 563 entries, 3705 to 84483
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   region_id     563 non-null    int64 
 1   region_type   563 non-null    object
 2   neighborhood  563 non-null    object
 3   state         563 non-null    object
 4   state_region  563 non-null    object
 5   county        563 non-null    object
 6   city          563 non-null    object
dtypes: int64(1), object(6)
memory usage: 35.2+ KB


In [118]:
#view the final dataframe
neigh_regions.head()

Unnamed: 0,region_id,region_type,neighborhood,state,state_region,county,city
3705,403295,neigh,Lake View East,IL,Chicago-Naperville-Elgin,Cook County,Chicago
5918,403288,neigh,Old Irving Park,IL,Chicago-Naperville-Elgin,Cook County,Chicago
7216,403286,neigh,Mayfair,IL,Chicago-Naperville-Elgin,Cook County,Chicago
7245,403271,neigh,North Mayfair,IL,Chicago-Naperville-Elgin,Cook County,Chicago
8372,403306,neigh,Sheffield Neighbors,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,Chicago


Zipcode Dataframe Cleaning

In [144]:
#investigate the dataframe
zip_regions.head()

Unnamed: 0,region_id,region_type,zipcode,state,state_region,city,county
34764,78144,zip,46411,IN,Chicago-Naperville-Elgin,Whiting,Lake County
45696,84392,zip,60141,IL,Chicago-Naperville-Elgin,Broadview,Cook County
45776,84630,zip,60629,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago,Cook County
45797,84620,zip,60618,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago,Cook County
45805,84639,zip,60639,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago,Cook County


In [120]:
#split the "region" column
split_region = zip_regions['region'].str.split(';', n=5, expand=True)
split_region.columns = ['zipcode', 'state', 'state_region', 'city', 'county', 'blank']

In [121]:
# Concatenating the split columns with the original DataFrame
zip_regions = pd.concat([zip_regions, split_region], axis=1)

# Dropping the original 'region' column
zip_regions.drop(columns=['region'], inplace=True)
zip_regions.drop(columns=['blank'], inplace=True)

In [122]:
#view the final dataframe
zip_regions.head()

Unnamed: 0,region_id,region_type,zipcode,state,state_region,city,county
34764,78144,zip,46411,IN,Chicago-Naperville-Elgin,Whiting,Lake County
45696,84392,zip,60141,IL,Chicago-Naperville-Elgin,Broadview,Cook County
45776,84630,zip,60629,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago,Cook County
45797,84620,zip,60618,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago,Cook County
45805,84639,zip,60639,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Chicago,Cook County


In [123]:
#check that the county info is correct
zip_regions["county"].value_counts()

county
Cook County        164
Lake County         54
DuPage County       36
Will County         30
McHenry County      19
Kane County         19
DeKalb County       12
Kenosha County      10
Grundy County        8
Porter County        8
Kendall County       7
Newton County        6
Jasper County        5
 Kane County         3
 Kenosha County      2
 Cook County         1
 DeKalb County       1
 Kendall County      1
 DuPage County       1
 Newton County       1
 Porter County       1
Name: count, dtype: int64

Metro Region DataFrame Cleaning

In [124]:
#investigate metro region
metro_regions.head()

Unnamed: 0,region_id,region_type,region
89098,394463,metro,"Chicago, IL"


In [125]:
#clean data by splitting "region" column
split_region = metro_regions['region'].str.split(',', n=1, expand=True)
split_region.columns = ['city', 'state']

In [126]:
# Concatenating the split columns with the original DataFrame
metro_regions = pd.concat([metro_regions, split_region], axis=1)

# Dropping the original 'region' column
metro_regions.drop(columns=['region'], inplace=True)

In [127]:
#view dataframe
metro_regions

Unnamed: 0,region_id,region_type,city,state
89098,394463,metro,Chicago,IL


In [128]:
# Export categories_df and subcategories_df as CSV files.
metro_regions.to_csv("Resources/metro regions.csv", index=False)
zip_regions.to_csv("Resources/zip regions.csv", index=False)
neigh_regions.to_csv("Resources/neigh regions.csv", index=False)
city_regions.to_csv("Resources/city regions.csv", index=False)
county_regions.to_csv("Resources/county regions.csv", index=False)

Getting Lat/Long Coordinates

In [131]:
#create list of all cities near chicago
cities = city_regions["city"].tolist()


In [132]:
# Set the API base URL
url = "http://api.openweathermap.org/data/2.5/weather?"
query_url = url + "appid=" + weather_api_key

# Define an empty list to fetch the lat and lng for each city
city_data = []

# Print to logger
print("Beginning Data Retrieval     ")
print("-----------------------------")

# Create counters
record_count = 1
set_count = 1

# Loop through all the cities in our list to fetch weather data
for i, city in enumerate(cities):
        
    # Group cities in sets of 50 for logging purposes
    if (i % 50 == 0 and i >= 50):
        set_count += 1
        record_count = 0

    # Create endpoint URL with each city
    city_url = f"{query_url}&q={city}"
    
    # Log the url, record, and set numbers
    print("Processing Record %s of Set %s | %s" % (record_count, set_count, city))

    # Add 1 to the record count
    record_count += 1

    # Run an API request for each of the cities
    try:
        # Parse the JSON and retrieve data
        city_weather = requests.get(city_url).json()

        # Parse out latitude, longitude
        city_lat = city_weather["coord"]["lat"]
        city_lng = city_weather["coord"]["lon"]

        # Append the City information into city_data list
        city_data.append({"city": city, 
                          "Lat": city_lat, 
                          "Lng": city_lng,})

    # If an error is experienced, skip the city
    except:
        print("City not found. Skipping...")
        pass
              
# Indicate that Data Loading is complete 
print("-----------------------------")
print("Data Retrieval Complete      ")
print("-----------------------------")

Beginning Data Retrieval     
-----------------------------
Processing Record 1 of Set 1 | Aurora
Processing Record 2 of Set 1 | Waukegan
Processing Record 3 of Set 1 | Cicero
Processing Record 4 of Set 1 | Valparaiso
Processing Record 5 of Set 1 | Arlington Heights
Processing Record 6 of Set 1 | Des Plaines
Processing Record 7 of Set 1 | Evanston
Processing Record 8 of Set 1 | Schaumburg
Processing Record 9 of Set 1 | Gary
Processing Record 10 of Set 1 | Bolingbrook
Processing Record 11 of Set 1 | Palatine
Processing Record 12 of Set 1 | Orland Park
Processing Record 13 of Set 1 | Crown Point
Processing Record 14 of Set 1 | Skokie
Processing Record 15 of Set 1 | Downers Grove
Processing Record 16 of Set 1 | Oak Lawn
Processing Record 17 of Set 1 | Tinley Park
Processing Record 18 of Set 1 | Wheaton
Processing Record 19 of Set 1 | Mount Prospect
Processing Record 20 of Set 1 | Glenview
Processing Record 21 of Set 1 | Berwyn
Processing Record 22 of Set 1 | Oak Park
Processing Record 23 

In [133]:
#create a data frame
city_df = pd.DataFrame(city_data)
city_df

Unnamed: 0,city,Lat,Lng
0,Aurora,39.7294,-104.8319
1,Waukegan,42.3636,-87.8448
2,Cicero,41.8456,-87.7539
3,Valparaiso,-33.0393,-71.6273
4,Arlington Heights,42.0884,-87.9806
...,...,...,...
362,Kaneville,41.8353,-88.5220
363,Carbon Hill,33.8918,-87.5261
364,Burlington,39.8835,-74.6413
365,Golf,-8.8620,13.2568


In [134]:
chicago_data = pd.merge(city_regions, city_df, on=["city"])
chicago_data

Unnamed: 0,region_id,region_type,city,state,state_region,county,Lat,Lng
0,10215,city,Aurora,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Kane County,39.7294,-104.8319
1,41587,city,Waukegan,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Lake County,42.3636,-87.8448
2,44597,city,Cicero,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,41.8456,-87.7539
3,41460,city,Valparaiso,IN,"Chicago-Naperville-Elgin, IL-IN-WI",Porter County,-33.0393,-71.6273
4,50764,city,Arlington Heights,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,42.0884,-87.9806
...,...,...,...,...,...,...,...,...
364,52726,city,Kaneville,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Kane County,41.8353,-88.5220
365,34961,city,Carbon Hill,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Grundy County,33.8918,-87.5261
366,51191,city,Burlington,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Kane County,39.8835,-74.6413
367,31795,city,Golf,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,-8.8620,13.2568


In [135]:
zillow_data = pd.read_csv("Zillow Data.csv")
zillow_data.head()

Unnamed: 0,indicator_id,region_id,date,value
0,Z4BR,403211,2/28/2021,477466.0
1,ZALL,26591,11/30/2005,76985.30089
2,ZALL,3986,11/30/2005,176636.5842
3,ZALL,6626,11/30/2005,161867.5164
4,ZALL,49285,11/30/2005,191388.1045


In [136]:
#find the average value per region_id
zillow_averages = zillow_data.groupby("region_id")["value"].mean()

In [137]:
#combine to get the average value per city in one dataframe
chicago_values = pd.merge(chicago_data,zillow_averages, on=["region_id"])
chicago_values

Unnamed: 0,region_id,region_type,city,state,state_region,county,Lat,Lng,value
0,10215,city,Aurora,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Kane County,39.7294,-104.8319,176352.206550
1,41587,city,Waukegan,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Lake County,42.3636,-87.8448,142347.593150
2,44597,city,Cicero,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,41.8456,-87.7539,174425.544050
3,41460,city,Valparaiso,IN,"Chicago-Naperville-Elgin, IL-IN-WI",Porter County,-33.0393,-71.6273,172825.143400
4,50764,city,Arlington Heights,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,42.0884,-87.9806,340394.874750
...,...,...,...,...,...,...,...,...,...
303,25758,city,Markham,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,43.8668,-79.2663,60677.650175
304,395497,city,Campton Hills,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Kane County,41.9366,-88.3975,469563.241800
305,45903,city,Itasca,IL,"Chicago-Naperville-Elgin, IL-IN-WI",DuPage County,41.9750,-88.0073,363298.290450
306,26755,city,Richton Park,IL,"Chicago-Naperville-Elgin, IL-IN-WI",Cook County,41.4845,-87.7034,130409.363100


In [146]:
chicago_map = chicago_values.hvplot.points(
    "Lng",
    "Lat",
    geo = True,
    tiles = "OSM",
    size = 3,
    color = "city")

# Display the map
chicago_map