Project 1 - Terrence Cummings
- Crime, Schools, and Home Values in Minneapolis

Step 1 - Housing data
- Read in housing data from the "Open Minneapolis" website using its API (each call returns at most 2,000 records, so the data is fetched in pages using a record offset)
- Store in dataframe
- Cleanup data
- Convert x, y map coordinates to lat/lng

In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import scipy.stats as st
import pandas as pd
import numpy as np
import geopandas as gpd
import requests
import time
from scipy.stats import linregress
import json
from pprint import pprint
from datetime import datetime


In [4]:
# Initialize lists to hold key home sales data
sale_id = []
sale_date = []
formatted_address = []
land_sale = []
community_cd = []
community_desc = []
nbhd_cd = []
nbhd_desc = []
ward = []
proptype_cd = []
proptype_desc = []
adj_sale_price = []
gross_sale_price = []
downpayment = []
x_coord = []
y_coord = []

# Map each API attribute name to the list that collects its values, so the
# fill loop below does not repeat the same nested lookup for every field.
# NOTE: 'FORMATED_ADDRESS' is spelled the way the API field is requested in
# this notebook; do not "correct" it.
field_targets = {
    'SALE_ID': sale_id,
    'SALE_DATE': sale_date,
    'FORMATED_ADDRESS': formatted_address,
    'LAND_SALE': land_sale,
    'COMMUNITY_CD': community_cd,
    'COMMUNITY_DESC': community_desc,
    'NBHD_CD': nbhd_cd,
    'NBHD_DESC': nbhd_desc,
    'WARD': ward,
    'PROPTYPE_CD': proptype_cd,
    'PROPTYPE_DESC': proptype_desc,
    'ADJ_SALE_PRICE': adj_sale_price,
    'GROSS_SALE_PRICE': gross_sale_price,
    'DOWNPAYMENT': downpayment,
    'X': x_coord,
    'Y': y_coord,
}

# Initialize API record offset because of 2000 record limit per API call
resultOffset_num = 0
resultRecordCount = str(2000)

# Base URL for the Open Minneapolis database of home sales in Minneapolis
base_url = 'https://services.arcgis.com/afSMGVsC7QlRK1kZ/arcgis/rest/services/Property_Sales_2015_to_2019/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json'

# Make successive API calls to grab up to 2000 records each time (the total was
# 41,053 records when this was written). Keep paging until the service returns
# an empty page instead of stopping at a hard-coded offset, so no records are
# silently dropped if the dataset grows and no useless requests are made once
# the data is exhausted.
while True:

    # Dynamically adjust the offset parameter of the API call and create the next target URL
    resultOffset = str(resultOffset_num)
    pagination_url = f'&resultOffset={resultOffset}&resultRecordCount={resultRecordCount}'
    target_url = f'{base_url}{pagination_url}'

    # Make the API call; fail loudly on HTTP errors and never hang forever
    response = requests.get(target_url, timeout=60)
    response.raise_for_status()
    home_sales_data = response.json()

    # A failed query can come back as a JSON body with an 'error' member and
    # no 'features'; report it directly rather than as a KeyError below.
    if 'error' in home_sales_data:
        raise RuntimeError(f'Home sales API returned an error for {target_url}: {home_sales_data["error"]}')

    # Set the number of home sales in this page; an empty page means we are done
    features = home_sales_data.get('features', [])
    num_houses = len(features)
    if num_houses == 0:
        break

    # Fill lists of key data
    for feature in features:
        attributes = feature['attributes']
        for field_name, target_list in field_targets.items():
            target_list.append(attributes[field_name])

    # Advance by the number of records actually received, so nothing is
    # skipped if the server returns fewer records than requested.
    resultOffset_num = resultOffset_num + num_houses


In [1]:
# Build the master table of all MSP home sales, one row per sale, keyed by sale_id
sales_column_names = [
    'sale_id', 'sale_date', 'formatted_address', 'land_sale',
    'community_cd', 'community_desc', 'nbhd_cd', 'nbhd_desc', 'ward',
    'proptype_cd', 'proptype_desc', 'adj_sale_price', 'gross_sale_price',
    'downpayment', 'x_coord', 'y_coord',
]
sales_rows = zip(
    sale_id, sale_date, formatted_address, land_sale,
    community_cd, community_desc, nbhd_cd, nbhd_desc, ward,
    proptype_cd, proptype_desc, adj_sale_price, gross_sale_price,
    downpayment, x_coord, y_coord,
)
msp_home_sales_df = pd.DataFrame(sales_rows, columns=sales_column_names)
msp_home_sales_df = msp_home_sales_df.set_index('sale_id')

# Lookup table: community code -> community description
# (unique pairs, sorted by code, indexed by code)
community_df = (
    pd.DataFrame(zip(community_cd, community_desc), columns=['community_cd', 'community_desc'])
    .drop_duplicates()
    .sort_values(['community_cd'])
    .set_index('community_cd')
)

# Lookup table: neighborhood code -> neighborhood description
nbhd_df = (
    pd.DataFrame(zip(nbhd_cd, nbhd_desc), columns=['nbhd_cd', 'nbhd_desc'])
    .drop_duplicates()
    .sort_values(['nbhd_cd'])
    .set_index('nbhd_cd')
)

# Lookup table: property type code -> property type description
proptype_df = (
    pd.DataFrame(zip(proptype_cd, proptype_desc), columns=['proptype_cd', 'proptype_desc'])
    .drop_duplicates()
    .sort_values(['proptype_cd'])
    .set_index('proptype_cd')
)


NameError: name 'pd' is not defined

In [6]:
# Clean up the home sales data and write the result to clean_msp_home_sales.csv.
# A sale is kept only if it passes every filter below.

# Eliminate very low and very high sale prices (bounds are exclusive)
price_in_range = (
    (msp_home_sales_df['adj_sale_price'] > 10000)
    & (msp_home_sales_df['adj_sale_price'] < 5000000)
)

# Eliminate non-residential and odd property types.
# NOTE: 'RZ' and 'RM' must be separate entries; without the comma Python joins
# adjacent string literals into the single code 'RZRM', which drops both types.
proptype_allowed = ['R', 'X', 'DB', 'Y', 'RZ', 'RM', 'RL', 'TP']
proptype_ok = msp_home_sales_df['proptype_cd'].isin(proptype_allowed)

# Eliminate land-only sales
not_land_sale = msp_home_sales_df['land_sale'] == 'NO'

# Take an explicit copy so the column edits below act on an independent frame
# rather than on a slice of msp_home_sales_df (avoids SettingWithCopyWarning).
clean_msp_home_sales_df = msp_home_sales_df.loc[price_in_range & proptype_ok & not_land_sale].copy()

# Delete unneeded column (every remaining row has land_sale == 'NO')
del clean_msp_home_sales_df['land_sale']

# Format the sale date: the values are parsed as epoch timestamps in milliseconds
clean_msp_home_sales_df['sale_date'] = pd.to_datetime(clean_msp_home_sales_df['sale_date'], unit='ms')

clean_msp_home_sales_df.to_csv('clean_msp_home_sales.csv')