In [None]:
# Objectives:
# - Only 2019 data
# - Rename each 2019 data column (pop, rent, crime)
# - Create column with city, state (abbreviation?)
# - Column with state
# - Make table for each csv
# - Join 3 Db on city, state (abbreviation?) column
# - Check for duplicate cities and drop
# - Drop any cities that lack pop, rent, crime data
# - Bin population, rent, and crime data
# - Vectorize data
# - Train nearest neighbors model on city/state, pop, rent, and crime data
# - Make model into a function
# - Use function to make a recommendation based on population, rent, crime rate
# - Check to see if recommendation matches well with data. If so:
# - Pickle the model and it is ready to be put into API and tested with Web/iOS
# - Once these steps are completed and working, we will also incorporate walkability and livability score in Release 2.
# - When walkability and livability scores are also included and working well in the model, we welcome and further additions to the model, granted the data is from 2019 (otherwise we can include a disclaimer, or we push all of the data used back to 2018, for example, as long as the data all comes from the same year)

In [13]:
# Imports

import pandas as pd
import numpy as np

In [40]:
# Load and inspect the population data

POPULATION_CSV = "population.csv"  # relative path, resolved against the working directory
population = pd.read_csv(POPULATION_CSV)
population

Unnamed: 0,Location,Census,Estimates Base,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019 Population
0,"Abbeville city, Alabama",2688,2705,2699,2694,2643,2628,2608,2600,2584,2575,2571,2560
1,"Adamsville city, Alabama",4522,4506,4500,4493,4471,4449,4420,4390,4356,4327,4308,4281
2,"Addison town, Alabama",758,754,751,750,743,742,739,734,731,726,723,718
3,"Akron town, Alabama",356,356,355,347,347,343,338,339,333,332,331,328
4,"Alabaster city, Alabama",30352,31112,31209,31375,31684,31980,32182,32772,33017,33275,33413,33487
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19497,"Wamsutter town, Wyoming",451,451,450,453,462,487,508,499,493,483,474,467
19498,"Wheatland town, Wyoming",3627,3625,3622,3620,3626,3622,3642,3645,3587,3549,3527,3462
19499,"Worland city, Wyoming",5487,5487,5487,5436,5419,5419,5328,5332,5263,5158,5071,5024
19500,"Wright town, Wyoming",1807,1807,1810,1812,1864,1860,1856,1885,1857,1760,1754,1753


In [41]:
# Drop the census baseline columns and every yearly estimate before 2019 from the
# population data (or can create a copy with just location and 2019 population data).
# errors="ignore" keeps the cell safe to re-run: `population` is reassigned from
# itself, so without it a second run raises KeyError on the already-removed columns.

unused_population_columns = [
    'Census', 'Estimates Base',
    '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',
]
population = population.drop(columns=unused_population_columns, errors="ignore")
population

# In Location column, after the name of the town or city it contains "town" or "city". Wrangle this out.

Unnamed: 0,Location,2019 Population
0,"Abbeville city, Alabama",2560
1,"Adamsville city, Alabama",4281
2,"Addison town, Alabama",718
3,"Akron town, Alabama",328
4,"Alabaster city, Alabama",33487
...,...,...
19497,"Wamsutter town, Wyoming",467
19498,"Wheatland town, Wyoming",3462
19499,"Worland city, Wyoming",5024
19500,"Wright town, Wyoming",1753


In [42]:
# Create new column to specify if Location is city or town.
# The place type is the lowercase word directly before the comma (a parenthesised
# qualifier between the word and the comma is tolerated, should the data have one).
# Anchoring the pattern there stops it from matching letters inside the place name
# itself: "Johnstown city, ..." must yield "city", not the "town" in "Johnstown".
# Locations whose place type is neither "city" nor "town" get NaN.

place_type_pattern = r" (city|town)(?: \([^)]*\))?,"
# expand=False returns a Series (one capture group) instead of a one-column DataFrame
population["Town or City"] = population["Location"].str.extract(place_type_pattern, expand=False)
population

Unnamed: 0,Location,2019 Population,Town or City
0,"Abbeville city, Alabama",2560,city
1,"Adamsville city, Alabama",4281,city
2,"Addison town, Alabama",718,town
3,"Akron town, Alabama",328,town
4,"Alabaster city, Alabama",33487,city
...,...,...,...
19497,"Wamsutter town, Wyoming",467,town
19498,"Wheatland town, Wyoming",3462,town
19499,"Worland city, Wyoming",5024,city
19500,"Wright town, Wyoming",1753,town


In [44]:
# Remove the " city" / " town" place-type suffix from the Location column so that
# the column can be used to join with other data.
# Only the suffix that sits directly before the comma (or before a parenthesised
# qualifier that precedes the comma) is removed. A plain substring replacement
# would also cut " town" out of words such as " township".

place_type_suffix = r" (?:city|town)(?=(?: \([^)]*\))?,)"
population["Location"] = population["Location"].str.replace(place_type_suffix, "", regex=True)

population

# Consider adding just a State column, having City and State columns separate

Unnamed: 0,Location,2019 Population,Town or City
0,"Abbeville, Alabama",2560,city
1,"Adamsville, Alabama",4281,city
2,"Addison, Alabama",718,town
3,"Akron, Alabama",328,town
4,"Alabaster, Alabama",33487,city
...,...,...,...
19497,"Wamsutter, Wyoming",467,town
19498,"Wheatland, Wyoming",3462,town
19499,"Worland, Wyoming",5024,city
19500,"Wright, Wyoming",1753,town


In [49]:
# Load and inspect the rental rates data

RENTAL_RATES_CSV = "rental_rates.csv"  # relative path, resolved against the working directory
rent = pd.read_csv(RENTAL_RATES_CSV)
rent

Unnamed: 0,RegionID,RegionName,SizeRank,MsaName,2014-01,2014-02,2014-03,2014-04,2014-05,2014-06,...,2020-02,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11
0,61639,10025,1,"New York, NY",3012.0,3025.0,3037.0,3049.0,3062.0,3074.0,...,3268.0,3234.0,3199.0,3162.0,3125.0,3088.0,3048.0,3008.0,2968.0,2925.0
1,84654,60657,2,"Chicago, IL",1588.0,1594.0,1599.0,1605.0,1610.0,1615.0,...,1834.0,1829.0,1824.0,1818.0,1813.0,1807.0,1801.0,1795.0,1788.0,1781.0
2,61637,10023,3,"New York, NY",3114.0,3123.0,3131.0,3140.0,3148.0,3156.0,...,3307.0,3275.0,3244.0,3211.0,3178.0,3144.0,3108.0,3072.0,3035.0,2997.0
3,91982,77494,4,"Houston, TX",1759.0,1763.0,1766.0,1770.0,1773.0,1776.0,...,1775.0,1777.0,1780.0,1782.0,1785.0,1788.0,1791.0,1794.0,1796.0,1799.0
4,84616,60614,5,"Chicago, IL",1740.0,1745.0,1750.0,1755.0,1759.0,1764.0,...,2023.0,2017.0,2010.0,2003.0,1995.0,1988.0,1979.0,1970.0,1962.0,1952.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,62321,11976,9253,"New York, NY",63788.0,,,,,,...,49083.0,48860.0,48636.0,,,,,,,47911.0
3186,58624,2110,9469,"Boston, MA",4113.0,4105.0,4097.0,4089.0,,4077.0,...,4510.0,,,4350.0,4294.0,4239.0,4180.0,4121.0,4062.0,4002.0
3187,66128,20004,9592,"Washington, DC",,,2295.0,2304.0,,2323.0,...,2496.0,2494.0,2492.0,2490.0,2487.0,2484.0,2481.0,2478.0,2475.0,2471.0
3188,399647,80951,9634,"Colorado Springs, CO",,1252.0,1256.0,1260.0,1265.0,1269.0,...,,1656.0,1662.0,1669.0,1676.0,1683.0,1690.0,1697.0,1704.0,1712.0


In [38]:
#rent = rent.drop(['RegionID', 'RegionName', 'SizeRank', '2014-01', '2014-02', '2014-03', '2014-04', ...], axis=1)

In [50]:
# Work on a copy that holds only the MsaName and 2019-12 columns, and give both
# columns the names used by the rest of the notebook

rent_column_names = {"MsaName": "Location", "2019-12": "2019 Rental Rates"}
rent = (
    rent.loc[:, list(rent_column_names)]
    .copy()
    .rename(columns=rent_column_names)
)
rent

Unnamed: 0,Location,2019 Rental Rates
0,"New York, NY",3311.0
1,"Chicago, IL",1838.0
2,"New York, NY",3344.0
3,"Houston, TX",1772.0
4,"Chicago, IL",2028.0
...,...,...
3185,"New York, NY",50175.0
3186,"Boston, MA",4585.0
3187,"Washington, DC",2495.0
3188,"Colorado Springs, CO",1640.0


In [51]:
# Replace abbreviations with full state names.
# A single lookup table and one regex pass replace the former chain of plain
# substring replacements. The \b word boundaries ensure that only a standalone
# two-letter code (e.g. the "NY" in "New York, NY") is expanded; the same two
# letters inside a longer word are left untouched.

STATE_NAMES = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',  # kept from the original mapping; not a state code - TODO confirm it is needed
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}

# The keys are plain letters, so they can be joined into an alternation without escaping
state_abbreviation_pattern = r"\b(?:" + "|".join(STATE_NAMES) + r")\b"

# A callable replacement looks up each matched code; missing (NaN) locations stay NaN
rent["Location"] = rent["Location"].str.replace(
    state_abbreviation_pattern,
    lambda match: STATE_NAMES[match.group(0)],
    regex=True,
)
rent

Unnamed: 0,Location,2019 Rental Rates
0,"New York, New York",3311.0
1,"Chicago, Illinois",1838.0
2,"New York, New York",3344.0
3,"Houston, Texas",1772.0
4,"Chicago, Illinois",2028.0
...,...,...
3185,"New York, New York",50175.0
3186,"Boston, Massachusetts",4585.0
3187,"Washington, District of Columbia",2495.0
3188,"Colorado Springs, Colorado",1640.0


In [52]:
# Load and inspect the crime rates data (need to replicate state for every city)

CRIME_RATES_CSV = "crime_rates.csv"  # relative path, resolved against the working directory
crime = pd.read_csv(CRIME_RATES_CSV)
crime

Unnamed: 0,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson
0,Alabama,Hoover,85670,114,4.0,15,27,68,1922,128,1694,100,2
1,Alaska,Anchorage,287731,3581,32.0,540,621,2388,12261,1692,9038,1531,93
2,Alaska,Bethel,6544,130,1.0,47,3,79,132,20,84,28,12
3,Alaska,Bristol Bay Borough,852,2,0.0,0,0,2,20,5,8,7,0
4,Alaska,Cordova,2150,0,0.0,0,0,0,7,1,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8100,Wyoming,Sheridan,17895,9,0.0,4,0,5,369,75,278,16,3
8101,Wyoming,Thermopolis,2830,13,0.0,0,0,13,34,7,22,5,0
8102,Wyoming,Torrington,6709,13,0.0,4,1,8,48,8,40,0,0
8103,Wyoming,Wheatland,3544,7,0.0,1,0,6,72,24,45,3,0


In [53]:
# Build a "City, State" Location column so the crime data can be joined with the
# other data on this column

crime['Location'] = crime['City'].str.cat(crime['State'], sep=', ')
crime

# Consider adding Alabama crime data from 2018

Unnamed: 0,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson,Location
0,Alabama,Hoover,85670,114,4.0,15,27,68,1922,128,1694,100,2,"Hoover, Alabama"
1,Alaska,Anchorage,287731,3581,32.0,540,621,2388,12261,1692,9038,1531,93,"Anchorage, Alaska"
2,Alaska,Bethel,6544,130,1.0,47,3,79,132,20,84,28,12,"Bethel, Alaska"
3,Alaska,Bristol Bay Borough,852,2,0.0,0,0,2,20,5,8,7,0,"Bristol Bay Borough, Alaska"
4,Alaska,Cordova,2150,0,0.0,0,0,0,7,1,6,0,0,"Cordova, Alaska"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8100,Wyoming,Sheridan,17895,9,0.0,4,0,5,369,75,278,16,3,"Sheridan, Wyoming"
8101,Wyoming,Thermopolis,2830,13,0.0,0,0,13,34,7,22,5,0,"Thermopolis, Wyoming"
8102,Wyoming,Torrington,6709,13,0.0,4,1,8,48,8,40,0,0,"Torrington, Wyoming"
8103,Wyoming,Wheatland,3544,7,0.0,1,0,6,72,24,45,3,0,"Wheatland, Wyoming"


In [None]:
# When wrangling is done here, combine tables and push data to PG DB