In [1]:
# MACHINE LEARNNG (in upcoming notebook)
# - Vectorize data
# - Train nearest neighbors model on city/state, pop, rent, and crime data
# - Make model into a function
# - Use function to make a recommendation of Location based on population, rent, crime rate
# - Check to see if recommendation matches well with data. If so:
# - Pickle the model and it is ready to be put into API and tested with Web/iOS
# - Once these steps are completed and working, we will also incorporate walkability and livability score in Release 2.
# - When walkability and livability scores are also included and working well in the model, we welcome and further additions to the model, granted the data is from 2019 (otherwise we can include a disclaimer, or we push all of the data used back to 2018, for example, as long as the data all comes from the same year)

# STRETCH GOALS
# - add more data that fit team's user stories
# - attempt forecasting using data from 2010-2020
# - try fb prophet model among other time series models and techniques
# - be in conversation with engineers

In [2]:
import pandas as pd

# (prcb is an abbreviation for population_rent_crime_bins)

prcb = pd.read_csv("pop_rent_crime_bins.csv")
prcb.shape

(1626, 25)

In [3]:
prcb.head()

# We have multiple rows from the same city because there were multiple rental rates based on numerous locations within each city. We need to decide how to handle this. Averaging the rent values in a new column and getting rid of duplicates is one proposed solution.

Unnamed: 0.1,Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,...,Larceny-theft,Motor vehicle theft,Arson,Crime Rate,Urban Population by City Size Categories,Urban Population by City Size Ranges,Rental Rate Categories,Rental Rate Ranges,Crime Rate Categories,Crime Rate Ranges
0,0,"Phoenix, Arizona",1680992,city,1424,Arizona,Phoenix,1688722,11803,131,...,39427,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135"
1,1,"Phoenix, Arizona",1680992,city,1522,Arizona,Phoenix,1688722,11803,131,...,39427,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135"
2,2,"Phoenix, Arizona",1680992,city,1461,Arizona,Phoenix,1688722,11803,131,...,39427,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135"
3,3,"Phoenix, Arizona",1680992,city,1549,Arizona,Phoenix,1688722,11803,131,...,39427,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135"
4,4,"Phoenix, Arizona",1680992,city,1611,Arizona,Phoenix,1688722,11803,131,...,39427,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Higher Rent,"$1,549-1,891.50",Higher Crime,"107,943-111,135"


In [4]:
prcb.isna().sum()

# Crime Rate Categories and Crime Rate Ranges have 30 null values, they must not have fit into a bin. These will be dropped for now but can be investigated further.

Unnamed: 0                                   0
Location                                     0
2019 Population                              0
Town or City                                 0
2019 Rental Rates                            0
State                                        0
City                                         0
Population                                   0
Violent crime                                0
Murder and nonnegligent manslaughter         0
Rape                                         0
Robbery                                      0
Aggravated assault                           0
Property crime                               0
Burglary                                     0
Larceny-theft                                0
Motor vehicle theft                          0
Arson                                        0
Crime Rate                                   0
Urban Population by City Size Categories     0
Urban Population by City Size Ranges         0
Rental Rate C

In [5]:
# Unique number of Locations count

prcb['Location'].nunique()

68

In [6]:
# Drop the null values found in Crime Rate to see how it affects shape, currently at (1626, 25)

prcb = prcb.dropna()

In [7]:
# Unique number of Locations count

prcb['Location'].nunique()

# We only lose one Location

67

In [10]:
# Imports for ML

import re
import string
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [14]:
# Combine all text columns into one:

# Urban Population by City Size Categories,	Rental Rate Categories, Crime Rate Categories

prcb['population_rent_crime'] = prcb["Urban Population by City Size Categories"] + ' ' + prcb["Rental Rate Categories"] + ' ' + prcb['Crime Rate Categories']

In [15]:
# Verify changes

prcb.head()

Unnamed: 0.1,Unnamed: 0,Location,2019 Population,Town or City,2019 Rental Rates,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,...,Motor vehicle theft,Arson,Crime Rate,Urban Population by City Size Categories,Urban Population by City Size Ranges,Rental Rate Categories,Rental Rate Ranges,Crime Rate Categories,Crime Rate Ranges,population_rent_crime
0,0,"Phoenix, Arizona",1680992,city,1424,Arizona,Phoenix,1688722,11803,131,...,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135",Large Metropolitan Area Average Rent Higher Crime
1,1,"Phoenix, Arizona",1680992,city,1522,Arizona,Phoenix,1688722,11803,131,...,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135",Large Metropolitan Area Average Rent Higher Crime
2,2,"Phoenix, Arizona",1680992,city,1461,Arizona,Phoenix,1688722,11803,131,...,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135",Large Metropolitan Area Average Rent Higher Crime
3,3,"Phoenix, Arizona",1680992,city,1549,Arizona,Phoenix,1688722,11803,131,...,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Average Rent,"$1,294.50-1,549",Higher Crime,"107,943-111,135",Large Metropolitan Area Average Rent Higher Crime
4,4,"Phoenix, Arizona",1680992,city,1611,Arizona,Phoenix,1688722,11803,131,...,7076,201,108038.919372,Large Metropolitan Area,"1,500,000 <",Higher Rent,"$1,549-1,891.50",Higher Crime,"107,943-111,135",Large Metropolitan Area Higher Rent Higher Crime


In [16]:
# Instantiate Vectorizer 

tfidf = TfidfVectorizer(stop_words = 'english',ngram_range = (1,2),max_features = 2000)

In [17]:
# Instantiate Document-Term Matrix

dtm = tfidf.fit_transform(prcb['population_rent_crime'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [18]:
# Instantiate Nearest Neighbors Model

nn = NearestNeighbors(n_neighbors=4, algorithm='kd_tree')

In [19]:
# Fit Nearest Neighbors Model

nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', n_neighbors=4)

In [22]:
# View Feature Matrix as DataFrame

print(dtm.shape)
dtm.head()

(1596, 38)


Unnamed: 0,area,area average,area higher,area highest,area low,average,average crime,average rent,crime,higher,...,rent highest,rent lowest,size,size urban,small,small urban,town,town low,urban,urban area
0,0.147387,0.350478,0.0,0.0,0.0,0.278346,0.0,0.350478,0.147295,0.244404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.147387,0.350478,0.0,0.0,0.0,0.278346,0.0,0.350478,0.147295,0.244404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.147387,0.350478,0.0,0.0,0.0,0.278346,0.0,0.350478,0.147295,0.244404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.147387,0.350478,0.0,0.0,0.0,0.278346,0.0,0.350478,0.147295,0.244404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.140418,0.0,0.333906,0.0,0.0,0.0,0.0,0.0,0.14033,0.465696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
import pickle

# Dump the trained classifier (nn) with Pickle

nn_pickle_filename = 'nn_model.pkl'
pickled_model = open(nn_pickle_filename, 'wb')  # Open the file to save as pkl file
pickle.dump(nn, pickled_model)
pickled_model.close() # Close the pickle instances

In [24]:
# Load the saved model

model_pkl = open(nn_pickle_filename, 'rb')
NN_model = pickle.load(model_pkl)
print ("Loaded model :: ", NN_model)  # print to verify

Loaded model ::  NearestNeighbors(algorithm='kd_tree', n_neighbors=4)


In [25]:
# Dump the trained classifier (tfidf) with Pickle

tfidf_pickle_filename = 'tfidf.pkl'
pickled_tfidf = open(tfidf_pickle_filename, 'wb')  # Open the file to save as pkl file
pickle.dump(tfidf, pickled_tfidf)
pickled_tfidf.close() # Close the pickle instances

In [26]:
# Loading the saved model

tfidf_pkl = open(tfidf_pickle_filename , 'rb')
tfidf_model = pickle.load(tfidf_pkl)
print ("Loaded model :: ", tfidf_model)  # print to verify

Loaded model ::  TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')


In [46]:
#TypeError: Object of type int64 is not JSON serializable, so convert to ints below in the function

prcb.dtypes

Unnamed: 0                                    int64
Location                                     object
2019 Population                               int64
Town or City                                 object
2019 Rental Rates                             int64
State                                        object
City                                         object
Population                                    int64
Violent crime                                 int64
Murder and nonnegligent manslaughter          int64
Rape                                          int64
Robbery                                       int64
Aggravated assault                            int64
Property crime                                int64
Burglary                                      int64
Larceny-theft                                 int64
Motor vehicle theft                           int64
Arson                                         int64
Crime Rate                                  float64
Urban Popula

In [47]:
# Recommend Function

import json


try:

    def recommend(user_input):
        temp_df = NN_model.kneighbors(tfidf_model.transform([user_input]).todense())[1]
    

    #print(temp_df)

    #TypeError: Object of type int64 is not JSON serializable

        for i in range(4):
            info = prcb.iloc[temp_df[0][i]]['Location']
            info_pop = prcb.iloc[temp_df[0][i]]['2019 Population']
            info_town_or_city = prcb.iloc[temp_df[0][i]]['Town or City']
            info_rent = prcb.iloc[temp_df[0][i]]['2019 Rental Rates']
            info_state = prcb.iloc[temp_df[0][i]]['State']
            info_city = prcb.iloc[temp_df[0][i]]['City']
            info_population = prcb.iloc[temp_df[0][i]]['Population']
            info_violent_crime = prcb.iloc[temp_df[0][i]]['Violent crime']
            info_murder = prcb.iloc[temp_df[0][i]]['Murder and nonnegligent manslaughter']
            info_vehicle_theft = prcb.iloc[temp_df[0][i]]['Motor vehicle theft']
            info_arson = prcb.iloc[temp_df[0][i]]['Arson']
            info_crime_rate = prcb.iloc[temp_df[0][i]]['Crime Rate']
            info_urb_pop_cat = prcb.iloc[temp_df[0][i]]['Urban Population by City Size Categories']
            info_urb_pop_rang = prcb.iloc[temp_df[0][i]]['Urban Population by City Size Ranges']
            info_rent_cat = prcb.iloc[temp_df[0][i]]['Rental Rate Categories']
            info_rent_rang = prcb.iloc[temp_df[0][i]]['Rental Rate Ranges']
            info_crime_cat = prcb.iloc[temp_df[0][i]]['Crime Rate Categories']
            info_crime_rang = prcb.iloc[temp_df[0][i]]['Crime Rate Categories']
            
            print(json.dumps(info))
            print(json.dumps(int(info_pop)))
            print(json.dumps(info_town_or_city))
            print(json.dumps(int(info_rent)))
            print(json.dumps(info_state))
            print(json.dumps(info_city))
            print(json.dumps(int(info_population))) 
            print(json.dumps(int(info_violent_crime)))
            print(json.dumps(int(info_murder)))
            print(json.dumps(int(info_vehicle_theft)))
            print(json.dumps(int(info_arson)))
            print(json.dumps(int(info_crime_rate)))
            print(json.dumps(info_urb_pop_cat)) 
            print(json.dumps(info_urb_pop_rang))
            print(json.dumps(info_rent_cat)) 
            print(json.dumps(info_rent_rang))
            print(json.dumps(info_crime_cat)) 
            print(json.dumps(info_crime_rang))
            
            # This is for the data engineers, the return does not work in jupyter lab.  
            # Should work in VScode, or elsewhere for production.
    
            #return json.dumps(info))
            #return json.dumps(int(info_pop)))
            #return json.dumps(info_town_or_city))
            #return json.dumps(int(info_rent)))
            #return json.dumps(info_state))
            #return json.dumps(info_city))
            #return json.dumps(int(info_population))) 
            #return json.dumps(int(info_violent_crime)))
            #return json.dumps(int(info_murder)))
            #return json.dumps(int(info_vehicle_theft)))
            #return json.dumps(int(info_arson)))
            #return json.dumps(int(info_crime_rate)))
            #return json.dumps(info_urb_pop_cat)) 
            #return json.dumps(info_urb_pop_rang))
            #return json.dumps(info_rent_cat)) 
            #return json.dumps(info_rent_rang))
            #return json.dumps(info_crime_cat)) 
            #return json.dumps(info_crime_rang))


        # Add all future column names

except Exception:
    pass

In [49]:
recommend('low crime low rent low population')

"Seattle, Washington"
753675
"city"
1239
"Washington"
"Seattle"
763706
4471
28
3645
98
110174
"Metropolitan Area"
"500,000-1,500,000"
"Low Rent"
"$581.99-1,294.50"
"Higher Crime"
"Higher Crime"
"Austin, Texas"
978908
"city"
1240
"Texas"
"Austin"
986062
3953
32
3028
103
108233
"Metropolitan Area"
"500,000-1,500,000"
"Low Rent"
"$581.99-1,294.50"
"Higher Crime"
"Higher Crime"
"Austin, Texas"
978908
"city"
1264
"Texas"
"Austin"
986062
3953
32
3028
103
108233
"Metropolitan Area"
"500,000-1,500,000"
"Low Rent"
"$581.99-1,294.50"
"Higher Crime"
"Higher Crime"
"Oklahoma City, Oklahoma"
655057
"city"
1246
"Oklahoma"
"Oklahoma City"
657890
4751
75
3790
96
109642
"Metropolitan Area"
"500,000-1,500,000"
"Low Rent"
"$581.99-1,294.50"
"Higher Crime"
"Higher Crime"
