<h1 style="text-align:center">ANALYZING THE SIMILARITY OF U.S. CITIES</h1>


<h3 style="text-align:center">ADAMM HOCKMAN</h3>
<h6 style="text-align:center">IBM DATA SCIENCE CERTIFICATION - COURSERA.ORG</h6>

<h4 style="text-align:center">25 MAY 2020</h4>

<h2>Table of Contents</h2>

<div class="alert alert-block alert-info" style="margin-top: 20px">
    <ol>
        <li>Load and process the primary dataset.</li>
        <li>Load and process the secondary dataset.</li>
        <li>Acquire Foursquare data.</li>
        <li><b>Clustering the cities.</b></li>
        <li>Exploratory Data Analysis.</li>
        <li>Inferential Statistics.</li>
        <li>Machine Learning Algorithms.</li>
    </ol>
</div>
<br>
<hr>

<h4>Import the packages we will need.</h4>

In [59]:
import numpy as np
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from pandas.io.json import json_normalize
import re
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
from geopy.geocoders import Nominatim
import us
import time
import folium
import requests # library to handle requests
from sklearn.cluster import KMeans 
from sklearn.datasets import make_blobs
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

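<h2>Clustering the cities.</h2>
<ul>
    <li>Use k-means.
        <ul>
            <li>Parameters: <code>n_clusters=5</code>, <code>random_state=0</code>; the features are standardized (z-scores), with latitude and longitude excluded.</li>
        </ul>
    </li>
</ul>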

In [54]:
# Load the combined city / state / venue-category table and index it by location name.
# 'City Latitude' and 'City Longitude' stay in the frame on purpose: normalize() leaves
# them out of the clustering features, but the map needs them to place the markers.
main_df = pd.read_csv('./data/final_dataframe.csv').set_index('Location')

def normalize(df, scale=3):
    """Standardise the clustering features to scaled z-scores.

    The 'City Latitude' and 'City Longitude' columns are removed first, so
    geographic position does not take part in the clustering. Every remaining
    column is centred on its mean, divided by its sample standard deviation
    and multiplied by ``scale``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table that contains 'City Latitude' and
        'City Longitude' columns.
    scale : float, optional
        Uniform factor applied to every z-score (default 3, the value that
        was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        New frame with the standardised features; ``df`` is left untouched.

    Raises
    ------
    KeyError
        If either coordinate column is missing from ``df``.
    """
    features = df.drop(columns=['City Latitude', 'City Longitude'])
    spread = features.std()
    # A constant column has zero spread; dividing by it would produce NaN and
    # make KMeans reject the whole matrix. Dividing by 1 instead maps such a
    # column to all zeros, i.e. it simply carries no information.
    spread = spread.where(spread != 0, 1)
    return ((features - features.mean()) / spread) * scale

def visualize_kmeans_clusters(main_df, kclusters):
    """Cluster the cities with k-means and plot them on a folium map.

    The features are standardised with ``normalize`` before fitting.
    ``main_df`` is modified in place (later cells use the result): its index
    is reset, so 'Location' becomes a regular column, and a 'Cluster Labels'
    column is inserted at position 0.

    Parameters
    ----------
    main_df : pandas.DataFrame
        City table with 'City Latitude' and 'City Longitude' columns. The
        location name is expected as the index named 'Location' (first call)
        or as a 'Location' column (repeat call on the same frame).
    kclusters : int
        Number of clusters to fit.

    Returns
    -------
    folium.Map
        Map with one circle marker per city, coloured by cluster.
    """
    # Make repeat calls safe. A previous call leaves 'Cluster Labels' and
    # 'Location' behind as columns; without undoing that, insert() below
    # raises on the duplicate column and the text column breaks normalize().
    if 'Cluster Labels' in main_df.columns:
        main_df.drop(columns='Cluster Labels', inplace=True)
    if 'Location' in main_df.columns:
        main_df.set_index('Location', inplace=True)

    main_normal_df = normalize(main_df)

    main_df.reset_index(inplace=True)

    # run k-means clustering (fixed seed so the labels are repeatable)
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(main_normal_df)

    # check cluster labels generated for the first rows of the dataframe
    print(kmeans.labels_[0:10])

    # add clustering labels
    main_df.insert(0, 'Cluster Labels', kmeans.labels_)
    print(main_df['Cluster Labels'].unique())

    # create map of US
    us_latitude = 39.50
    us_longitude = -98.35

    map_clusters = folium.Map(location=[us_latitude, us_longitude], zoom_start=3)

    # one evenly spaced rainbow colour per cluster
    colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    # add markers to the map
    for lat, lon, poi, cluster in zip(main_df['City Latitude'], main_df['City Longitude'], main_df['Location'], main_df['Cluster Labels']):
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        # Labels run from 0 to kclusters-1, so they index the palette
        # directly; the former `cluster-1` wrapped cluster 0 to the last colour.
        marker_color = rainbow[cluster]
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7).add_to(map_clusters)
    return map_clusters

In [55]:
# Number of groups k-means should split the cities into.
kclusters = 5

# Fit the clusters and build the map; this call also adds the 'Cluster Labels'
# column to main_df as a side effect.
map_clusters = visualize_kmeans_clusters(main_df=main_df, kclusters=kclusters)

# Leave the map as the last expression so the notebook renders it.
map_clusters

[4 4 4 4 4 4 4 4 4 4]
[4 1 2 0 3]


In [58]:
# Persist the table, now carrying the cluster labels, for later use.
# NOTE(review): the row index is written as an extra unnamed first column
# (the to_csv default, spelled out here); confirm that downstream readers
# expect it before changing this.
main_df.to_csv('./data/main_data_clusters.csv', index=True)

# Preview the first rows of the labelled table.
main_df.head()

Unnamed: 0,Cluster Labels,Location,City Population 2019,City Growth (% since 2010),City Latitude,City Longitude,State Population 2019,State Growth (% since 2010),State Birth Rate,State Death Rate,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,4,"Alabaster, Alabama",33487,107.299177,33.244281,-86.816377,4903185,1.024605,60.9,1092.1,...,0,0,0,0,0,2,0,0,0,0
1,4,"Albertville, Alabama",21711,102.429704,34.267594,-86.208867,4903185,1.024605,60.9,1092.1,...,0,0,0,0,0,1,0,0,0,0
2,4,"Anniston, Alabama",21287,92.826618,33.695381,-85.839842,4903185,1.024605,60.9,1092.1,...,0,0,0,0,0,0,0,0,0,0
3,4,"Athens, Alabama",27366,124.424843,34.828383,-86.917134,4903185,1.024605,60.9,1092.1,...,0,0,0,0,0,1,0,0,0,0
4,4,"Auburn, Alabama",66259,123.19234,32.535699,-85.486778,4903185,1.024605,60.9,1092.1,...,0,0,0,0,0,0,0,0,0,0


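<h4>Part 9: Train the MLR algorithm.</h4>
<ul>
    <li>Use the k-means cluster labels, together with the remaining features, as predictors of city growth.</li>
    <li>Fit a multiple linear regression (MLR) model:
        <ul>
            <li>Standardize the features and hold out 30% of the cities as a test set.</li>
            <li>Evaluate the fitted model on the test set using the mean squared error and the R2-score.</li>
        </ul>
    </li>
</ul>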

In [226]:
# NOTE: this redefines `normalize` from the clustering section. From this cell
# on, the name refers to the plain z-score version below, which drops no columns.
def normalize(df):
    """Return ``df`` standardised column by column to z-scores.

    Each column is centred on its mean and divided by its sample standard
    deviation. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table.

    Returns
    -------
    pandas.DataFrame
        Frame of the same shape holding the standardised values.
    """
    spread = df.std()
    # A constant column has zero spread; dividing by it would produce NaN,
    # which scikit-learn estimators reject. Dividing by 1 instead maps such a
    # column to all zeros.
    spread = spread.where(spread != 0, 1)
    df_norm = (df - df.mean()) / spread
    return df_norm

# Predictors: every column except the location name, the map coordinates and
# the regression target, standardised and converted to a plain NumPy matrix.
X = normalize(
    main_df.drop(
        columns=['Location', 'City Latitude', 'City Longitude', 'City Growth (% since 2010)']
    )
).to_numpy()

# Target: city growth since 2010.
y = main_df['City Growth (% since 2010)'].values

# Sanity check that both are NumPy arrays before handing them to scikit-learn.
print(type(X))
print(type(y))

# Preview the first five standardised feature rows.
X[0:5]

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


array([[-0.62445197, -0.20362749, -0.81340189, ..., -0.21451182,
        -0.17723363, -0.24034834],
       [-0.62445197, -0.24788653, -0.81340189, ..., -0.21451182,
        -0.17723363, -0.24034834],
       [-0.62445197, -0.08045688, -0.81340189, ..., -0.21451182,
        -0.17723363, -0.24034834],
       [-0.62445197,  0.45753694, -0.81340189, ..., -0.21451182,
        -0.17723363, -0.24034834],
       [-0.62445197, -0.22850065, -0.81340189, ..., -0.21451182,
        -0.17723363, -0.24034834]])

In [227]:
# Hold out 30% of the cities for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

# Fit an ordinary least-squares multiple linear regression on the training split.
mr_model = linear_model.LinearRegression()
mr_model.fit(X_train, y_train)

# Predict growth for the held-out cities.
y_hat = mr_model.predict(X_test)

print(len(y_hat))
# Both metrics are computed on the TEST split, and the first one is a mean of
# squared errors (MSE), not a residual sum of squares, so label them as such.
# NOTE(review): an astronomically large MSE or hugely negative R2 here would
# point to an ill-conditioned feature matrix (e.g. redundant columns), not a
# reporting error; confirm before trusting the model.
print("MLR MSE (test): %.2f" % np.mean((y_hat - y_test) ** 2))
print("MLR R2-score (test): %.2f" % r2_score(y_test,y_hat))

Train set: (1215, 556) (1215,)
Test set: (522, 556) (522,)
522
MLR Residual SS: 8057494092109578205640261632.00
MLR R2-score (train): -42394888127331788679806976.00


import matplotlib.pyplot as plt

# Draw a small line plot on a single-axis figure and write it straight to disk
# without displaying it.
# NOTE(review): the target path looks like a placeholder; confirm the real
# output location (the directory must exist for savefig to succeed).
fig, ax = plt.subplots()  # defaults to one row, one column
ax.plot([0, 1, 2], [10, 20, 3])
fig.savefig(fname='path/to/save/image/to.png')  # save the figure to file
plt.close(fig)  # release the figure once it has been written