# This Python Jupyter Notebook will be used for IBM Capstone Project Development

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Smoke test: confirms the kernel executes code before the analysis starts
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Battle of Neighborhoods - Exploring the Neighborhoods around Johns Hopkins Carey Business School

### In 2019, Baltimore ranked as the 4th most dangerous city in America, according to the financial news website 247wallst.com. Baltimore’s violent crime rate in 2018, 1,833.4 per 100,000 people, was nearly five times the national violent crime rate of 369 per 100,000 people. At the same time, properties in some of the safest areas of Baltimore remained overly expensive for both buyers and renters. The disconnect between the housing market and the public not only arouses social unease but also widens the economic gap between working classes.

### However, there is currently no accessible platform that helps students who will attend JHU Carey Business School in the near future choose which neighborhood to live in, or that suggests which restaurants to go to. This project gives a brief introduction to, and offers some insights into, the living environment of Inner Harbor, Baltimore, MD.

## 1. Analysis of crime data of Baltimore
### Crime data will be pulled from the database of the Baltimore Police Department. The data contains the date, time, geographical location, type, neighborhood, etc. of each reported crime. This data will show the crime rate in different neighborhoods of Baltimore.

In [3]:
# Url from Baltimore City Police Department
# (CSV download endpoint on data.baltimorecity.gov)
url = 'https://data.baltimorecity.gov/api/views/wsfq-mvij/rows.csv?accessType=DOWNLOAD'

In [None]:
# Load data from the website
# read_csv is given the URL directly, so this cell needs network access
df_crime = pd.read_csv(url)
# Preview the first rows of the raw data
df_crime.head()

In [None]:
# Only crime date and neighborhoods are needed for this project
# Indexing with a list of column names keeps just those two columns
df_crime = df_crime[['CrimeDate','Neighborhood']]
df_crime.head()

In [None]:
# Check the size of dataframe
# (rows, columns) before rows with missing values are dropped in the next cell
df_crime.shape

In [None]:
# Drop rows that are missing the crime date or the neighborhood.
# df_crime was produced by selecting columns from a larger frame, so the
# cleaned result is reassigned rather than dropped in place: in-place mutation
# of a selection is what triggers pandas' SettingWithCopyWarning.
df_crime = df_crime.dropna(axis=0)
# (rows, columns) remaining after the drop
df_crime.shape

In [None]:
# Check data types
df_crime.dtypes

In [None]:
# Group and count crime number by different neighborhoods
# count() tallies the non-null CrimeDate entries in each group; reset_index()
# turns the Neighborhood group key back into an ordinary column
crime_by_nei = df_crime.groupby(['Neighborhood']).count().reset_index()
crime_by_nei

In [None]:
# Rename count column and update the dataframe
# After the groupby the CrimeDate column holds per-neighborhood counts, not dates
crime_by_nei.rename(columns={'CrimeDate': "Count" }, inplace = True)
crime_by_nei.head()

In [None]:
# Clean up non-existing neighborhoods
# Removes the first three rows of the grouped frame by position, in place.
# NOTE(review): this assumes the three invalid neighborhood labels are the first
# three rows of the groupby output - confirm against the data.
# Because the rows are chosen by position, re-running this cell drops three more rows.
crime_by_nei.drop(crime_by_nei.index[[0,1,2]], inplace = True)
crime_by_nei

In [None]:
# Rank neighborhoods by their number of crimes
# Sorted in place, highest count first
crime_by_nei.sort_values(by=['Count'], inplace = True, ascending=False)
crime_by_nei

In [None]:
# Take the first 10 rows of data
# The frame is already sorted in descending order, so these are the 10 highest counts
crime_top10 = crime_by_nei.head(10)
crime_top10

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter

In [None]:
# Plot the top 10 most dangerous neighborhoods in Baltimore
# The colour list names one colour per bar: the sixth entry is red, all others black.
# NOTE(review): the red entry is presumably meant to highlight Inner Harbor - confirm
# that neighborhood is still in sixth position whenever the data is refreshed.
ax = crime_top10.plot.bar(x ='Neighborhood', y = 'Count',figsize = (12,8),rot=45,color = ['black', 'black','black','black','black','red','black','black','black','black'])
plt.title('Top 10 Most Dangerous Neighborhoods in Baltimore', fontsize = 20)
plt.ylabel('Total Number of Crimes Reported from Jan 2000 to Aug 2020', fontsize = 15)
plt.xlabel('Name of the Neighborhoods', fontsize = 15)

In [None]:
# Specifically looking at Inner Harbor
# The boolean mask keeps only rows whose Neighborhood is exactly 'INNER HARBOR'
crime_IH = df_crime[df_crime['Neighborhood'] == 'INNER HARBOR']
crime_IH.head()

In [None]:
# Check the number of cases that happened at Inner Harbor
# The first element of the shape tuple is the number of rows, i.e. reported crimes
crime_IH.shape

In [None]:
# Convert column CrimeDate to datetime format so we would be able to group by year or month.
# crime_IH is a filtered selection of df_crime, so a new frame is built with
# assign() instead of assigning a column onto the selection; the direct
# assignment is what raises pandas' SettingWithCopyWarning.
crime_IH = crime_IH.assign(CrimeDate=pd.to_datetime(crime_IH['CrimeDate']))
crime_IH.head()

In [None]:
# Set CrimeDate as the index for this dataframe
# The groupings below read the year and the month from this index
crime_IH.set_index('CrimeDate', inplace = True)
crime_IH.head()

In [None]:
# Create a dataframe of crime counts grouped by year
# (crime_IH holds only the Inner Harbor rows, so these are Inner Harbor counts,
# not counts for all neighborhoods in Baltimore)
BG=crime_IH.groupby([(crime_IH.index.year)]).count()
BG

In [None]:
# Create a second dataframe grouping by the month of the year (all years pooled together)
# Its count column keeps the name 'Neighborhood'; only BG is renamed in the next cell
AG=crime_IH.groupby([(crime_IH.index.month)]).count()
AG

In [None]:
# Rename columns
# After count() the Neighborhood column holds the number of crimes per year
BG.rename(columns={'Neighborhood': "Count" }, inplace = True)
BG

In [None]:
# Plot number of crimes based on year until Aug 2020
# reset_index() turns the year index into a 'CrimeDate' column so it can serve as the x axis
BG.reset_index().plot.bar(x = 'CrimeDate', y = 'Count', legend = False, figsize = (12,8),rot=45)
plt.title('Crimes Counts of Inner Harbor Annually  with August 2020', fontsize = 20)
plt.xlabel('Year', fontsize = 15)
plt.ylabel('Number of Crime Cases', fontsize = 15)

In [None]:
# Extrapolate the 2020 crime count to a full year.
# The 2020 data runs from January through August (8 months), so the observed
# count is scaled by 12/8. The 2020 row is addressed by its index label rather
# than by a hard-coded count value: matching on the value silently stops
# working when the source data is refreshed, and would also rescale any other
# year that happens to have the same count.
PARTIAL_YEAR = 2020
MONTHS_OBSERVED = 8
# Cast to float first because the scaled value is generally not a whole number
BG1 = BG.astype(float)
# Raises KeyError if the data contains no rows for PARTIAL_YEAR
BG1.loc[PARTIAL_YEAR, 'Count'] = BG1.loc[PARTIAL_YEAR, 'Count'] * 12 / MONTHS_OBSERVED
BG1

In [None]:
# Plot predicted number of crime in Inner Harbor annually until Dec 2020.
# The last bar (the most recent year, whose count was extrapolated) is drawn in
# red and every earlier year in black. The colour list is sized from the data
# instead of being hard-coded to seven bars, so it stays aligned if the number
# of years in the data changes.
bar_colors = ['black'] * (len(BG1) - 1) + ['red']
BG1.reset_index().plot.bar(x = 'CrimeDate', y = 'Count', legend = False, figsize = (12,8),rot=45, color = bar_colors)
plt.title('Crimes Counts of Inner Harbor Annually with 2020 Extrapolated until Dec', fontsize = 20)
plt.xlabel('Year', fontsize = 15)
plt.ylabel('Number of Crime Cases', fontsize = 15)

In [None]:
# import seaborn
import seaborn as sns

In [None]:
# Plot the number of crimes in Inner Harbor per month of the year (all years pooled)
# to support the extrapolation of the Sept-Dec 2020 crime count: the plot is used
# to check how crime counts behave towards the end of the year.

# rcParams['figure.figsize'] only applies to figures created after it is set,
# so it has to be assigned before the plot is drawn for this figure to use it.
plt.rcParams['figure.figsize']=(15,15)
# Turn the month index into a 'CrimeDate' column so it can be used as the x axis
AG.reset_index(inplace = True)
# AG's count column is still named 'Neighborhood' (only BG was renamed)
bx = sns.lineplot(x="CrimeDate", y="Neighborhood", data=AG)
plt.xlabel('Month', fontsize = 15)
plt.ylabel('Number of Crime Cases', fontsize = 15)
plt.title('Number of Crimes per Month of the Year', fontsize = 20)

In [None]:
# Import modules for map, cluster, and plotting modules
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium #if just opened this document 
import folium # map rendering library

print('Libraries imported.')

In [None]:
# Get the geographical coordinate of Inner Harbor, MD
# (the address geocoded here is also used as the centre of the venue search and the map)
address = '707 President St, Baltimore'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Inner Harbor are {}, {}.'.format(latitude, longitude))

In [None]:
# Foursquare API credentials and request settings.
# The credentials are no longer stored in the notebook source or printed to its
# output: they are read from the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET
# environment variables, and if a variable is unset the value is prompted for
# without echoing it.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20200604' # sent as the `v` parameter of the API request
LIMIT = 30 # sent as the `limit` parameter of the API request
print('Foursquare credentials loaded.')

In [None]:
# Parameters of the Foursquare venue search
# Keyword sent as the `query` parameter of the request
search_query = 'Restaurant'
# Sent as the `radius` parameter; presumably metres - confirm against the Foursquare API docs
radius = 600
print(search_query + ' .... OK!')

In [None]:
# Build the Foursquare venue-search URL.
# urlencode() percent-encodes every value, so a search term with spaces or other
# special characters still yields a valid URL; ',' is kept literal so the
# `ll` parameter stays in "lat,lng" form.
from urllib.parse import urlencode

endpoint = 'https://api.foursquare.com/v2/venues/search?'
query_params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'll': '{},{}'.format(latitude, longitude),
    'v': VERSION,
    'query': search_query,
    'radius': radius,
    'limit': LIMIT,
}
url = endpoint + urlencode(query_params, safe=',')
# Display the URL with the secret masked so it is not saved in the notebook output
endpoint + urlencode(dict(query_params, client_secret='***'), safe=',*')

In [None]:
# Query the Foursquare API and decode the JSON reply.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error response here instead of letting
# it surface later as a confusing KeyError when the reply is unpacked.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
# assign relevant part of JSON to venues
# (the venue records are nested under 'response' -> 'venues' in the reply)
venues = results['response']['venues']
# transform venues into a dataframe
# nested fields are flattened into dotted column names (the next cell selects the 'location.' ones)
dataframe = json_normalize(venues)
dataframe.head()

In [None]:
# Keep the venue name, its categories, every flattened 'location.*' column, and the id
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
# .copy() makes dataframe_filtered an independent frame, so the column assignment
# performed on it further down is not an assignment on a selection of `dataframe`
# (which can raise pandas' SettingWithCopyWarning).
dataframe_filtered = dataframe.loc[:, filtered_columns].copy()

# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The category list is read from ``row['categories']``; when that key is
    absent, ``row['venue.categories']`` is used instead.

    Args:
        row: a mapping-like record (for example a dataframe row) holding a
            list of category dicts, each with a ``'name'`` entry.

    Returns:
        The ``'name'`` of the first category, or ``None`` when the category
        list is empty or is ``None``.

    Raises:
        KeyError: if neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate instead of being swallowed.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# filter the category for each row
# (each row's list of category dicts is replaced by the name of its first category,
# or None when the list is empty)
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
# i.e. the part after the final '.', so a 'location.'-prefixed column loses its prefix
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

dataframe_filtered

In [None]:
# Draw an interactive map of the restaurants found near the JHU Carey Business School


def _circle(position, size, colour, text):
    """Build a filled circle marker at `position` with the given radius, colour and popup."""
    return folium.CircleMarker(
        position,
        radius=size,
        popup=text,
        color=colour,
        fill=True,
        fill_color=colour,
        fill_opacity=0.6,
    )


# the map is centred on the school's coordinates
venues_map = folium.Map(zoom_start=16, location=[latitude, longitude])

# the school itself: one larger red marker
_circle([latitude, longitude], 10, 'red', 'Johns Hopkins Carey Business School').add_to(venues_map)

# every restaurant: a smaller blue marker whose popup shows the venue category
for lat, lng, label in zip(dataframe_filtered['lat'], dataframe_filtered['lng'], dataframe_filtered['categories']):
    _circle([lat, lng], 5, 'blue', label).add_to(venues_map)

# display map
venues_map