In [228]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels as sm 
import plotly.express as px
import plotly.io as pio
from pandas_profiling import ProfileReport
from matplotlib import colors
import folium
import folium.plugins
import branca.colormap as cmp
from math import radians, cos, sin, asin, sqrt
# Plot defaults: seaborn 'darkgrid' style for seaborn/matplotlib figures.
sns.set_style('darkgrid')
# Default Plotly template; figures that pass their own `template=` override it.
pio.templates.default = 'seaborn'

In [229]:
# Read the listings CSV into the working DataFrame.
listings_path = '../data/canada/listings.csv'
df = pd.read_csv(listings_path)

In [230]:
df.info()
# 13,621 observations and 18 variables.
# Note: neighbourhood_group contains only null values (0 non-null entries).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13621 entries, 0 to 13620
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              13621 non-null  int64  
 1   name                            13617 non-null  object 
 2   host_id                         13621 non-null  int64  
 3   host_name                       13620 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   13621 non-null  object 
 6   latitude                        13621 non-null  float64
 7   longitude                       13621 non-null  float64
 8   room_type                       13621 non-null  object 
 9   price                           13621 non-null  int64  
 10  minimum_nights                  13621 non-null  int64  
 11  number_of_reviews               13621 non-null  int64  
 12  last_review                     

In [231]:
# neighbourhood_group holds no values at all, so the column is removed.
df = df.drop(columns=['neighbourhood_group'])

In [232]:
# The license column has 512 distinct values and is filled in for only 881 of the
# 13,621 listings, so it is dropped.
# (To re-check those figures, run `df['license'].nunique(dropna=False)` and
# `df['license'].notna().sum()` as the last line of a cell of their own; a bare
# expression in the middle of a cell is evaluated but never displayed.)
df = df.drop(columns=['license'])

In [233]:
# id and host_id are identifiers rather than quantities: they are not expected to
# be used in plots or arithmetic, so they are stored as strings.
df = df.astype({'id': str, 'host_id': str})
# Show the resulting column types.
df.dtypes

id                                 object
name                               object
host_id                            object
host_name                          object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
dtype: object

In [234]:
# Number of listings (counted by non-null id) priced at 4000 or more.
df.loc[df['price'] >= 4000, 'id'].count()

15

In [235]:
# Outlier filtering. The thresholds were picked from the `df.describe()` summary;
# run that in a cell of its own to inspect the statistics.
MAX_MINIMUM_NIGHTS = 11000  # the largest minimum_nights value looks like a data error
MAX_PRICE = 4000            # only listings priced below this amount are of interest
within_limits = (df['minimum_nights'] < MAX_MINIMUM_NIGHTS) & (df['price'] < MAX_PRICE)
# .copy() gives later cells an independent frame to add columns to.
df = df[within_limits].copy()



In [236]:
# How many distinct neighbourhoods are present in the listings?
neighbourhood_count = df['neighbourhood'].nunique()
print(neighbourhood_count)

33


In [237]:
# Prices vary widely; the histogram below shows a right-skewed distribution.
fig = px.histogram(
    df,
    x='price',
    template='ggplot2',
    title="Price Histogram",
    # `labels` only renames data columns. The y axis of a histogram is the
    # computed count, not a column, so its title is set via update_layout below.
    labels={'price': 'Price (CAD)'},
)

fig.update_layout(yaxis_title='Count')
fig.show()

In [238]:
# Minimum stay vs. number of reviews, limited to listings whose minimum stay is
# under 60 nights.
fig = px.scatter(
    data_frame=df[df['minimum_nights'] < 60],
    y='minimum_nights',
    x='number_of_reviews',
    template='ggplot2',
    opacity=0.2,
    labels={
        'minimum_nights': 'Minimum Nights',
        # The key must match the column name exactly, otherwise the axis keeps
        # the raw column name.
        'number_of_reviews': 'Number of Reviews'
    },
    title="Number of review vs. Minimum nights"
)
fig.show()

In [239]:
# Number of reviews against yearly availability, restricted to listings whose
# minimum stay is under 60 nights.
short_stay = df[df['minimum_nights'] < 60]
fig = px.scatter(
    short_stay,
    x='availability_365',
    y='number_of_reviews',
    opacity=0.15,
    template='ggplot2',
)
fig.show()

In [240]:
# Distribution of the minimum stay, limited to listings requiring at most 181 nights.
fig = px.histogram(
    df[df['minimum_nights'] <=181],
    x='minimum_nights',
    # 'count' is not a data column, so it cannot be renamed through `labels`;
    # the y-axis title is set with update_layout instead.
    labels={'minimum_nights': 'Minimum Nights '},
    title='Minimum Nights Histogram',
    template='ggplot2'
    )
fig.update_layout(yaxis_title='Count')
fig.show()

# Introducing the parks datasets

In [241]:
# Load the places-of-interest file and keep only fully populated rows.
# NOTE(review): dropna(how='any') discards a row when *any* column is missing,
# which may be why so few parks survive -- confirm this is intended.
df3 = pd.read_csv('../data/canada/lieux_d_interet.csv').dropna(how='any')
# Keep only the park-like places of interest.
park_types = ["Parc", "Jardin communautaire"]
df3 = df3[df3['Type'].isin(park_types)]
df3  # only 4 rows remain

Unnamed: 0,ID,Famille,Catégorie,Nom français,Nom court,Type,Numéro,rue,Étage,Bureau,Ville,Code postal,Arrondissement,Classification,Longitude,Latitude
528,566,Récréatif / sportif,Parc et autre espace vert,Parc et site archéologique des Saints-Anges,Parc des Saints-Anges,Parc,,,0,,Montréal,H8R 3Z7,LaSalle,niveau 2,-73.656784,45.424198
1085,1148,Récréatif / sportif,Parc et autre espace vert,Jardin communautaire de l’Institut universitai...,Jardin communautaire de l’Institut universitai...,Jardin communautaire,,Rue de Marseille et Rue du Trianon,0,,Montréal,,Mercier–Hochelaga-Maisonneuve,niveau 4,-73.533211,45.589813
1654,1777,Récréatif / sportif,Parc et autre espace vert,Jardin communautaire l’Églantier (biologique),Jardin communautaire l’Églantier,Jardin communautaire,,31e Avenue et Boulevard Rosemont,0,,Montréal,,Rosemont–La Petite-Patrie,niveau 4,-73.568429,45.565778
2650,2865,Récréatif / sportif,Parc et autre espace vert,Plateau de travail (Circuit Jardins),Plateau de travail,Jardin communautaire,1872.0,Rue Saint-André,0,,Montréal,,Ville-Marie,niveau 4,-73.563608,45.518556


## Now merging the new dataset with the original dataset.
I will add two new columns to the original dataset: the nearest park and the distance to the nearest park.

In [242]:
# (latitude, longitude) pair for every retained park, in df3 row order.
obs = list(zip(df3['Latitude'], df3['Longitude']))
# Number of parks available for the distance computation.
len(obs)

4

In [243]:
# Park coordinates as (latitude, longitude) tuples; cal_dist1 reads this global.
obs = [(park_lat, park_lon) for park_lat, park_lon in zip(df3['Latitude'], df3['Longitude'])]
def cal_dist1(temp, parks=None):
    """Find the park closest to a listing using the haversine great-circle distance.

    Parameters
    ----------
    temp : mapping or pandas.Series
        Must provide 'latitude' and 'longitude' in decimal degrees.
    parks : sequence of (latitude, longitude) pairs, optional
        Park coordinates in decimal degrees. When omitted, the module-level
        ``obs`` list is used, so ``df.apply(cal_dist1, axis=1)`` keeps working.

    Returns
    -------
    list
        ``[distance_km, "Park N"]`` where N is the 1-based position of the
        nearest park in ``parks`` (ties go to the earliest park).
        ``[0, "0"]`` is returned when no distance can be computed (no parks,
        or NaN coordinates).
    """
    if parks is None:
        parks = obs

    earth_radius_km = 6371  # use 3956 to obtain miles instead
    lat1 = radians(temp['latitude'])
    lon1 = radians(temp['longitude'])

    nearest_km = None
    nearest_idx = None
    for idx, (park_lat, park_lon) in enumerate(parks):
        lat2 = radians(park_lat)
        lon2 = radians(park_lon)
        # Haversine formula
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # Rounding can push `a` marginally above 1 for near-antipodal points,
        # which would make asin() raise. min(a, 1.0) clamps it while still
        # letting NaN propagate (argument order matters for NaN).
        c = 2 * asin(sqrt(min(a, 1.0)))
        # The radius is already in kilometres, so the arc length needs no
        # mile-to-kilometre conversion factor.
        km = c * earth_radius_km
        # `km == km` is False only for NaN; such distances are skipped.
        if km == km and (nearest_km is None or km < nearest_km):
            nearest_km, nearest_idx = km, idx

    if nearest_idx is None:
        return [0, "0"]
    return [nearest_km, "Park %d" % (nearest_idx + 1)]

# Evaluate every listing once, then unpack the [distance, label] pairs into
# two new columns aligned on the listing index.
nearest_pairs = df.apply(cal_dist1, axis=1)
nearest_frame = pd.DataFrame(nearest_pairs.tolist(), index=df.index)
df[['distance_to_nearest_park', 'nearest_park']] = nearest_frame
# Which park labels actually occur as a nearest park?
df['nearest_park'].unique()

array(['Park 4', 'Park 3', 'Park 1', 'Park 2'], dtype=object)

In [244]:
# Distribution of the distance from each listing to its nearest park.
# (For the numeric summary, run `df['distance_to_nearest_park'].describe()` as the
# last line of its own cell; a bare expression above other code is not displayed.)
fig = px.histogram(
    df,
    x='distance_to_nearest_park',
    labels={'distance_to_nearest_park': 'Distance to nearest park (km)'},
    title='Distance to nearest park distribution.',
    template='ggplot2'
)
fig.update_layout(yaxis_title='Count')
fig.show()

In [245]:
# Count listings per nearest park. Both axes come from one aggregated frame so
# every count stays paired with its own park label. (unique() returns labels in
# order of appearance while value_counts() sorts by frequency, so taking x and y
# from those two calls can mislabel the bars.)
park_counts = df.groupby('nearest_park').size().reset_index(name='listing_count')
fig = px.bar(
    park_counts,
    x='nearest_park',
    y='listing_count',
    title= "Number of Listing Near the Parks",
    labels={
        'listing_count': 'Number of Listing',
        'nearest_park': 'Park'
    },
    template='ggplot2'
)
fig.show()

In [246]:
# Per-neighbourhood summary (mean, std, count) of the distance to the nearest park.
df_neig_dis = df.groupby(['neighbourhood']).agg(
    {'distance_to_nearest_park': ['mean', 'std', 'count']}
)
# Keep the six neighbourhoods with the smallest mean distance.
park_distance_stats = df_neig_dis['distance_to_nearest_park']
df_neig_dis_sorted = park_distance_stats.sort_values(by='mean').head(6)
df_neig_dis_sorted.index

Index(['Le Plateau-Mont-Royal', 'Ville-Marie', 'Mercier-Hochelaga-Maisonneuve',
       'Anjou', 'Lachine', 'Saint-Léonard'],
      dtype='object', name='neighbourhood')

In [247]:


# Bar per neighbourhood: height is the mean distance to the nearest park,
# colour is the number of listings behind that mean.
axis_labels = {
    'mean': 'Mean distance of nearest park (km)',
    'neighbourhood': 'Neighourhoods',
}
fig = px.bar(
    data_frame=df_neig_dis_sorted,
    x=df_neig_dis_sorted.index,
    y='mean',
    color='count',
    labels=axis_labels,
    title="Mean distance of nearest park organized by neighbourhood.",
)
fig.show()

# The Map Code

In [155]:
# Centre the map on the mean listing coordinates (around Montreal).
map_center = [df['latitude'].mean(), df['longitude'].mean()]
map_osm = folium.Map(location=map_center, zoom_start=10)
temp = df  # alias; not used in the cells shown here
# Inject an HTML heading into the map page.
title_html = '''
             <h3 align="center" style="font-size:20px"><b>Concetration of listing and park locations.</b></h3>
             '''
map_osm.get_root().html.add_child(folium.Element(title_html))
# Heat layer built from every listing's (latitude, longitude) pair.
location = list(zip(df['latitude'], df['longitude']))
heat_layer = folium.plugins.HeatMap(location, min_opacity=0.09, max_opacity=0.5)
heat_layer.add_to(map_osm)

<folium.plugins.heat_map.HeatMap at 0x7fb536f2e9a0>

In [156]:
# Park coordinates as (latitude, longitude) pairs, in df3 row order.
obs = list(zip(df3['Latitude'], df3['Longitude']))
# One label per park ("Park 1", "Park 2", ...), generated from the number of
# parks so the list can never be shorter than `obs`.
park_list = ['Park %d' % n for n in range(1, len(obs) + 1)]

# One green tree marker per park; the popup shows the park label.
for el, park_label in zip(obs, park_list):
    folium.Marker(
        el,
        popup=park_label,
        icon=folium.Icon(icon='tree', prefix='fa', color='green'),
    ).add_to(map_osm)



In [157]:
# Display the map (listing heat layer plus park markers) as the cell output.
map_osm