In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql import types as T
from pyspark import SparkFiles
from textblob import TextBlob
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

In [2]:
# Build (or reuse) the notebook-wide Spark session on a local master.
# Parentheses replace the backslash continuations of the builder chain.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.files.overwrite", True)
    .getOrCreate()
)

Let's start by analyzing the Boston neighbourhoods.

In [3]:
# Read the June 2020 neighbourhood lookup, using the first CSV row as column names.
boston_neighbourhoods_df = (
    spark.read
    .option("header", True)
    .csv("data/Boston/2020/June/neighbourhoods.csv")
)
boston_neighbourhoods_df.toPandas()

Unnamed: 0,neighbourhood_group,neighbourhood
0,,Allston
1,,Back Bay
2,,Bay Village
3,,Beacon Hill
4,,Brighton
5,,Charlestown
6,,Chinatown
7,,Dorchester
8,,Downtown
9,,East Boston


We can see that, for Boston at least, there are no neighbourhood_groups, so let's drop the column.

In [4]:
# neighbourhood_group is empty in the rows displayed above, so remove it from the frame.
boston_neighbourhoods_df = boston_neighbourhoods_df.drop(
    'neighbourhood_group'
)
boston_neighbourhoods_df.toPandas()

Unnamed: 0,neighbourhood
0,Allston
1,Back Bay
2,Bay Village
3,Beacon Hill
4,Brighton
5,Charlestown
6,Chinatown
7,Dorchester
8,Downtown
9,East Boston


We will run the analysis only on the top neighbourhoods, i.e. those with the highest YoY percent change, which we selected from "http://insideairbnb.com/get-the-data.html".

In [5]:
# Neighbourhoods selected for the analysis. The list is turned into a
# single-column Spark frame and registered as the 'boston_neighbourhoods'
# temp view so that SQL queries can filter against it.
neighbourhoods_list = [
    "Back Bay",
    'South Boston',
    'South End',
    'Fenway',
    'Allston',
    'Dorchester',
    'Downtown',
]
boston_neighbourhoods_df = (
    spark.createDataFrame(neighbourhoods_list, StringType())
    .selectExpr("value as neighbourhood")
)
boston_neighbourhoods_df.createOrReplaceTempView('boston_neighbourhoods')
boston_neighbourhoods_df.toPandas()

Unnamed: 0,neighbourhood
0,Back Bay
1,South Boston
2,South End
3,Fenway
4,Allston
5,Dorchester
6,Downtown


Let's create a function that reads in the listings data and returns per-neighbourhood figures for a given year, such as the number of listings and the average price.

In [6]:
def neighbourhood_data_analysis_summary(file_path, year):
    """Summarise listings per selected neighbourhood for one yearly snapshot.

    Reads the listings CSV at ``file_path``, keeps only the rows whose
    ``neighbourhood`` appears in the ``boston_neighbourhoods`` temp view, and
    aggregates the number of listings and the mean price per neighbourhood.

    Args:
        file_path: Path to a listings CSV with a header row and (at least)
            ``neighbourhood`` and ``price`` columns.
        year: Value written into the ``year`` column of every returned row.

    Returns:
        A Spark DataFrame with the columns ``neighbourhood``,
        ``number_of_listings``, ``average_price`` (rounded to 2 decimals) and
        ``year``, ordered by ``number_of_listings`` descending.

    Notes:
        The ``boston_neighbourhoods`` temp view must be registered before this
        function is called. As a side effect the ``boston_listings_raw`` and
        ``boston_listings`` temp views are (re)registered; after the call
        ``boston_listings`` holds the filtered listings of this snapshot.
    """
    raw_listings_df = spark.read.csv(file_path, header=True)
    # The raw read gets its own view name. Re-registering 'boston_listings'
    # with a query that itself selects from 'boston_listings' can be rejected
    # as a recursive view on Spark 3.1+.
    raw_listings_df.createOrReplaceTempView('boston_listings_raw')
    boston_listings_df = spark.sql("""
        SELECT * FROM boston_listings_raw L
        WHERE L.neighbourhood IN 
        (SELECT neighbourhood FROM boston_neighbourhoods)
    """)
    boston_listings_df.createOrReplaceTempView('boston_listings')
    neighbourhood_data = spark.sql("""
        SELECT neighbourhood, count(neighbourhood) as number_of_listings, ROUND(avg(price), 2) as average_price
        FROM boston_listings
        GROUP BY neighbourhood
        ORDER BY number_of_listings desc
    """)
    # Tag every row with the snapshot year so yearly results can be stacked later.
    return neighbourhood_data.withColumn("year", F.lit(year))
    

2020: 

In [7]:
# Per-neighbourhood summary for the June 2020 snapshot.
neighbourhood_data_2020 = neighbourhood_data_analysis_summary(
    file_path="data/Boston/2020/June/listings_original.csv",
    year=2020,
)
neighbourhood_data_2020.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Dorchester,420,159.15,2020
1,Downtown,277,304.44,2020
2,Back Bay,224,259.63,2020
3,South End,214,171.21,2020
4,Allston,197,107.5,2020
5,South Boston,168,191.32,2020
6,Fenway,167,224.37,2020


2019: 

In [8]:
# Per-neighbourhood summary for the June 2019 snapshot.
neighbourhood_data_2019 = neighbourhood_data_analysis_summary(
    file_path="data/Boston/2019/June/listings.csv",
    year=2019,
)
neighbourhood_data_2019.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Dorchester,558,107.04,2019
1,Downtown,498,264.05,2019
2,Back Bay,471,332.59,2019
3,South End,404,227.28,2019
4,Fenway,400,235.58,2019
5,Allston,338,181.63,2019
6,South Boston,333,225.68,2019


2018:

In [9]:
# Per-neighbourhood summary for the July 2018 snapshot.
neighbourhood_data_2018 = neighbourhood_data_analysis_summary(
    file_path="data/Boston/2018/July/listings.csv",
    year=2018,
)
neighbourhood_data_2018.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Dorchester,519,99.11,2018
1,Fenway,498,237.59,2018
2,Back Bay,464,248.84,2018
3,South End,406,212.26,2018
4,Allston,393,156.82,2018
5,Downtown,361,252.73,2018
6,South Boston,312,211.29,2018


In [10]:
# Per-neighbourhood summary for the October 2017 snapshot.
neighbourhood_data_2017 = neighbourhood_data_analysis_summary(
    file_path="data/Boston/2017/October/listings.csv",
    year=2017,
)
neighbourhood_data_2017.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Back Bay,410,239.88,2017
1,Dorchester,398,98.26,2017
2,Fenway,357,210.86,2017
3,South End,354,214.52,2017
4,Allston,297,99.63,2017
5,Downtown,295,252.32,2017
6,South Boston,264,197.83,2017


In [11]:
# Per-neighbourhood summary for the September 2016 snapshot.
neighbourhood_data_2016 = neighbourhood_data_analysis_summary(
    file_path="data/Boston/2016/September/listings.csv",
    year=2016,
)
neighbourhood_data_2016.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,South End,326,204.35,2016
1,Back Bay,302,240.95,2016
2,Fenway,290,220.39,2016
3,Dorchester,269,91.64,2016
4,Allston,260,112.31,2016
5,South Boston,174,187.61,2016
6,Downtown,172,236.46,2016


In [12]:
# Per-neighbourhood summary for the October 2015 snapshot.
neighbourhood_data_2015 = neighbourhood_data_analysis_summary(
    file_path="data/Boston/2015/October/listings.csv",
    year=2015,
)
neighbourhood_data_2015.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,South End,251,216.96,2015
1,Allston,223,128.91,2015
2,Back Bay,206,248.8,2015
3,Fenway,185,249.91,2015
4,Dorchester,166,104.14,2015
5,South Boston,114,191.62,2015
6,Downtown,103,243.82,2015


Let's combine all the dataframes

In [13]:
# Stack the six yearly summaries (2020 first, 2015 last) into one frame.
neighbourhood_data = (
    neighbourhood_data_2020
    .union(neighbourhood_data_2019)
    .union(neighbourhood_data_2018)
    .union(neighbourhood_data_2017)
    .union(neighbourhood_data_2016)
    .union(neighbourhood_data_2015)
)

In [14]:
# Store the listing counts as Spark IntegerType before handing the frame to the plots.
neighbourhood_data = neighbourhood_data.withColumn(
    "number_of_listings",
    F.col("number_of_listings").cast(IntegerType()),
)

Let's plot it

In [18]:
import plotly.express as px

In [19]:
# Line chart of listing counts per year, one line per neighbourhood.
init_notebook_mode(connected = True)
fig = px.line(
    data_frame=neighbourhood_data.toPandas(),
    x="year",
    y=["number_of_listings"],
    color='neighbourhood',
)
fig.show()

In [20]:
# Line chart of average price per year, one line per neighbourhood.
fig = px.line(
    data_frame=neighbourhood_data.toPandas(),
    x="year",
    y=["average_price"],
    color='neighbourhood',
)
fig.show()

We have now seen how each of these neighbourhoods has grown over the years.

But these neighbourhoods weren't thriving back in 2014: Fenway, Allston and Dorchester were considered to be the poorest neighbourhoods in Boston, according to research by the Boston Redevelopment Authority (http://www.bostonplans.org/getattachment/f1ecaf8a-d529-40b6-a9bc-8b4419587b86).

Let's take a deeper dive into these neighbourhoods and see how Airbnb has helped gentrify these neighbourhoods

We have carried out a few analyses of the above neighbourhoods here, but a deeper dive into the Airbnb reviews for these neighbourhoods can be found in the **Boston_Aibnb_Analysis_Detailed.ipynb** notebook.

For a deeper dive into the neighbourhoods, let's create a neighbourhood_data_analysis_detailed function that tweaks the summary function to also include a room_type analysis for each neighbourhood.

In [51]:
def neighbourhood_data_analysis_detailed(file_path, neighbourhood, year):
    """Summarise the listings of one neighbourhood by room type for one snapshot.

    Reads the listings CSV at ``file_path``, keeps the rows whose
    ``neighbourhood`` column equals ``neighbourhood``, and aggregates the
    number of listings and the mean price per ``room_type``.

    Args:
        file_path: Path to a listings CSV with a header row and (at least)
            ``neighbourhood``, ``room_type`` and ``price`` columns.
        neighbourhood: Neighbourhood name to keep (exact match).
        year: Value written into the ``year`` column of every returned row.

    Returns:
        A Spark DataFrame with the columns ``room_type``,
        ``number_of_listings``, ``average_price`` (rounded to 2 decimals) and
        ``year``, ordered by ``number_of_listings`` descending.

    Notes:
        As a side effect the ``neighbourhood`` temp view is (re)registered with
        the filtered listings of this call.
    """
    boston_listings_df = spark.read.csv(file_path, header=True)
    # Compare the column with the value directly instead of splicing it into
    # a SQL string. The previous f-string + .format() combination formatted
    # twice, raised on names containing braces, and built a broken expression
    # for names containing an apostrophe.
    filter_cond = F.col("neighbourhood") == neighbourhood
    neighbourhood_data = boston_listings_df.where(filter_cond)
    neighbourhood_data.createOrReplaceTempView("neighbourhood")
    return spark.sql("""
        SELECT room_type, COUNT(room_type) as number_of_listings, ROUND(avg(price), 2) as average_price
        FROM neighbourhood
        GROUP BY room_type
        ORDER BY number_of_listings desc
    """).withColumn("year", F.lit(year))
    

In [45]:
# Walk through the detailed analysis step by step for Allston in the
# October 2015 snapshot.
file_path = "data/Boston/2015/October/listings.csv"
neighbourhood = "Allston"
boston_listings_df = spark.read.csv(file_path, header=True)
# Filter with a Column comparison. The earlier f-string + .format() version
# formatted the text twice and broke on names containing braces or quotes.
filter_cond = F.col("neighbourhood") == neighbourhood
# NOTE(review): this rebinds `neighbourhood_data`, which earlier cells use for
# the combined yearly summary; re-running those plot cells afterwards will
# see this frame instead.
neighbourhood_data = boston_listings_df.where(filter_cond)
neighbourhood_data.toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,5103430,Brand New duplex - 3 BDR / 3 BA,26367207,Raphael,,Allston,42.3521929684635,-71.12961763207782,Entire home/apt,310,3,4,2015-08-28,0.88,3,68
1,7052847,1 big room in a cute 2 bedroom apt,10336060,Rebeca,,Allston,42.352805645017064,-71.12398135726829,Private room,60,18,0,,,1,70
2,5824015,Large room with lots of sunlight,30228015,Soc,,Allston,42.35325032658771,-71.12957884634393,Private room,50,1,0,,,1,162
3,8076561,cozy room,39391859,Sijia,,Allston,42.35411021239961,-71.1290134698431,Private room,58,1,0,,,3,11
4,8310653,Spacious and modern room by the T,18604433,Shashank,,Allston,42.353344419809424,-71.12693351553787,Private room,85,1,0,,,1,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,6956835,8min walk to Harvard Square-Room 4,36462476,Nina,,Allston,42.36701652141407,-71.11319002262228,Private room,100,1,7,2015-09-14,3.28,5,326
219,5874337,"Great 1 br, short walk to Harvard",4322960,William,,Allston,42.3765708474865,-71.13352206460672,Entire home/apt,210,2,5,2015-09-29,1.35,1,192
220,8118800,Convenient Harvard apartment!,42872576,Jess,,Allston,42.37026013742091,-71.11431998845184,Entire home/apt,150,1,3,2015-09-03,2.81,1,7
221,7968929,Warm & Spacious 3BR by Flatbook,12243051,Flatbook,,Allston,42.348995684400336,-71.11612744441601,Entire home/apt,246,3,2,2015-09-21,2,19,293


In [46]:
# Expose the filtered listings to SQL, then count listings and average the
# price per room type, most common room type first.
neighbourhood_data.createOrReplaceTempView("neighbourhood")
room_type_summary_sql = """
    SELECT room_type, COUNT(room_type) as number_of_listings, ROUND(avg(price), 2) as average_price
    FROM neighbourhood
    GROUP BY room_type
    ORDER BY number_of_listings desc
"""
spark.sql(room_type_summary_sql).toPandas()

Unnamed: 0,room_type,number_of_listings,average_price
0,Private room,140,88.69
1,Entire home/apt,76,208.61
2,Shared room,7,67.86


In [57]:
# Room-type breakdown for the neighbourhood chosen above (`neighbourhood`),
# one frame per yearly snapshot.
allston_neighbourhood_2015 = neighbourhood_data_analysis_detailed(
    "data/Boston/2015/October/listings.csv", neighbourhood, 2015
)
allston_neighbourhood_2016 = neighbourhood_data_analysis_detailed(
    "data/Boston/2016/September/listings.csv", neighbourhood, 2016
)
allston_neighbourhood_2017 = neighbourhood_data_analysis_detailed(
    "data/Boston/2017/October/listings.csv", neighbourhood, 2017
)
allston_neighbourhood_2018 = neighbourhood_data_analysis_detailed(
    "data/Boston/2018/July/listings.csv", neighbourhood, 2018
)
allston_neighbourhood_2019 = neighbourhood_data_analysis_detailed(
    "data/Boston/2019/June/listings.csv", neighbourhood, 2019
)
allston_neighbourhood_2020 = neighbourhood_data_analysis_detailed(
    "data/Boston/2020/June/listings_original.csv", neighbourhood, 2020
)

# Stack the yearly frames, newest snapshot first.
allston_neighbourhood = (
    allston_neighbourhood_2020
    .union(allston_neighbourhood_2019)
    .union(allston_neighbourhood_2018)
    .union(allston_neighbourhood_2017)
    .union(allston_neighbourhood_2016)
    .union(allston_neighbourhood_2015)
)
allston_neighbourhood.toPandas()


Unnamed: 0,room_type,number_of_listings,average_price,year
0,Private room,104,69.92,2020
1,Entire home/apt,92,150.82,2020
2,Shared room,1,30.0,2020
3,Private room,185,125.95,2019
4,Entire home/apt,150,253.15,2019
5,Shared room,3,39.33,2019
6,Private room,224,65.62,2018
7,Entire home/apt,161,289.35,2018
8,Shared room,8,43.38,2018
9,Private room,186,68.02,2017


In [58]:
# Average price per year, one line per room type.
fig = px.line(
    data_frame=allston_neighbourhood.toPandas(),
    x="year",
    y=["average_price"],
    color='room_type',
)
fig.show()

In [59]:
# Number of listings per year, one line per room type.
fig = px.line(
    data_frame=allston_neighbourhood.toPandas(),
    x="year",
    y=["number_of_listings"],
    color='room_type',
)
fig.show()