In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd

In [2]:
# Local-mode Spark session; the `spark` name is reused by the cells below.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

Let's start by analyzing the Boston neighbourhoods.

In [3]:
# Load the June 2020 neighbourhood list for Boston; the first CSV row is the header.
boston_neighbourhoods_df = (
    spark.read
    .option("header", True)
    .csv("data/Boston/2020/June/neighbourhoods.csv")
)
boston_neighbourhoods_df.toPandas()

Unnamed: 0,neighbourhood_group,neighbourhood
0,,Allston
1,,Back Bay
2,,Bay Village
3,,Beacon Hill
4,,Brighton
5,,Charlestown
6,,Chinatown
7,,Dorchester
8,,Downtown
9,,East Boston


We can see that, for Boston at least, there are no neighbourhood_groups, so let's drop the column.

In [4]:
# neighbourhood_group is empty in the Boston file, so keep everything except that column.
boston_neighbourhoods_df = boston_neighbourhoods_df.drop(
    'neighbourhood_group'
)
boston_neighbourhoods_df.toPandas()

Unnamed: 0,neighbourhood
0,Allston
1,Back Bay
2,Bay Village
3,Beacon Hill
4,Brighton
5,Charlestown
6,Chinatown
7,Dorchester
8,Downtown
9,East Boston


We will run the analysis only on the top neighbourhoods, i.e. those with the highest YoY percent change, which we selected from "http://insideairbnb.com/get-the-data.html".

In [6]:
# Hand-picked neighbourhoods that the rest of the analysis is restricted to.
neighbourhoods_list = [
    "Back Bay",
    "South Boston",
    "South End",
    "Fenway",
    "Allston",
    "Dorchester",
    "Downtown",
]

# The list is loaded as a single string column, which is then exposed under the
# name `neighbourhood` and registered as the `boston_neighbourhoods` temp view
# so that SQL queries can filter against it.
boston_neighbourhoods_df = (
    spark.createDataFrame(neighbourhoods_list, StringType())
    .selectExpr("value as neighbourhood")
)
boston_neighbourhoods_df.createOrReplaceTempView('boston_neighbourhoods')
boston_neighbourhoods_df.toPandas()

Unnamed: 0,neighbourhood
0,Back Bay
1,South Boston
2,South End
3,Fenway
4,Allston
5,Dorchester
6,Downtown


Let's create a function that reads in the listings data for a given year and returns per-neighbourhood data: the number of listings and the average price for that year.

In [7]:
def neighbourhood_data_analysis(file_path, year):
    """Summarise one listings snapshot for the selected Boston neighbourhoods.

    Reads the listings CSV at ``file_path``, keeps only the rows whose
    ``neighbourhood`` appears in the ``boston_neighbourhoods`` temp view, and
    aggregates the remaining rows per neighbourhood.

    Relies on the notebook-level ``spark`` session and on the
    ``boston_neighbourhoods`` temp view having been registered beforehand.

    Side effects: (re)registers two temp views on the session --
    ``boston_listings_raw`` (the unfiltered CSV) and ``boston_listings``
    (the filtered rows of this snapshot). Every call overwrites both.

    Args:
        file_path: Path to a listings CSV with a header row that contains at
            least the ``neighbourhood`` and ``price`` columns.
        year: Value written into the ``year`` column of every returned row.

    Returns:
        A Spark DataFrame with the columns ``neighbourhood``,
        ``number_of_listings``, ``average_price`` (rounded to 2 decimals) and
        ``year``, ordered by ``number_of_listings`` descending.
    """
    raw_listings_df = spark.read.csv(file_path, header=True)
    # The raw snapshot gets its own view name. Registering it as
    # 'boston_listings' and then replacing that view with a query that reads
    # from it makes the view refer to itself; Spark 3.1+ is reported to reject
    # this with a "Recursive view ... detected" AnalysisException.
    raw_listings_df.createOrReplaceTempView('boston_listings_raw')
    boston_listings_df = spark.sql("""
        SELECT * FROM boston_listings_raw L
        WHERE L.neighbourhood IN
        (SELECT neighbourhood FROM boston_neighbourhoods)
    """)
    # Kept under the original name so cells that query 'boston_listings' still work.
    boston_listings_df.createOrReplaceTempView('boston_listings')
    # No schema is supplied or inferred when reading the CSV, so price is cast
    # explicitly instead of relying on avg() to coerce it.
    neighbourhood_data = spark.sql("""
        SELECT neighbourhood,
               count(neighbourhood) as number_of_listings,
               ROUND(avg(CAST(price AS DOUBLE)), 2) as average_price
        FROM boston_listings
        GROUP BY neighbourhood
        ORDER BY number_of_listings desc
    """)
    return neighbourhood_data.withColumn("year", F.lit(year))
    

2020: 

In [8]:
# 2020 figures come from the June snapshot.
neighbourhood_data_2020 = neighbourhood_data_analysis(
    file_path="data/Boston/2020/June/listings_original.csv",
    year=2020,
)
neighbourhood_data_2020.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Dorchester,420,159.15,2020
1,Downtown,277,304.44,2020
2,Back Bay,224,259.63,2020
3,South End,214,171.21,2020
4,Allston,197,107.5,2020
5,South Boston,168,191.32,2020
6,Fenway,167,224.37,2020


2019: 

In [9]:
# 2019 figures come from the June snapshot.
neighbourhood_data_2019 = neighbourhood_data_analysis(
    file_path="data/Boston/2019/June/listings.csv",
    year=2019,
)
neighbourhood_data_2019.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Dorchester,558,107.04,2019
1,Downtown,498,264.05,2019
2,Back Bay,471,332.59,2019
3,South End,404,227.28,2019
4,Fenway,400,235.58,2019
5,Allston,338,181.63,2019
6,South Boston,333,225.68,2019


2018:

In [10]:
# 2018 figures come from the July snapshot.
neighbourhood_data_2018 = neighbourhood_data_analysis(
    file_path="data/Boston/2018/July/listings.csv",
    year=2018,
)
neighbourhood_data_2018.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Dorchester,519,99.11,2018
1,Fenway,498,237.59,2018
2,Back Bay,464,248.84,2018
3,South End,406,212.26,2018
4,Allston,393,156.82,2018
5,Downtown,361,252.73,2018
6,South Boston,312,211.29,2018


In [11]:
# 2017 figures come from the October snapshot.
neighbourhood_data_2017 = neighbourhood_data_analysis(
    file_path="data/Boston/2017/October/listings.csv",
    year=2017,
)
neighbourhood_data_2017.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,Back Bay,410,239.88,2017
1,Dorchester,398,98.26,2017
2,Fenway,357,210.86,2017
3,South End,354,214.52,2017
4,Allston,297,99.63,2017
5,Downtown,295,252.32,2017
6,South Boston,264,197.83,2017


In [12]:
# 2016 figures come from the September snapshot.
neighbourhood_data_2016 = neighbourhood_data_analysis(
    file_path="data/Boston/2016/September/listings.csv",
    year=2016,
)
neighbourhood_data_2016.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,South End,326,204.35,2016
1,Back Bay,302,240.95,2016
2,Fenway,290,220.39,2016
3,Dorchester,269,91.64,2016
4,Allston,260,112.31,2016
5,South Boston,174,187.61,2016
6,Downtown,172,236.46,2016


In [13]:
# 2015 figures come from the October snapshot.
neighbourhood_data_2015 = neighbourhood_data_analysis(
    file_path="data/Boston/2015/October/listings.csv",
    year=2015,
)
neighbourhood_data_2015.toPandas()

Unnamed: 0,neighbourhood,number_of_listings,average_price,year
0,South End,251,216.96,2015
1,Allston,223,128.91,2015
2,Back Bay,206,248.8,2015
3,Fenway,185,249.91,2015
4,Dorchester,166,104.14,2015
5,South Boston,114,191.62,2015
6,Downtown,103,243.82,2015


Let's combine all the dataframes

In [14]:
# Stack the yearly summaries, newest first, into one long frame.
# union() matches columns by position; every frame comes from the same
# function, so the column order is the same in all of them.
neighbourhood_data = (
    neighbourhood_data_2020
    .union(neighbourhood_data_2019)
    .union(neighbourhood_data_2018)
    .union(neighbourhood_data_2017)
    .union(neighbourhood_data_2016)
    .union(neighbourhood_data_2015)
)

In [15]:
# Replace number_of_listings with the same values cast to IntegerType.
neighbourhood_data = neighbourhood_data.withColumn(
    "number_of_listings",
    F.col("number_of_listings").cast(IntegerType()),
)

Let's plot it

In [17]:
import plotly.express as px

In [19]:
# Number of listings per neighbourhood over the years.
# px.line connects points in row order, so sort by year before plotting.
listings_trend_pdf = neighbourhood_data.toPandas().sort_values("year")
fig = px.line(
    listings_trend_pdf,
    x="year",
    # A plain column name instead of a one-element list: the list form puts
    # Plotly Express into wide-form mode, which labels the axis "value".
    y="number_of_listings",
    color="neighbourhood",
    title="Airbnb listings per neighbourhood, Boston 2015-2020",
    labels={
        "year": "Year",
        "number_of_listings": "Number of listings",
        "neighbourhood": "Neighbourhood",
    },
)
fig.update_xaxes(dtick=1)  # one tick per year, no fractional years
fig.show()

In [20]:
# Average listing price per neighbourhood over the years.
# px.line connects points in row order, so sort by year before plotting.
price_trend_pdf = neighbourhood_data.toPandas().sort_values("year")
fig = px.line(
    price_trend_pdf,
    x="year",
    # A plain column name instead of a one-element list: the list form puts
    # Plotly Express into wide-form mode, which labels the axis "value".
    y="average_price",
    color="neighbourhood",
    title="Average Airbnb listing price per neighbourhood, Boston 2015-2020",
    labels={
        "year": "Year",
        "average_price": "Average price",
        "neighbourhood": "Neighbourhood",
    },
)
fig.update_xaxes(dtick=1)  # one tick per year, no fractional years
fig.show()

For average_price, we concluded that we were going about it the wrong way: the listings need to be split further by room_type to get a better estimate of how each neighbourhood has been affected.

For further analysis, we decided to incorporate the review data for each listing and to gather the sentiment of each review as well.