In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql import types as T
from pyspark import SparkFiles
from textblob import TextBlob
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

In [None]:
# Create (or reuse) a Spark session that runs on the local master.
# "spark.files.overwrite" is enabled — presumably so that the per-year
# downloads, which share file names, can replace one another (TODO confirm).
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.files.overwrite", True)
    .getOrCreate()
)

In [1]:
def get_complete_boston_data(listings_url, reviews_url, year):
    """Load one Boston snapshot and join the selected listings with their reviews.

    Relies on two notebook-level globals: ``spark`` (the active SparkSession)
    and ``boston_neighbourhoods_df`` (one ``neighbourhood`` column listing the
    neighbourhoods to keep).

    Args:
        listings_url: Location of the listings CSV; read with a header row.
        reviews_url: Location of the reviews CSV (may be gzip-compressed);
            read with a header row.
        year: Value stored in the added ``year`` column of every returned row.

    Returns:
        A Spark DataFrame with one row per (listing, review) pair for listings
        in the selected neighbourhoods. Listings without a matching review are
        kept (left join) with null review columns. The reviews' ``id`` column
        is exposed as ``review_id`` so it does not clash with the listing
        ``id``.
    """
    # Function-scope imports keep this cell runnable on its own.
    import posixpath
    from urllib.parse import urlparse

    def _downloaded_name(url):
        # SparkFiles.get() takes the bare file name of an added file, i.e. the
        # last path segment of the URL it was added from.
        return posixpath.basename(urlparse(url).path)

    spark.sparkContext.addFile(reviews_url)
    spark.sparkContext.addFile(listings_url)

    # NOTE(review): if review comments contain embedded newlines the CSV reader
    # would need multiLine=True to keep such rows intact — confirm on the data.
    reviews = (
        spark.read.option("header", "true")
        .csv(SparkFiles.get(_downloaded_name(reviews_url)))
        # Keeps only rows whose comments differ from the literal string 'None';
        # null comments do not satisfy the predicate either.
        .where("comments != 'None'")
        .withColumnRenamed("id", "review_id")
    )
    listings = spark.read.csv(SparkFiles.get(_downloaded_name(listings_url)), header=True)

    # left_semi keeps only listing columns, for listings in a selected neighbourhood.
    selected = listings.join(boston_neighbourhoods_df, ["neighbourhood"], "left_semi")
    joined = selected.join(reviews, selected["id"] == reviews["listing_id"], "left")

    # Every snapshot is downloaded under the same file names, and Spark reads
    # lazily, so a frame that still points at the file could pick up a later
    # snapshot's contents once the next year is loaded. localCheckpoint() is
    # eager by default: it materialises the rows now and cuts the lineage back
    # to the file.
    return joined.withColumn("year", F.lit(year)).localCheckpoint()

In [2]:
# Neighbourhoods under study, held as a one-column Spark DataFrame so the
# listings can be joined against it. The single string column is created as
# "value" and then exposed as "neighbourhood".
neighbourhoods_list = ['Fenway', 'Allston', 'Dorchester']
boston_neighbourhoods_df = (
    spark
    .createDataFrame(neighbourhoods_list, StringType())
    .selectExpr("value as neighbourhood")
)

NameError: name 'spark' is not defined

In [4]:
# Snapshot dated 2020-06-10 (per the URL path).
reviews_url = "http://data.insideairbnb.com/united-states/ma/boston/2020-06-10/data/reviews.csv.gz"
listings_url = "http://data.insideairbnb.com/united-states/ma/boston/2020-06-10/visualisations/listings.csv"

boston_neighbourhood_2020 = get_complete_boston_data(listings_url, reviews_url, 2020)
# Preview a few rows only: a bare toPandas() would pull every joined row into
# driver memory just to render the cell output.
boston_neighbourhood_2020.limit(5).toPandas()

NameError: name 'spark' is not defined

In [5]:
# Snapshot dated 2019-06-14 (per the URL path).
reviews_url = "http://data.insideairbnb.com/united-states/ma/boston/2019-06-14/data/reviews.csv.gz"
listings_url = "http://data.insideairbnb.com/united-states/ma/boston/2019-06-14/visualisations/listings.csv"
boston_neighbourhood_2019 = get_complete_boston_data(listings_url, reviews_url, 2019)
# Preview a few rows only: a bare toPandas() would pull every joined row into
# driver memory just to render the cell output.
boston_neighbourhood_2019.limit(5).toPandas()

NameError: name 'spark' is not defined

In [6]:
# Snapshot dated 2018-07-18 (per the URL path).
reviews_url = "http://data.insideairbnb.com/united-states/ma/boston/2018-07-18/data/reviews.csv.gz"
listings_url = "http://data.insideairbnb.com/united-states/ma/boston/2018-07-18/visualisations/listings.csv"
boston_neighbourhood_2018 = get_complete_boston_data(listings_url, reviews_url, 2018)
# Preview a few rows only: a bare toPandas() would pull every joined row into
# driver memory just to render the cell output.
boston_neighbourhood_2018.limit(5).toPandas()

NameError: name 'spark' is not defined

In [7]:
# Snapshot dated 2017-10-06 (per the URL path).
reviews_url = "http://data.insideairbnb.com/united-states/ma/boston/2017-10-06/data/reviews.csv.gz"
listings_url = "http://data.insideairbnb.com/united-states/ma/boston/2017-10-06/visualisations/listings.csv"
boston_neighbourhood_2017 = get_complete_boston_data(listings_url, reviews_url, 2017)
# Preview a few rows only: a bare toPandas() would pull every joined row into
# driver memory just to render the cell output.
boston_neighbourhood_2017.limit(5).toPandas()

NameError: name 'spark' is not defined

In [8]:
# Snapshot dated 2016-09-07 (per the URL path).
reviews_url = "http://data.insideairbnb.com/united-states/ma/boston/2016-09-07/data/reviews.csv.gz"
listings_url = "http://data.insideairbnb.com/united-states/ma/boston/2016-09-07/visualisations/listings.csv"
boston_neighbourhood_2016 = get_complete_boston_data(listings_url, reviews_url, 2016)
# Preview a few rows only: a bare toPandas() would pull every joined row into
# driver memory just to render the cell output.
boston_neighbourhood_2016.limit(5).toPandas()

NameError: name 'spark' is not defined

In [9]:
# Snapshot dated 2015-10-03 (per the URL path).
reviews_url = "http://data.insideairbnb.com/united-states/ma/boston/2015-10-03/data/reviews.csv.gz"
listings_url = "http://data.insideairbnb.com/united-states/ma/boston/2015-10-03/visualisations/listings.csv"
boston_neighbourhood_2015 = get_complete_boston_data(listings_url, reviews_url, 2015)
# Preview a few rows only: a bare toPandas() would pull every joined row into
# driver memory just to render the cell output.
boston_neighbourhood_2015.limit(5).toPandas()

NameError: name 'spark' is not defined

Now that we have the combined data for all six yearly snapshots (2015–2020) for the required neighbourhoods, we can take a deep dive into each neighbourhood.

Fenway

Let's combine Fenway data from all the years

In [11]:
# Stack the Fenway rows of every yearly snapshot, oldest first, into one frame.
# union() pairs columns by position, so this relies on all snapshots sharing
# the same column layout.
fenway_neighbourhood = boston_neighbourhood_2015.where("neighbourhood == 'Fenway'")
for yearly_snapshot in (
    boston_neighbourhood_2016,
    boston_neighbourhood_2017,
    boston_neighbourhood_2018,
    boston_neighbourhood_2019,
    boston_neighbourhood_2020,
):
    fenway_neighbourhood = fenway_neighbourhood.union(
        yearly_snapshot.where("neighbourhood == 'Fenway'")
    )
# Preview a few rows only instead of collecting the whole union to the driver.
fenway_neighbourhood.limit(5).toPandas()

NameError: name 'boston_neighbourhood_2015' is not defined

Get the sentiment score for each review using the TextBlob library. Before that, the reviews need to be translated to English, as not all of them are written in English.

In [12]:
def translate(sentence):
    """Return ``sentence`` translated to English, falling back to the input.

    Args:
        sentence: Text to translate.

    Returns:
        The English translation as a ``str``. If the translation attempt
        raises for any reason, ``sentence`` is returned unchanged, so callers
        always get usable text back.
    """
    try:
        # Translate the argument itself and hand the result back to the caller.
        return str(TextBlob(sentence).translate(to='en'))
    except Exception:
        # Best effort: keep the original text when translation is not possible
        # (presumably e.g. text that is already English, or the translation
        # backend being unavailable — confirm against the installed TextBlob).
        return sentence

In [13]:
def _translate_or_none(sentence):
    """Pass non-null text through translate(); keep nulls as None."""
    if sentence is None:
        return None
    return translate(sentence)


# Spark UDF producing a string column from the helper above.
translate_udf = F.udf(_translate_or_none, T.StringType())

In [14]:
# Replace each comment with the output of translate_udf (nulls stay null).
fenway_neighbourhood = fenway_neighbourhood.withColumn('comments', translate_udf(F.col("comments")))
# Preview a few rows only: a bare toPandas() would run the Python UDF over
# every row and collect the full frame to the driver just for display.
fenway_neighbourhood.limit(5).toPandas()

NameError: name 'fenway_neighbourhood' is not defined

Sentiment Score:

In [15]:
# Spark UDF returning the TextBlob sentiment polarity of a comment (null in,
# null out). The return type is DoubleType so the resulting column is numeric
# and can be binned by a histogram; declaring it as a string type would hand
# the plots text values instead of numbers.
sentiment_udf = F.udf(
    lambda x: float(TextBlob(x).sentiment.polarity) if x is not None else None,
    T.DoubleType(),
)

In [16]:
# Add a "review_polarity" column by applying sentiment_udf to each comment.
fenway_neighbourhood = fenway_neighbourhood.withColumn('review_polarity', sentiment_udf(F.col("comments")))
# Preview a few rows only instead of collecting the whole frame to the driver.
fenway_neighbourhood.limit(5).toPandas()

NameError: name 'fenway_neighbourhood' is not defined

In [17]:
# Fenway, entire homes/apartments: distribution of review polarity.
fenway_neighbourhood_home = (
    fenway_neighbourhood
    .filter("room_type == 'Entire home/apt'")
    .toPandas()
)
fig = px.histogram(data_frame=fenway_neighbourhood_home, x="review_polarity")
fig.show()

NameError: name 'fenway_neighbourhood' is not defined

In [18]:
# Fenway, private rooms: distribution of review polarity.
fenway_neighbourhood_private = (
    fenway_neighbourhood
    .filter("room_type == 'Private room'")
    .toPandas()
)
fig = px.histogram(data_frame=fenway_neighbourhood_private, x="review_polarity")
fig.show()

NameError: name 'fenway_neighbourhood' is not defined

In [19]:
# Fenway, shared rooms: distribution of review polarity.
fenway_neighbourhood_shared = fenway_neighbourhood.where("room_type == 'Shared room'").toPandas()
# Plot the shared-room frame built on the line above, not the private-room
# frame from the previous cell.
fig = px.histogram(fenway_neighbourhood_shared, x="review_polarity")
fig.show()

NameError: name 'fenway_neighbourhood' is not defined

Allston:

Let's combine Allston data from all the years

In [21]:
# Stack the Allston rows of every yearly snapshot, oldest first, into one frame.
# union() pairs columns by position, so this relies on all snapshots sharing
# the same column layout.
allston_neighbourhood = boston_neighbourhood_2015.where("neighbourhood == 'Allston'")
for yearly_snapshot in (
    boston_neighbourhood_2016,
    boston_neighbourhood_2017,
    boston_neighbourhood_2018,
    boston_neighbourhood_2019,
    boston_neighbourhood_2020,
):
    allston_neighbourhood = allston_neighbourhood.union(
        yearly_snapshot.where("neighbourhood == 'Allston'")
    )
# Preview a few rows only instead of collecting the whole union to the driver.
allston_neighbourhood.limit(5).toPandas()

NameError: name 'boston_neighbourhood_2015' is not defined

Let's translate all the reviews to English first

In [22]:
# Replace each comment with the output of translate_udf (nulls stay null).
allston_neighbourhood = allston_neighbourhood.withColumn('comments', translate_udf(F.col("comments")))
# Preview a few rows only: a bare toPandas() would run the Python UDF over
# every row and collect the full frame to the driver just for display.
allston_neighbourhood.limit(5).toPandas()

NameError: name 'allston_neighbourhood' is not defined

Calculate the polarity of each review

In [23]:
# Add a "review_polarity" column by applying sentiment_udf to each comment.
allston_neighbourhood = allston_neighbourhood.withColumn('review_polarity', sentiment_udf(F.col("comments")))
# Preview a few rows only instead of collecting the whole frame to the driver.
allston_neighbourhood.limit(5).toPandas()

NameError: name 'allston_neighbourhood' is not defined

In [24]:
# Allston, entire homes/apartments: distribution of review polarity.
allston_neighbourhood_home = (
    allston_neighbourhood
    .filter("room_type == 'Entire home/apt'")
    .toPandas()
)
fig = px.histogram(
    data_frame=allston_neighbourhood_home,
    x="review_polarity",
    color="room_type",
)
fig.show()

NameError: name 'allston_neighbourhood' is not defined

In [25]:
# Allston, private rooms: distribution of review polarity.
allston_neighbourhood_private = (
    allston_neighbourhood
    .filter("room_type == 'Private room'")
    .toPandas()
)
fig = px.histogram(
    data_frame=allston_neighbourhood_private,
    x="review_polarity",
    color="room_type",
)
fig.show()

NameError: name 'allston_neighbourhood' is not defined

In [26]:
# Allston, shared rooms: distribution of review polarity.
allston_neighbourhood_shared = (
    allston_neighbourhood
    .filter("room_type == 'Shared room'")
    .toPandas()
)
fig = px.histogram(
    data_frame=allston_neighbourhood_shared,
    x="review_polarity",
    color="room_type",
)
fig.show()

NameError: name 'allston_neighbourhood' is not defined

Dorchester:

Let's combine Dorchester data from all the years

In [27]:
# Stack the Dorchester rows of every yearly snapshot, oldest first, into one frame.
# union() pairs columns by position, so this relies on all snapshots sharing
# the same column layout.
dorchester_neighbourhood = boston_neighbourhood_2015.where("neighbourhood == 'Dorchester'")
for yearly_snapshot in (
    boston_neighbourhood_2016,
    boston_neighbourhood_2017,
    boston_neighbourhood_2018,
    boston_neighbourhood_2019,
    boston_neighbourhood_2020,
):
    dorchester_neighbourhood = dorchester_neighbourhood.union(
        yearly_snapshot.where("neighbourhood == 'Dorchester'")
    )
# Preview a few rows only instead of collecting the whole union to the driver.
dorchester_neighbourhood.limit(5).toPandas()

NameError: name 'boston_neighbourhood_2015' is not defined

In [28]:
# Replace each comment with the output of translate_udf (nulls stay null).
dorchester_neighbourhood = dorchester_neighbourhood.withColumn('comments', translate_udf(F.col("comments")))
# Preview a few rows only: a bare toPandas() would run the Python UDF over
# every row and collect the full frame to the driver just for display.
dorchester_neighbourhood.limit(5).toPandas()

NameError: name 'dorchester_neighbourhood' is not defined

Calculate the polarity of each review

In [29]:
# Add a "review_polarity" column by applying sentiment_udf to each comment.
dorchester_neighbourhood = dorchester_neighbourhood.withColumn('review_polarity', sentiment_udf(F.col("comments")))
# Preview a few rows only instead of collecting the whole frame to the driver.
dorchester_neighbourhood.limit(5).toPandas()

NameError: name 'dorchester_neighbourhood' is not defined

In [30]:
# Dorchester, entire homes/apartments: distribution of review polarity.
dorchester_neighbourhood_home = (
    dorchester_neighbourhood
    .filter("room_type == 'Entire home/apt'")
    .toPandas()
)
fig = px.histogram(
    data_frame=dorchester_neighbourhood_home,
    x="review_polarity",
    color="room_type",
)
fig.show()

NameError: name 'dorchester_neighbourhood' is not defined

In [31]:
# Dorchester, private rooms: distribution of review polarity.
dorchester_neighbourhood_private = (
    dorchester_neighbourhood
    .filter("room_type == 'Private room'")
    .toPandas()
)
fig = px.histogram(
    data_frame=dorchester_neighbourhood_private,
    x="review_polarity",
    color="room_type",
)
fig.show()

NameError: name 'dorchester_neighbourhood' is not defined

In [32]:
# Dorchester, shared rooms: distribution of review polarity.
dorchester_neighbourhood_shared = (
    dorchester_neighbourhood
    .filter("room_type == 'Shared room'")
    .toPandas()
)
fig = px.histogram(
    data_frame=dorchester_neighbourhood_shared,
    x="review_polarity",
    color="room_type",
)
fig.show()

NameError: name 'dorchester_neighbourhood' is not defined