In [57]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import types as T


In [58]:
spark = SparkSession\
        .builder\
        .master("local")\
        .getOrCreate()

In [73]:
boston_reviews_df = spark.read.csv("data/Boston/2020/June/reviews.csv", header=True).withColumnRenamed("id", "review_id") 
boston_reviews_df.toPandas()

Unnamed: 0,listing_id,review_id,date,reviewer_id,reviewer_name,comments
0,3781,37776825,2015-07-10,36059247,Greg,The apartment was as advertised and Frank was ...
1,3781,41842494,2015-08-09,10459388,Tai,It was a pleasure to stay at Frank's place. Th...
2,3781,45282151,2015-09-01,12264652,Damien,The apartment description is entirely faithful...
3,3781,49022647,2015-09-30,41426327,Mike,Thoroughly enjoyed my time at Frank's home. Ha...
4,3781,52503327,2015-10-30,15151513,Ivan,Great value for the money! This location has e...
...,...,...,...,...,...,...
179901,43442619,625246771,2020-05-18,41655471,Adrian,Highly recommend this place if you're visiting...
179902,43442619,626642174,2020-05-29,72655982,Eliot,View is great. A nice two bedroom. Park next d...
179903,"At my time of stay (05/20) fees were exorbitant.""",,,,,
179904,43489770,627807090,2020-06-05,347912013,Keiji,Perfect stay


In [60]:
boston_listings_df = spark.read.csv("data/Boston/2020/June/listings_original.csv", header=True)
boston_listings_df.count()

3446

In [61]:
boston_listings_df.createOrReplaceTempView("boston_listings")

In [62]:
spark.sql("SELECT * FROM boston_listings WHERE id='3781'").toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,3781,HARBORSIDE-Walk to subway,4804,Frank,,East Boston,42.36413,-71.02991,Entire home/apt,125,28,16,2019-12-21,0.27,1,97


In [63]:
boston_reviews_df.createOrReplaceTempView("boston_reviews")

In [64]:
spark.sql("SELECT comments FROM boston_reviews WHERE listing_id='3781'").toPandas()

Unnamed: 0,comments
0,The apartment was as advertised and Frank was ...
1,It was a pleasure to stay at Frank's place. Th...
2,The apartment description is entirely faithful...
3,Thoroughly enjoyed my time at Frank's home. Ha...
4,Great value for the money! This location has e...
5,Frank was very accommodating throughout my sta...
6,Excellent! This was my second stay at the Jeff...
7,Frank was a wonderful and accommodating host. ...
8,"Frank was great, the apartment has everything ..."
9,Very nice. Comfortable apartment. Good locatio...


In [65]:
boston_listings_df = spark.read.csv("data/Boston/2020/June/listings_original.csv", header=True)
boston_listings_df.toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,3781,HARBORSIDE-Walk to subway,4804,Frank,,East Boston,42.36413,-71.02991,Entire home/apt,125,28,16,2019-12-21,0.27,1,97
1,5506,**$49 Special ** Private! Minutes to center!,8229,Terry,,Roxbury,42.32981,-71.09559,Entire home/apt,145,3,107,2020-05-01,0.78,6,333
2,6695,$99 Special!! Home Away! Condo,8229,Terry,,Roxbury,42.32994,-71.09351,Entire home/apt,169,3,115,2019-11-02,0.87,6,317
3,8789,Curved Glass Studio/1bd facing Park,26988,Anne,,Downtown,42.35919,-71.06265,Entire home/apt,99,91,25,2020-04-15,0.35,8,365
4,10730,Bright 1bed facing Golden Dome,26988,Anne,,Downtown,42.3584,-71.06185,Entire home/apt,150,91,32,2020-04-16,0.25,8,282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3441,43690671,Beautiful 2 bedroom apartment/2nd floor,127718668,Muny,,Dorchester,42.31281,-71.05591,Entire home/apt,170,1,0,,,13,363
3442,43690782,Beautiful 2-bedroom apartment/3rd floor,127718668,Muny,,Dorchester,42.31235,-71.0553,Entire home/apt,170,1,0,,,13,361
3443,43715032,2.bostonparkplaza · SUPERIOR ROOM,243931054,Boston Park Plaza,,Downtown,42.35167,-71.07058,Hotel room,100,1,0,,,13,348
3444,43715125,Private Escape,349225095,Trevor,,Dorchester,42.29637,-71.07314,Private room,90,1,0,,,1,84


In [66]:
neighbourhoods_list = ["Back Bay", 'South Boston', 'South End', 'Fenway', 'Allston', 'Dorchester', 'Downtown']
boston_neighbourhoods_df = spark.createDataFrame(neighbourhoods_list, StringType())
boston_neighbourhoods_df = boston_neighbourhoods_df.selectExpr("value as neighbourhood")

In [67]:
boston_top_neighbourhood = boston_listings_df.join(boston_neighbourhoods_df, ["neighbourhood"] , "left_semi")
boston_top_neighbourhood.toPandas()

Unnamed: 0,neighbourhood,id,name,host_id,host_name,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,South End,22354,COPLEY SQ...19TH C.TWNHSE SUNNY RM,85770,Robert,,42.34496,-71.07486,Private room,148,2,316,2019-12-13,2.93,5,90
1,South End,225224,4th floor office room,85770,Robert,,42.34643,-71.07503,Private room,105,2,318,2020-03-21,3.02,5,90
2,South End,322593,"Near Hospitals, 28 Day Bking Only,OutdoorSpace",1651480,Edward,,42.34232,-71.07594,Entire home/apt,125,28,408,2020-03-31,4.65,4,91
3,South End,526970,BACK BAY/COPLEY FIRST FLOOR ROOM,85770,Robert,,42.34523,-71.07282,Private room,148,2,175,2019-10-31,1.80,5,90
4,South End,798957,"★Discounted★ NearHospitals★2floors★ 2 Baths,2beds",1651480,Edward,,42.34232,-71.07594,Entire home/apt,347,28,417,2020-04-30,4.82,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,South Boston,43102656,Large bedroom in shared apartment with two males,10778805,Paul,,42.32958,-71.05638,Private room,50,90,0,,,3,363
1663,South Boston,43137883,Spacious Queen Room in South Boston,297860058,June,,42.33393,-71.02503,Private room,48,30,0,,,31,364
1664,South Boston,43638802,WB Quarters™ by STRB|Hangout|Priv. Room|Shared BA,814298,Thatch,,42.34313,-71.05606,Private room,90,28,0,,,92,141
1665,South Boston,43639005,WB Quarters™ by STRB|Hangout|Priv. Room|Shared BA,814298,Thatch,,42.34301,-71.0573,Private room,90,28,0,,,92,144


In [None]:
boston_top_neighbourhood = boston_top_neighbourhood.join(boston_reviews_df, boston_top_neighbourhood["id"] == boston_reviews_df["listing_id"], "left")
boston_top_neighbourhood.toPandas()