In [30]:
# Do all imports and installs here
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [3]:
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()

In [4]:
# Read in the data here
df_hotel_reviews = spark.read.csv('Hotel_Reviews.csv', header=True, inferSchema=True)
df_hotel_reviews.printSchema()


root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [8]:
print((df_hotel_reviews.count(), len(df_hotel_reviews.columns)))

(515738, 17)


In [20]:
dist_hotel_columns = ['Hotel_Name', 'Hotel_Address', 'lat', 'lng']
df_distinct_hotels = df_hotel_reviews.dropDuplicates(subset=dist_hotel_columns)

In [21]:
print((df_distinct_hotels.count(), len(df_distinct_hotels.columns)))

(1494, 17)


There are therefore only 1494 unique hotels, even though there are more than half a million reviews.

In [22]:
df_distinct_hotels.take(3)

[Row(Hotel_Address='Savoyenstra e 2 16 Ottakring 1160 Vienna Austria', Additional_Number_of_Scoring=86, Review_Date='7/31/2017', Average_Score=8.3, Hotel_Name='Austria Trend Hotel Schloss Wilhelminenberg Wien', Reviewer_Nationality=' Netherlands ', Negative_Review=' Although the building looks majestic on the outside the inner hallways on the higher floors could use some renovation old carpets walls and ceilings need a freshen up The wooden floors expecially in the hallways are quite squeaky Our room was in decent shape but had no airconditioning a fan was provided though We stayed during a 30 degree celcius period so the room was rather warm and hard to ventilate because of the small single window ', Review_Total_Negative_Word_Counts=79, Total_Number_of_Reviews=1558, Positive_Review=' Excellent location with spectacular view over the city provided your room is on the backside of the building Local bus to the city stops right in front of the hotel Ample free outdoor parking on the hote

In [31]:
def get_last_word(address):
    """ Gets the last word of the string 'address'"""
    return address.split()[-1]

udf_get_last_word = F.udf(get_last_word, StringType())

In [36]:
df_distinct_hotels = df_distinct_hotels.withColumn('country', udf_get_last_word("Hotel_Address"))

In [42]:
df_distinct_hotels.select('country').distinct().show()

+-----------+
|    country|
+-----------+
|     France|
|      Italy|
|      Spain|
|    Kingdom|
|    Austria|
|Netherlands|
+-----------+



In [46]:
df_hotel_reviews.select('Reviewer_Nationality').distinct().count()

227

So, we see that there are hotels from 6 countries, but reviewers from 227 countries.