In [1]:
import findspark

In [2]:
findspark.init("/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
spark = SparkSession.builder \
.appName("Aggregations") \
.master("local[2]") \
.config("spark.driver.memory","1500m") \
.config("spark.executer.memory","2g") \
.getOrCreate()

In [5]:
df = spark.read \
.option("header", True) \
.option("inferSchema", True) \
.option("compression","gzip") \
.csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")

In [9]:
df2 = df.withColumn("Tags", F.split(F.col("Tags"),",")
                   .cast(ArrayType(StringType()))) \
.withColumn("Review_Date", F.to_date(F.col("Review_Date"),"M/d/yyyy"))

In [10]:
df2.limit(2).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683


In [11]:
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [13]:
# count, avg, min, max

df2.select(F.min("Review_Date"), F.max("Review_Date")).show(4)

+----------------+----------------+
|min(Review_Date)|max(Review_Date)|
+----------------+----------------+
|      2015-08-04|      2017-08-03|
+----------------+----------------+



In [14]:
df2.select("Reviewer_Nationality").distinct().count()

227

In [18]:
# group by

df2.select("Review_Date","Average_Score") \
.withColumn("Year", F.year(F.col("Review_Date"))) \
.groupBy("Year") \
.agg(F.sum("Average_Score")).alias("total_Average_Score").show(4)

+----+------------------+
|Year|sum(Average_Score)|
+----+------------------+
|2015| 791701.2000000357|
|2016|2221477.5000000857|
|2017|1317724.4000004139|
+----+------------------+



In [19]:
# order by

df2.select("Review_Date","Average_Score") \
.withColumn("Year", F.year(F.col("Review_Date"))) \
.groupBy("Year") \
.agg(F.sum("Average_Score")).alias("total_Average_Score") \
.orderBy(F.desc("Year")).show(4)

+----+------------------+
|Year|sum(Average_Score)|
+----+------------------+
|2017|1317724.4000004139|
|2016|2221477.5000000857|
|2015| 791701.2000000357|
+----+------------------+



In [20]:
# Multiple Agg Functions

df2.select("Review_Date","Average_Score") \
.withColumn("Year", F.year(F.col("Review_Date"))) \
.groupBy("Year") \
.agg(F.sum("Average_Score").alias("total_Average_Score"), F.avg("Average_Score").alias("avg_Average_Score")) \
.show(4)

+----+-------------------+-----------------+
|Year|total_Average_Score|avg_Average_Score|
+----+-------------------+-----------------+
|2015|  791701.2000000357|8.375397505475004|
|2016| 2221477.5000000857|8.401861930462536|
|2017| 1317724.4000004139|8.403425845622762|
+----+-------------------+-----------------+



In [21]:
df2.select("Review_Date","Average_Score","Reviewer_Nationality") \
.withColumn("Year", F.year(F.col("Review_Date"))) \
.groupBy("Year","Reviewer_Nationality")\
.agg(F.sum("Average_Score").alias("total_Average_Score"), F.avg("Average_Score").alias("avg_Average_Score")) \
.orderBy(F.desc("Year"))\
.show(4)

+----+--------------------+-------------------+-----------------+
|Year|Reviewer_Nationality|total_Average_Score|avg_Average_Score|
+----+--------------------+-------------------+-----------------+
|2017|             Monaco |              805.7|8.663440860215054|
|2017|            Belarus |  525.4999999999998|8.341269841269838|
|2017|          Australia |  54190.59999999995|8.467281249999992|
|2017|          Argentina |             1272.5|8.483333333333333|
+----+--------------------+-------------------+-----------------+
only showing top 4 rows



In [22]:
spark.stop()