In [None]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Q4").getOrCreate()

In [3]:
from pyspark import SparkFiles
# Load in user_data.csv from S3 into a DataFrame
url = "https://s3.amazonaws.com/dataviz-curriculum/day_3/checkpoints/question_four.csv"
spark.sparkContext.addFile(url)

df = spark.read.option('header', 'true').csv(SparkFiles.get("question_four.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")
df.show(10)

+--------------------+-------------------+
|         review_text|               date|
+--------------------+-------------------+
|1 check-in Love l...|2016-01-25 00:11:00|
|Listed in Date Ni...|2016-01-02 00:12:00|
|1 check-in Listed...|2016-01-30 00:11:00|
|Very cool vibe! G...|2016-01-25 00:11:00|
|1 check-in They a...|2016-01-03 00:12:00|
|1 check-in Very c...|2016-01-20 00:11:00|
|"2 check-ins List...|2016-01-27 00:10:00|
|"2 check-ins Love...|2016-01-02 00:11:00|
|"1 check-in Ok le...|2016-01-25 00:10:00|
|3 check-ins This ...|2016-01-10 00:11:00|
+--------------------+-------------------+
only showing top 10 rows



# What date had the highest number of reviews?

In [5]:
# Order review and date with higesest reviews
date_df = df.groupBy('date').agg({"date": "count"})
date_df.show()

+-------------------+-----------+
|               date|count(date)|
+-------------------+-----------+
|2015-01-07 00:01:00|         11|
|2014-01-05 00:07:00|          1|
|2014-01-17 00:10:00|          2|
|2016-01-24 00:10:00|         11|
|2016-01-27 00:01:00|          7|
|2015-01-20 00:12:00|         10|
|2013-01-08 00:05:00|          1|
|2008-01-11 00:01:00|          1|
|2012-01-12 00:07:00|          1|
|2016-01-05 00:06:00|         19|
|2011-01-22 00:11:00|          1|
|2015-01-21 00:04:00|          6|
|2015-01-09 00:10:00|          8|
|2012-01-17 00:11:00|          1|
|2011-01-01 00:02:00|          2|
|2014-01-17 00:02:00|          2|
|2012-01-30 00:07:00|          1|
|2015-01-17 00:07:00|          3|
|2013-01-10 00:05:00|          1|
|2016-01-01 00:09:00|          7|
+-------------------+-----------+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import desc
date_df.orderBy(desc("count(date)")).show()

+-------------------+-----------+
|               date|count(date)|
+-------------------+-----------+
|2016-01-09 00:10:00|         31|
|2016-01-18 00:09:00|         30|
|2016-01-20 00:11:00|         27|
|2016-01-02 00:11:00|         27|
|2016-01-02 00:12:00|         26|
|2016-01-04 00:12:00|         26|
|2016-01-15 00:09:00|         25|
|2016-01-07 00:10:00|         24|
|2016-01-06 00:11:00|         24|
|2016-01-24 00:07:00|         24|
|2016-01-03 00:12:00|         23|
|2016-01-25 00:10:00|         23|
|2016-01-01 00:12:00|         23|
|2016-01-17 00:04:00|         23|
|2016-01-07 00:08:00|         22|
|2016-01-27 00:06:00|         22|
|2016-01-21 00:11:00|         21|
|2016-01-04 00:01:00|         21|
|2016-01-17 00:01:00|         21|
|2016-01-30 00:07:00|         20|
+-------------------+-----------+
only showing top 20 rows

