# Aggregating DataFrames in PySpark HW

First let's start up our PySpark instance

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'aggregating'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
# start the timer
stt = time.perf_counter()

## Read in the dataFrame for this Notebook

In [4]:
# load the data
fil = '../../data/nyc_air_bnb.csv'
schem = StructType([StructField('id', IntegerType()), StructField('name', StringType()),
                    StructField('host_id', IntegerType()), StructField('host_name', StringType()),
                    StructField('neighbourhood_group', StringType()), StructField('neighbourhood', StringType()),
                    StructField('latitude', FloatType()), StructField('longitude', FloatType()),
                    StructField('room_type', StringType()), StructField('price', FloatType()),
                    StructField('minimum_nights', IntegerType()), StructField('number_of_reviews', IntegerType()),
                    StructField('last_review', DateType()), StructField('reviews_per_month', FloatType()),
                    StructField('calculated_host_listings_count', IntegerType()), StructField('availability_365', IntegerType())])
airbnb = spark.read.format('csv').options(header=True, dateFormat='yyyy-MM-dd').schema(schem).load(fil).repartition('neighbourhood_group')

In [5]:
display(airbnb.limit(10).toPandas())

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,,18996093,,Queens,Douglaston,40.75754,-73.730682,,45,1.0,12,,,5.0,226,
1,12937.0,"1 Stop fr. Manhattan! Private Suite,Landmark B...",50124.0,Orestes,Queens,Long Island City,40.747711,-73.947403,Private room,130.0,3,248.0,2019-07-01,2.25,1,215.0
2,18198.0,Little King of Queens,70091.0,Justin,Queens,Woodside,40.750381,-73.903343,Private room,70.0,30,25.0,2019-05-31,0.22,1,324.0
3,32363.0,Fully Furnished Basement Apartment,140025.0,Fredah,Queens,Flushing,40.74028,-73.83168,Private room,140.0,2,1.0,2011-09-19,0.01,1,1.0
4,39593.0,"A room w/ a Manhattan view, longer stay",110506.0,Myung,Queens,Sunnyside,40.74559,-73.923126,Private room,79.0,30,28.0,2019-04-12,0.26,1,126.0
5,45910.0,Beautiful Queens Brownstone! - 5BR,204539.0,Mark,Queens,Ridgewood,40.703819,-73.897972,Entire home/apt,350.0,8,10.0,2019-05-12,0.11,5,365.0
6,51850.0,( F) Excellent/Pvt Rm,27848.0,Jullett,Queens,Jamaica,40.67252,-73.765968,Private room,55.0,2,52.0,2019-05-20,0.49,2,365.0
7,53469.0,cozy studio with parking spot,204539.0,Mark,Queens,Middle Village,40.71722,-73.878563,Entire home/apt,98.0,30,33.0,2015-05-09,0.31,5,240.0
8,53470.0,Clean and convenient 2BR apartment,204539.0,Mark,Queens,Ridgewood,40.702339,-73.898163,Private room,140.0,7,6.0,2015-10-08,0.06,5,365.0
9,53477.0,3 floors of luxury!,204539.0,Mark,Queens,Middle Village,40.715462,-73.87854,Entire home/apt,265.0,7,38.0,2019-04-27,0.38,5,365.0


## About this dataset

This dataset describes the listing activity and metrics for Air BNB bookers in NYC, NY for 2019. Each line in the dataset is a booking. 

**Source:** https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data

Let's go ahead and view the first few records of the dataset so we know what we are working with.

In [6]:
# did that above already

Now print the schema so we can make sure all the variables have the correct types

In [7]:
# no need as I set the schema

Notice here that some of the columns that are obviously numeric have been incorrectly identified as "strings". Let's edit that. Otherwise we cannot aggregate any of the numeric columns.

In [8]:
# again, no need!

### Alright now we are ready to dig in!


### 1. How many rows are in this dataset?

In [9]:
airbnb.count()

49079

### 2. How many total reviews does each host have?

In [10]:
airbnb.select('host_id', 'number_of_reviews').groupBy('host_id').agg(sum('number_of_reviews').alias('ReviewCount'))\
    .orderBy(col('ReviewCount').desc()).show()

+---------+-----------+
|  host_id|ReviewCount|
+---------+-----------+
| 37312959|       2273|
|   344035|       2205|
| 26432133|       2017|
| 35524316|       1971|
| 40176101|       1818|
|  4734398|       1798|
| 16677326|       1355|
|  6885157|       1346|
|219517861|       1281|
| 23591164|       1269|
| 59529529|       1229|
| 47621202|       1205|
| 22959695|       1157|
| 58391491|       1154|
| 21641206|       1062|
|   137814|       1059|
|156948703|       1052|
|156684502|       1046|
|  3441272|       1013|
|  7831209|        970|
+---------+-----------+
only showing top 20 rows



### 3. Show the min and max of all the numeric variables in the dataset

In [11]:
# get numeric cols
cols = [c.name for c in airbnb.schema if (c.dataType is IntegerType()) or (c.dataType is FloatType())]

summ = airbnb.select(cols).summary('min', 'max')
display(summ.toPandas())

Unnamed: 0,summary,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,min,2539,2438,-74.16254,-74.24442,-74.00828,0,0,0.0,0,0
1,max,36487245,274321313,40.91306,24906404.0,10000.0,1250,629,58.5,365,365


### 4. Which host had the highest number of reviews?

Only display the top result.

Bonus: format the column names

In [12]:
# can already answer this because I sorted the answer for #2 descending by count

### 5. On average, how many nights did most hosts specify for a minimum?

In [13]:
mnmn = airbnb.select('host_id', 'minimum_nights').groupBy('host_id').agg(mean('minimum_nights').alias('mean_min_night'))
mnmn.show()

+---------+--------------+
|  host_id|mean_min_night|
+---------+--------------+
|  1384111|          16.0|
| 22171095|           2.0|
| 11528105|          14.0|
|  7174662|           2.0|
| 13203690|           2.0|
| 84141567|           2.0|
| 96424121|           1.0|
| 15864671|           4.0|
| 70774951|           1.0|
|  2108760|           1.0|
|154848269|           3.0|
|  9956828|           3.0|
| 95014144|           1.0|
| 37286422|           2.0|
|202766239|           1.0|
| 48167507|           2.0|
|129875945|           2.0|
|238685151|           1.0|
|238733096|           2.0|
|257474717|           2.0|
+---------+--------------+
only showing top 20 rows



In [14]:
mnmn.select('mean_min_night').groupBy('mean_min_night').count().orderBy(col('count').desc()).show()

+--------------+-----+
|mean_min_night|count|
+--------------+-----+
|           1.0| 8988|
|           2.0| 8962|
|           3.0| 6399|
|           4.0| 2778|
|           5.0| 2507|
|           7.0| 1665|
|          30.0| 1194|
|           6.0|  636|
|          14.0|  446|
|          10.0|  403|
|           1.5|  285|
|           2.5|  258|
|          15.0|  197|
|          20.0|  164|
|           3.5|  133|
|           8.0|  125|
|          28.0|  125|
|          31.0|  114|
|          21.0|  105|
|          60.0|   84|
+--------------+-----+
only showing top 20 rows



### 6. What is the most expensive neighborhood to stay in on average?

Note: only show the one result

In [15]:
airbnb.select('neighbourhood_group', 'neighbourhood', 'Price').groupBy('neighbourhood_group', 'neighbourhood')\
    .agg(mean('Price').alias('MeanPrice')).orderBy(col('MeanPrice').desc()).show(1, truncate=False)

+-------------------+--------------+---------+
|neighbourhood_group|neighbourhood |MeanPrice|
+-------------------+--------------+---------+
|Staten Island      |Fort Wadsworth|800.0    |
+-------------------+--------------+---------+
only showing top 1 row



### 7. Display a two by two table that shows the average prices by room type (private and shared only) and neighborhood group (Manhattan and Brooklyn only)

In [16]:
pvt = airbnb.select('neighbourhood_group', 'room_type', 'price')\
    .where(col('neighbourhood_group').isin('Manhattan', 'Brooklyn') & col('room_type').isin('Private room', 'Shared room'))\
    .groupBy('neighbourhood_group').pivot('room_type').agg(mean('price').alias('MeanPrice'))
display(pvt.toPandas())

Unnamed: 0,neighbourhood_group,Private room,Shared room
0,Brooklyn,76.47234,50.527845
1,Manhattan,116.054003,89.069038


### Alright that's all folks!

### Great job!

In [17]:
# stop the timer
stp = time.perf_counter()
print('Elapsed time = %0.3f(m)'%((stp-stt)/60))

Elapsed time = 1.033(m)


### Timing without repartitioning on neighbourhood group = 1.055(m)
### Timing with repartitioning on neighbourhood gruop = 1.516(m)

In [18]:
sc.stop()