In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'aggregating'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
# load the data
fil = '../data/nyc_air_bnb.csv'
schem = StructType([StructField('id', IntegerType()), StructField('name', StringType()),
                    StructField('host_id', IntegerType()), StructField('host_name', StringType()),
                    StructField('neighbourhood_group', StringType()), StructField('neighbourhood', StringType()),
                    StructField('latitude', FloatType()), StructField('longitude', FloatType()),
                    StructField('room_type', StringType()), StructField('price', FloatType()),
                    StructField('minimum_nights', IntegerType()), StructField('number_of_reviews', IntegerType()),
                    StructField('last_review', DateType()), StructField('reviews_per_month', FloatType()),
                    StructField('calculated_host_listings_count', IntegerType()), StructField('availability_365', IntegerType())])
airbnb = spark.read.format('csv').options(header=True, dateFormat='yyyy-MM-dd').schema(schem).load(fil)

In [4]:
display(airbnb.limit(10).toPandas())

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.647491,-73.972366,Private room,149.0,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.983772,Entire home/apt,225.0,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.809021,-73.941902,Private room,150.0,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.685139,-73.959763,Entire home/apt,89.0,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.798512,-73.943993,Entire home/apt,80.0,10,9,2018-11-19,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.747669,-73.974998,Entire home/apt,200.0,3,74,2019-06-22,0.59,1,129
6,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.686878,-73.955963,Private room,60.0,45,49,2017-10-05,0.4,1,0
7,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.764889,-73.984932,Private room,79.0,2,430,2019-06-24,3.47,1,220
8,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.801781,-73.967232,Private room,79.0,2,118,2017-07-21,0.99,1,0
9,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.990372,Entire home/apt,150.0,1,160,2019-06-09,1.33,4,188


In [6]:
airbnb.select('neighbourhood_group').groupBy('neighbourhood_group').count().orderBy(col('count').desc()).show()

+-------------------+-----+
|neighbourhood_group|count|
+-------------------+-----+
|          Manhattan|21594|
|           Brooklyn|20055|
|             Queens| 5630|
|              Bronx| 1080|
|      Staten Island|  370|
|               null|  185|
|             Harlem|   13|
| Bedford-Stuyvesant|    9|
|           Elmhurst|    7|
|     Hell's Kitchen|    7|
|    Upper East Side|    7|
|       Williamsburg|    6|
|        East Harlem|    5|
|         Mott Haven|    5|
|       East Village|    4|
|           Canarsie|    4|
|           Bushwick|    4|
|            Midtown|    4|
| Washington Heights|    4|
|   Ditmars Steinway|    3|
+-------------------+-----+
only showing top 20 rows



In [20]:
airbnb.select('neighbourhood_group', 'price', 'reviews_per_month').groupBy('neighbourhood_group')\
    .agg(count('neighbourhood_group').alias('Count_Listings'), min('Price').alias('Min_Price'),
         percentile_approx('Price', 0.5).alias('Median_Price'),
         max('Price').alias('Max_Price'), mean('reviews_per_month').alias('Mean_Reviews'))\
    .orderBy(col('Count_Listings').desc()).show()

+-------------------+--------------+---------+------------+---------+------------------+
|neighbourhood_group|Count_Listings|Min_Price|Median_Price|Max_Price|      Mean_Reviews|
+-------------------+--------------+---------+------------+---------+------------------+
|          Manhattan|         21594|      0.0|       150.0|  10000.0| 1.268360981895352|
|           Brooklyn|         20055|      0.0|        90.0|  10000.0|1.2823536937980953|
|             Queens|          5630|     10.0|        75.0|  10000.0|1.9332430648700696|
|              Bronx|          1080|      0.0|        65.0|   2500.0| 1.830277460184745|
|      Staten Island|           370|     13.0|        75.0|   5000.0|1.8763461493504927|
|             Harlem|            13|      1.0|         2.0|      7.0|               4.0|
| Bedford-Stuyvesant|             9|      1.0|         1.0|      3.0|1.6666666666666667|
|    Upper East Side|             7|      1.0|         1.0|      3.0|               1.0|
|           Elmhurst|

In [31]:
# create a pysaprk UDF for median - but doesn't work with summary, it seems
def median(acol):
    return percentile_approx(acol, 0.5)
medianUDF = udf(lambda x: median(x), FloatType())

# get numeric cols
cols = [c.name for c in airbnb.schema if (c.dataType is IntegerType()) or (c.dataType is FloatType())]

summ = airbnb.select(cols).summary('min', '25%', '50%', 'mean', '75%', 'max')
display(summ.toPandas())

Unnamed: 0,summary,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,min,2539.0,2438.0,-74.16254,-74.24442,-74.00828,0.0,0.0,0.0,0.0,0.0
1,25%,9470554.0,7795299.0,40.68983,-73.98311,69.0,1.0,1.0,0.19,1.0,0.0
2,50%,19675738.0,30672354.0,40.7228,-73.95572,105.0,3.0,5.0,0.72,1.0,44.0
3,mean,19017143.236179568,67495915.89946438,40.36302551530165,437.1107574129136,152.2229629934223,7.12861262809106,23.25793544945319,1.3743823663657475,7.655552600895944,112.59808769518024
4,75%,29149618.0,107434423.0,40.76299,-73.93639,175.0,5.0,23.0,2.01,2.0,226.0
5,max,36487245.0,274321313.0,40.91306,24906404.0,10000.0,1250.0,629.0,58.5,365.0,365.0


In [41]:
airbnb.select('neighbourhood_group', 'room_type').where(col('room_type').isin('Entire home/apt', 'Private room', 'Shared room'))\
    .groupBy('neighbourhood_group', 'room_type').count().show()

+-------------------+---------------+-----+
|neighbourhood_group|      room_type|count|
+-------------------+---------------+-----+
|      Staten Island|   Private room|  186|
|           Brooklyn|   Private room|10105|
|              Bronx|Entire home/apt|  376|
|             Queens|    Shared room|  198|
|      Staten Island|    Shared room|    9|
|          Manhattan|Entire home/apt|13172|
|          Manhattan|   Private room| 7944|
|          Manhattan|    Shared room|  478|
|              Bronx|    Shared room|   60|
|             Queens|   Private room| 3347|
|             Queens|Entire home/apt| 2085|
|              Bronx|   Private room|  644|
|      Staten Island|Entire home/apt|  175|
|           Brooklyn|Entire home/apt| 9537|
|           Brooklyn|    Shared room|  413|
+-------------------+---------------+-----+



In [40]:
airbnb.select('room_type').groupBy('room_type').count().orderBy(col('count').desc()).show()

+---------------+-----+
|      room_type|count|
+---------------+-----+
|Entire home/apt|25345|
|   Private room|22226|
|    Shared room| 1158|
|           null|  185|
|             65|    8|
|             69|    7|
|             55|    6|
|            145|    6|
|             59|    5|
|            350|    5|
|             50|    5|
|            150|    5|
|            120|    5|
|             89|    5|
|             75|    5|
|             45|    5|
|             99|    4|
|            100|    3|
|             95|    3|
|            250|    3|
+---------------+-----+
only showing top 20 rows



In [38]:
# groupby = columns, pivot = rows
pvt = airbnb.select('neighbourhood_group', 'room_type').where(col('room_type').isin('Entire home/apt', 'Private room', 'Shared room'))\
    .groupBy('neighbourhood_group').pivot('room_type').count()
display(pvt.toPandas())

Unnamed: 0,neighbourhood_group,Entire home/apt,Private room,Shared room
0,Queens,2085,3347,198
1,Brooklyn,9537,10105,413
2,Staten Island,175,186,9
3,Manhattan,13172,7944,478
4,Bronx,376,644,60


In [42]:
pvt = airbnb.select('neighbourhood_group', 'room_type', 'Price').where(col('room_type').isin('Entire home/apt', 'Private room', 'Shared room'))\
    .groupBy('neighbourhood_group').pivot('room_type').agg(min('Price').alias('Min_Price'), max('Price').alias('Max_Price'))
display(pvt.toPandas())

Unnamed: 0,neighbourhood_group,Entire home/apt_Min_Price,Entire home/apt_Max_Price,Private room_Min_Price,Private room_Max_Price,Shared room_Min_Price,Shared room_Max_Price
0,Queens,10.0,2600.0,10.0,10000.0,11.0,1800.0
1,Brooklyn,0.0,10000.0,0.0,7500.0,0.0,725.0
2,Staten Island,48.0,5000.0,20.0,300.0,13.0,150.0
3,Manhattan,0.0,10000.0,10.0,9999.0,10.0,1000.0
4,Bronx,28.0,1000.0,0.0,2500.0,20.0,800.0


In [43]:
sc.stop()