In [None]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 53 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 81.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845514 sha256=bfae989ee1a43ce45ae6c60df26367c79c8cf076074bfb63942c56d1ff6a606d
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
from pyspark.sql.types import StringType, StructField, IntegerType, StructType
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, min, max, avg, col, lit, when, count
import io
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank, dense_rank, percent_rank, ntile, cume_dist, lag, lead, sequence, current_date, date_format,to_date
from google.colab import files
uploaded = files.upload()

Saving Airbnb NYC 2019.csv to Airbnb NYC 2019 (1).csv


**Create Spark Session**

In [None]:
spark = SparkSession.builder \
        .master("local[1]") \
        .appName("airbnb_nyc") \
        .getOrCreate()

In [None]:
airbnb_df = spark.read.csv('Airbnb NYC 2019.csv', header = True, inferSchema=True)
airbnb_df.show()
airbnb_df.printSchema()

+----+--------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|  id|                name|host_id|       host_name|neighbourhood_group|     neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|
+----+--------------------+-------+----------------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|2539|Clean & quiet apt...|   2787|            John|           Brooklyn|        Kensington|40.64749|-73.97237|   Private room|  149|             1|                9| 2018-10-19|             0.21|                             6|             365|
|2595|Skylit Midtown Ca.

**Total count of neighbourhood_group**

In [None]:
print('Total count of neighbourhood_group: '+str(airbnb_df.dropDuplicates(['neighbourhood']).count()))

Total count of neighbourhood_group: 383


In [None]:
neighbourhood_group_df = airbnb_df.groupBy('neighbourhood_group').count()
neighbourhood_group_df.orderBy(neighbourhood_group_df['count'].desc()).show()

+-------------------+-----+
|neighbourhood_group|count|
+-------------------+-----+
|          Manhattan|21594|
|           Brooklyn|20055|
|             Queens| 5630|
|              Bronx| 1080|
|      Staten Island|  370|
|               null|  185|
|             Harlem|   13|
| Bedford-Stuyvesant|    9|
|     Hell's Kitchen|    7|
|    Upper East Side|    7|
|           Elmhurst|    7|
|       Williamsburg|    6|
|        East Harlem|    5|
|         Mott Haven|    5|
|       East Village|    4|
|            Midtown|    4|
|           Bushwick|    4|
|           Canarsie|    4|
| Washington Heights|    4|
|            Midwood|    3|
+-------------------+-----+
only showing top 20 rows



**rank_number windows function**

In [None]:
neighbourhood_group_partition = Window.partitionBy('neighbourhood_group').orderBy('price')
airbnb_neighbourhood_group_partition_df = airbnb_df.withColumn('row_number', row_number().over(neighbourhood_group_partition))
airbnb_neighbourhood_group_partition_df.select('id', 'name', 'neighbourhood_group', 'price', 'row_number').show()

+--------------------+--------------------+-------------------+-----+----------+
|                  id|                name|neighbourhood_group|price|row_number|
+--------------------+--------------------+-------------------+-----+----------+
|              255476|       The BLUE OWL:|               null| null|         1|
|              267708|Charming Hotel Al...|               null| null|         2|
|              405408|Magazine SOHO Stu...|               null| null|         3|
|             1096326|Private studio in...|               null| null|         4|
|,6024006,Johnny,S...|                null|               null| null|         5|
|             3465651|“No Place Like Home”|               null| null|         6|
|             3757461|Manhattan Sun Dre...|               null| null|         7|
|             3768977|2 bedroom Duplex ...|               null| null|         8|
|             4756578|Clean | Green | S...|               null| null|         9|
|             4787527|    Ho

**rank windows function**

In [None]:
airbnb_rank_df = airbnb_df.withColumn('rank', rank().over(neighbourhood_group_partition))
airbnb_rank_df.select('id', 'name', 'neighbourhood_group', 'price', 'rank').show()

+--------------------+--------------------+-------------------+-----+----+
|                  id|                name|neighbourhood_group|price|rank|
+--------------------+--------------------+-------------------+-----+----+
|              255476|       The BLUE OWL:|               null| null|   1|
|              267708|Charming Hotel Al...|               null| null|   1|
|              405408|Magazine SOHO Stu...|               null| null|   1|
|             1096326|Private studio in...|               null| null|   1|
|,6024006,Johnny,S...|                null|               null| null|   1|
|             3465651|“No Place Like Home”|               null| null|   1|
|             3757461|Manhattan Sun Dre...|               null| null|   1|
|             3768977|2 bedroom Duplex ...|               null| null|   1|
|             4756578|Clean | Green | S...|               null| null|   1|
|             4787527|    Home Sweet Home!|               null| null|   1|
|             5286482|New

**dense_rank windows function**

In [None]:
airbnb_dense_rank_df = airbnb_df.withColumn("dense_rank",dense_rank().over(neighbourhood_group_partition))
airbnb_dense_rank_df.select('id', 'name', 'neighbourhood_group', 'price', 'dense_rank').show()

+--------------------+--------------------+-------------------+-----+----------+
|                  id|                name|neighbourhood_group|price|dense_rank|
+--------------------+--------------------+-------------------+-----+----------+
|              255476|       The BLUE OWL:|               null| null|         1|
|              267708|Charming Hotel Al...|               null| null|         1|
|              405408|Magazine SOHO Stu...|               null| null|         1|
|             1096326|Private studio in...|               null| null|         1|
|,6024006,Johnny,S...|                null|               null| null|         1|
|             3465651|“No Place Like Home”|               null| null|         1|
|             3757461|Manhattan Sun Dre...|               null| null|         1|
|             3768977|2 bedroom Duplex ...|               null| null|         1|
|             4756578|Clean | Green | S...|               null| null|         1|
|             4787527|    Ho

**percent_rank window function**

In [None]:
airbnb_percent_rank_df = airbnb_df.withColumn('percent_rank', percent_rank().over(neighbourhood_group_partition))
airbnb_percent_rank_df.select('id', 'name', 'neighbourhood_group', 'price', 'percent_rank').show()

+--------------------+--------------------+-------------------+-----+------------+
|                  id|                name|neighbourhood_group|price|percent_rank|
+--------------------+--------------------+-------------------+-----+------------+
|              255476|       The BLUE OWL:|               null| null|         0.0|
|              267708|Charming Hotel Al...|               null| null|         0.0|
|              405408|Magazine SOHO Stu...|               null| null|         0.0|
|             1096326|Private studio in...|               null| null|         0.0|
|,6024006,Johnny,S...|                null|               null| null|         0.0|
|             3465651|“No Place Like Home”|               null| null|         0.0|
|             3757461|Manhattan Sun Dre...|               null| null|         0.0|
|             3768977|2 bedroom Duplex ...|               null| null|         0.0|
|             4756578|Clean | Green | S...|               null| null|         0.0|
|   

**ntile window function**

In [None]:
airbnb_ntile_df = airbnb_df.withColumn('ntile', ntile(3).over(neighbourhood_group_partition))
airbnb_ntile_df.select('id', 'name', 'neighbourhood_group', 'price', 'ntile').show()

+--------------------+--------------------+-------------------+-----+-----+
|                  id|                name|neighbourhood_group|price|ntile|
+--------------------+--------------------+-------------------+-----+-----+
|              255476|       The BLUE OWL:|               null| null|    1|
|              267708|Charming Hotel Al...|               null| null|    1|
|              405408|Magazine SOHO Stu...|               null| null|    1|
|             1096326|Private studio in...|               null| null|    1|
|,6024006,Johnny,S...|                null|               null| null|    1|
|             3465651|“No Place Like Home”|               null| null|    1|
|             3757461|Manhattan Sun Dre...|               null| null|    1|
|             3768977|2 bedroom Duplex ...|               null| null|    1|
|             4756578|Clean | Green | S...|               null| null|    1|
|             4787527|    Home Sweet Home!|               null| null|    1|
|           

**cume_dist window function**

In [None]:
airbnb_cume_dist_df = airbnb_df.withColumn('cume_dist', cume_dist().over(neighbourhood_group_partition))
airbnb_cume_dist_df.select('id', 'name', 'neighbourhood_group', 'price', 'cume_dist').show()

+--------------------+--------------------+-------------------+-----+---------+
|                  id|                name|neighbourhood_group|price|cume_dist|
+--------------------+--------------------+-------------------+-----+---------+
|              255476|       The BLUE OWL:|               null| null|      1.0|
|              267708|Charming Hotel Al...|               null| null|      1.0|
|              405408|Magazine SOHO Stu...|               null| null|      1.0|
|             1096326|Private studio in...|               null| null|      1.0|
|,6024006,Johnny,S...|                null|               null| null|      1.0|
|             3465651|“No Place Like Home”|               null| null|      1.0|
|             3757461|Manhattan Sun Dre...|               null| null|      1.0|
|             3768977|2 bedroom Duplex ...|               null| null|      1.0|
|             4756578|Clean | Green | S...|               null| null|      1.0|
|             4787527|    Home Sweet Hom

**lag window function**

In [None]:
airbnb_lag_df = airbnb_df.withColumn('lag', lag('price', 2).over(neighbourhood_group_partition))
airbnb_lag_df.select('id', 'name', 'neighbourhood_group', 'price', 'lag').show()

+--------------------+--------------------+-------------------+-----+----+
|                  id|                name|neighbourhood_group|price| lag|
+--------------------+--------------------+-------------------+-----+----+
|              255476|       The BLUE OWL:|               null| null|null|
|              267708|Charming Hotel Al...|               null| null|null|
|              405408|Magazine SOHO Stu...|               null| null|null|
|             1096326|Private studio in...|               null| null|null|
|,6024006,Johnny,S...|                null|               null| null|null|
|             3465651|“No Place Like Home”|               null| null|null|
|             3757461|Manhattan Sun Dre...|               null| null|null|
|             3768977|2 bedroom Duplex ...|               null| null|null|
|             4756578|Clean | Green | S...|               null| null|null|
|             4787527|    Home Sweet Home!|               null| null|null|
|             5286482|New

**lead window function**

In [None]:
airbnb_lead_df = airbnb_df.withColumn('lead', lead('price', 2).over(neighbourhood_group_partition))
airbnb_lead_df.select('id', 'name', 'neighbourhood_group', 'price', 'lead').show()

+--------------------+--------------------+-------------------+-----+----+
|                  id|                name|neighbourhood_group|price|lead|
+--------------------+--------------------+-------------------+-----+----+
|              255476|       The BLUE OWL:|               null| null|null|
|              267708|Charming Hotel Al...|               null| null|null|
|              405408|Magazine SOHO Stu...|               null| null|null|
|             1096326|Private studio in...|               null| null|null|
|,6024006,Johnny,S...|                null|               null| null|null|
|             3465651|“No Place Like Home”|               null| null|null|
|             3757461|Manhattan Sun Dre...|               null| null|null|
|             3768977|2 bedroom Duplex ...|               null| null|null|
|             4756578|Clean | Green | S...|               null| null|null|
|             4787527|    Home Sweet Home!|               null| null|null|
|             5286482|New

**window aggregate function**

In [None]:
airbnb_aggregate = Window.partitionBy('neighbourhood_group')

airbnb_aggregate_df = airbnb_df.withColumn('row', row_number().over(neighbourhood_group_partition)) \
                               .withColumn('avg_price', avg(col('price')).over(airbnb_aggregate)) \
                               .withColumn('total_price', sum(col('price')).over(airbnb_aggregate)) \
                               .withColumn('min_price', min(col('price')).over(airbnb_aggregate)) \
                               .withColumn('max_price', max(col('price')).over(airbnb_aggregate))
              
airbnb_aggregate_df.where(col('row')==1).select('neighbourhood_group', 'avg_price', 'total_price', 'min_price', 'max_price').show()

+-------------------+------------------+-----------+---------------+---------------+
|neighbourhood_group|         avg_price|total_price|      min_price|      max_price|
+-------------------+------------------+-----------+---------------+---------------+
|               null|              null|       null|           null|           null|
|          194716858|         -74.00828|  -74.00828|      -74.00828|      -74.00828|
|          197400421|         -73.99986|  -73.99986|      -73.99986|      -73.99986|
|            Arverne|               2.0|        2.0|              2|              2|
|            Astoria|              12.0|       24.0|             21|              3|
|         Bath Beach|               2.0|        2.0|              2|              2|
| Bedford-Stuyvesant|1.6666666666666667|       15.0|              1|              3|
|       Borough Park|               2.0|        2.0|              2|              2|
|          Briarwood|               1.0|        1.0|             

In [None]:
airbnb_df = airbnb_df.withColumn("price",airbnb_df.price.cast('float'))

In [None]:
airbnb_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: float (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



In [None]:
airbnb_df.groupby('neighbourhood_group').avg('price').show()

+-------------------+------------------+
|neighbourhood_group|        avg(price)|
+-------------------+------------------+
|         Douglaston|               1.0|
|             Queens| 99.57690941385435|
|              Nadia|              null|
|            Midtown|               9.0|
|    Jackson Heights|              16.0|
|     Hell's Kitchen|1.2857142857142858|
|  Greenwich Village|              55.5|
|       Clinton Hill|              14.0|
| Washington Heights|              2.75|
|   Ditmars Steinway|3.3333333333333335|
|           Longwood|               5.0|
|          Briarwood|               1.0|
|        Little Neck|               1.0|
|           Flushing|10.333333333333334|
|      Randall Manor|               7.0|
|             Carmen|              null|
|      East Elmhurst|               1.0|
|    Upper East Side|1.5714285714285714|
|               null|              null|
|         Bath Beach|               2.0|
+-------------------+------------------+
only showing top

In [None]:
airbnb_df.count()

49079

In [None]:
airbnb_df.select('room_type').distinct().collect()

[Row(room_type='51'),
 Row(room_type='205'),
 Row(room_type='54'),
 Row(room_type='200'),
 Row(room_type='279'),
 Row(room_type='138'),
 Row(room_type='69'),
 Row(room_type='42'),
 Row(room_type='Shared room'),
 Row(room_type='-73.95777'),
 Row(room_type='425'),
 Row(room_type='59'),
 Row(room_type='250'),
 Row(room_type='160'),
 Row(room_type='40.57453'),
 Row(room_type='85'),
 Row(room_type='35'),
 Row(room_type='-73.95179'),
 Row(room_type='410'),
 Row(room_type='298'),
 Row(room_type='188'),
 Row(room_type='99'),
 Row(room_type='110'),
 Row(room_type=None),
 Row(room_type='179'),
 Row(room_type='300'),
 Row(room_type='31'),
 Row(room_type='100'),
 Row(room_type='70'),
 Row(room_type='6500'),
 Row(room_type='75'),
 Row(room_type='140'),
 Row(room_type='120'),
 Row(room_type='-73.94639'),
 Row(room_type='46'),
 Row(room_type='130'),
 Row(room_type='78'),
 Row(room_type='89'),
 Row(room_type='-73.94861'),
 Row(room_type='185'),
 Row(room_type='230'),
 Row(room_type='60'),
 Row(room_ty