In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Reuse the active Spark session if there is one, otherwise start a new one.
# NOTE: `sc` is a SparkSession here (not a SparkContext); later cells call `sc.read`.
sc = SparkSession.builder.getOrCreate()
data_path = '/data/train.csv'

# Column layout of the CSV as (name, Spark type) pairs, listed in file order.
# NOTE(review): date_time is declared as a long — confirm the file stores an
# integer there rather than a formatted timestamp string.
column_types = [
    ("date_time", LongType()),
    ("site_name", LongType()),
    ("posa_continent", LongType()),
    ("user_location_country", LongType()),
    ("user_location_region", LongType()),
    ("user_location_city", LongType()),
    ("orig_destination_distance", DoubleType()),
    ("user_id", LongType()),
    ("is_mobile", ByteType()),
    ("is_package", ByteType()),
    ("channel", LongType()),
    ("srch_ci", StringType()),
    ("srch_co", StringType()),
    ("srch_adults_cnt", ByteType()),
    ("srch_children_cnt", ByteType()),
    ("srch_rm_cnt", ShortType()),
    ("srch_destination_id", LongType()),
    ("srch_destination_type_id", LongType()),
    ("is_booking", ByteType()),
    ("cnt", LongType()),
    ("hotel_continent", ByteType()),
    ("hotel_country", LongType()),
    ("hotel_market", LongType()),
    ("hotel_cluster", LongType()),
    ("__index_level_0__", LongType()),
]

# Explicit schema passed to the CSV reader; every column is declared nullable.
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in column_types]
)

In [4]:
# Load the CSV with the explicit schema defined earlier (no schema inference).
# NOTE(review): header is set to "false" — confirm the file really has no
# header line; otherwise the first line would be read as a data row.
data = (
    sc.read.schema(schema)
    .option("header", "false")
    .option("delimiter", ",")
    .csv(data_path)
)

# Count searches made for exactly two adults per
# (hotel_continent, hotel_country, hotel_market) combination, most frequent first.
# srch_adults_cnt is a ByteType column, so it is compared against the integer 2
# instead of the string '2' to avoid relying on implicit string->numeric coercion.
result = (
    data.filter(data['srch_adults_cnt'] == 2)
    .groupBy('hotel_continent', 'hotel_country', 'hotel_market')
    .count()
    .orderBy('count', ascending=False)
)

# Display the three most frequent combinations.
result.show(3)

+---------------+-------------+------------+-------+
|hotel_continent|hotel_country|hotel_market|  count|
+---------------+-------------+------------+-------+
|              2|           50|         628|1190143|
|              2|           50|         675|1007502|
|              4|            8|         110| 588213|
+---------------+-------------+------------+-------+
only showing top 3 rows

