# 0 Imports

## 0.1. Libraries

In [94]:
# data manipulation
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, count, when
from pyspark.sql.types import StructField, StructType, StringType, FloatType

# other
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

## 0.2. Helper functions

## 0.3. Data acquisition
Data source: [Kaggle](https://www.kaggle.com/competitions/airbnb-recruiting-new-user-bookings)

In [3]:
# instantiate spark session object
spark = SparkSession.builder \
                    .master('local[*]') \
                    .config("spark.submit.deployMode", 'client') \
                    .config('spark.executor.instances', "16") \
                    .config('spark.driver.memory', '6g') \
                    .config('spark.executor.memory', "6g") \
                    .config('spark.executor.memoryOverhead', '6g') \
                    .appName('airbnb-first-booking-prediction') \
                    .getOrCreate()

22/05/18 16:23:22 WARN Utils: Your hostname, archlinux resolves to a loopback address: 127.0.1.1; using 192.168.0.12 instead (on interface enp3s0)
22/05/18 16:23:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/18 16:23:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/18 16:23:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [17]:
!ls -lh ../data/raw

total 634M
-rw-r--r-- 1 qwoek qwoek  12K Oct 16  2015 age_gender_bkts.csv
-rw-r--r-- 1 qwoek qwoek  632 Oct 16  2015 countries.csv
-rw-r--r-- 1 qwoek qwoek 910K Dec  6  2015 sample_submission_NDF.csv
-rw-r--r-- 1 qwoek qwoek 603M Dec  6  2015 sessions.csv
-rw-r--r-- 1 qwoek qwoek 6.5M May 16  2018 test_users.csv
-rw-r--r-- 1 qwoek qwoek  24M Dec 10  2015 train_users_2.csv


In [18]:
!head -n 10 ../data/raw/train_users_2.csv

id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
osr2jwljor,2010-01-01,20100101215619,2010-01-02,-unknown-,,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US
lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,

In [57]:
# define dataset schema
schema_df_train = '''
    id STRING,
    date_account_created TIMESTAMP,
    timestamp_first_active STRING,
    date_first_booking STRING,
    gender STRING,
    age INTEGER,
    signup_method STRING,
    signup_flow STRING,
    language STRING,
    affiliate_channel STRING,
    affiliate_provider STRING,
    first_affiliate_tracked STRING,
    signup_app STRING,
    first_device_type STRING,
    first_browser STRING,
    country_destination STRING
'''

# read dataset
df_train = spark.read.csv(path='../data/raw/train_users_2.csv', schema=schema_df_train, header=True)

# format date columns
df_train = df_train.withColumn('timestamp_first_active', to_timestamp('timestamp_first_active', 'yyyyMMddHHmmss'))
df_train = df_train.withColumn('date_first_booking', to_timestamp('date_first_booking', 'yyyy-MM-dd'))

# check it out
df_train.take(5)

[Row(id='gxn3p5htnn', date_account_created=datetime.datetime(2010, 6, 28, 0, 0), timestamp_first_active=datetime.datetime(2009, 3, 19, 4, 32, 55), date_first_booking=None, gender='-unknown-', age=None, signup_method='facebook', signup_flow='0', language='en', affiliate_channel='direct', affiliate_provider='direct', first_affiliate_tracked='untracked', signup_app='Web', first_device_type='Mac Desktop', first_browser='Chrome', country_destination='NDF'),
 Row(id='820tgsjxq7', date_account_created=datetime.datetime(2011, 5, 25, 0, 0), timestamp_first_active=datetime.datetime(2009, 5, 23, 17, 48, 9), date_first_booking=None, gender='MALE', age=None, signup_method='facebook', signup_flow='0', language='en', affiliate_channel='seo', affiliate_provider='google', first_affiliate_tracked='untracked', signup_app='Web', first_device_type='Mac Desktop', first_browser='Chrome', country_destination='NDF'),
 Row(id='4ft3gnwmtx', date_account_created=datetime.datetime(2010, 9, 28, 0, 0), timestamp_fir

In [59]:
!head -n 10 ../data/raw/sessions.csv

user_id,action,action_type,action_detail,device_type,secs_elapsed
d1mm9tcy42,lookup,,,Windows Desktop,319.0
d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
d1mm9tcy42,lookup,,,Windows Desktop,301.0
d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
d1mm9tcy42,lookup,,,Windows Desktop,435.0
d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703.0
d1mm9tcy42,lookup,,,Windows Desktop,115.0
d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831.0
d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842.0


In [63]:
# define dataset schema
schema_df_sessions = StructType([
  StructField('user_id', StringType(), True),
  StructField('action', StringType(), True),
  StructField('action_type', StringType(), True),
  StructField('action_detail', StringType(), True),
  StructField('device_type', StringType(), True),
  StructField('secs_elapsed', FloatType(), True)
])

# read dataset
df_sessions = spark.read.csv(path='../data/raw/sessions.csv', schema=schema_df_sessions, header=True)

# check it out
df_sessions.take(5)

[Row(user_id='d1mm9tcy42', action='lookup', action_type=None, action_detail=None, device_type='Windows Desktop', secs_elapsed=319.0),
 Row(user_id='d1mm9tcy42', action='search_results', action_type='click', action_detail='view_search_results', device_type='Windows Desktop', secs_elapsed=67753.0),
 Row(user_id='d1mm9tcy42', action='lookup', action_type=None, action_detail=None, device_type='Windows Desktop', secs_elapsed=301.0),
 Row(user_id='d1mm9tcy42', action='search_results', action_type='click', action_detail='view_search_results', device_type='Windows Desktop', secs_elapsed=22141.0),
 Row(user_id='d1mm9tcy42', action='lookup', action_type=None, action_detail=None, device_type='Windows Desktop', secs_elapsed=435.0)]

In [70]:
# join datasets
df = df_train.join(df_sessions, on=[df_train.id == df_sessions.user_id], how='inner')

# drop repetead column
df = df.drop('user_id')

# check it out
df.take(3)

                                                                                

[Row(id='00b9hfwaak', date_account_created=datetime.datetime(2014, 6, 5, 0, 0), timestamp_first_active=datetime.datetime(2014, 6, 5, 16, 38, 16), date_first_booking=datetime.datetime(2014, 6, 5, 0, 0), gender='-unknown-', age=None, signup_method='basic', signup_flow='0', language='zh', affiliate_channel='seo', affiliate_provider='google', first_affiliate_tracked='linked', signup_app='Web', first_device_type='Windows Desktop', first_browser='Chrome', country_destination='US', action='show', action_type=None, action_detail=None, device_type='Windows Desktop', secs_elapsed=64.0),
 Row(id='00b9hfwaak', date_account_created=datetime.datetime(2014, 6, 5, 0, 0), timestamp_first_active=datetime.datetime(2014, 6, 5, 16, 38, 16), date_first_booking=datetime.datetime(2014, 6, 5, 0, 0), gender='-unknown-', age=None, signup_method='basic', signup_flow='0', language='zh', affiliate_channel='seo', affiliate_provider='google', first_affiliate_tracked='linked', signup_app='Web', first_device_type='Wind

In [71]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date_account_created: timestamp (nullable = true)
 |-- timestamp_first_active: timestamp (nullable = true)
 |-- date_first_booking: timestamp (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- signup_method: string (nullable = true)
 |-- signup_flow: string (nullable = true)
 |-- language: string (nullable = true)
 |-- affiliate_channel: string (nullable = true)
 |-- affiliate_provider: string (nullable = true)
 |-- first_affiliate_tracked: string (nullable = true)
 |-- signup_app: string (nullable = true)
 |-- first_device_type: string (nullable = true)
 |-- first_browser: string (nullable = true)
 |-- country_destination: string (nullable = true)
 |-- action: string (nullable = true)
 |-- action_type: string (nullable = true)
 |-- action_detail: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- secs_elapsed: float (nullable = true)



In [72]:
# split dataset into 16 pieces to leverage spark's engine
df.repartition(16).write.parquet('../data/interim/')

# check it out
!ls -lx  ../data/interim | wc -l

                                                                                

18


In [83]:
# read splited data
df = spark.read.parquet('../data/interim')

# check it out
df.take(3)

[Row(id='9tfyeib61i', date_account_created=datetime.datetime(2014, 2, 21, 0, 0), timestamp_first_active=datetime.datetime(2014, 2, 21, 3, 54, 46), date_first_booking=None, gender='FEMALE', age=None, signup_method='facebook', signup_flow='0', language='en', affiliate_channel='sem-brand', affiliate_provider='google', first_affiliate_tracked='tracked-other', signup_app='Web', first_device_type='iPad', first_browser='Mobile Safari', country_destination='NDF', action='lookup', action_type=None, action_detail=None, device_type='Mac Desktop', secs_elapsed=616.0),
 Row(id='2yhxvwn42t', date_account_created=datetime.datetime(2014, 3, 29, 0, 0), timestamp_first_active=datetime.datetime(2014, 3, 29, 17, 57, 26), date_first_booking=None, gender='-unknown-', age=None, signup_method='basic', signup_flow='0', language='en', affiliate_channel='sem-non-brand', affiliate_provider='google', first_affiliate_tracked='omg', signup_app='Web', first_device_type='Mac Desktop', first_browser='Safari', country_d

# 1. Data cleansing and description

## Index

Column | Meaning
---|--------
id | user id
date_account_created | the date of account creation
timestamp_first_active | timestamp of the first activity, note that it can be earlier than date_account_created or date_first_booking because a user can search before signing up
date_first_booking | date of first booking
gender |
age |
signup_method |
signup_flow | the page a user came to signup up from
language | international language preference
affiliate_channel | what kind of paid marketing
affiliate_provider | where the marketing is e.g. google, craigslist, other
first_affiliate_tracked | whats the first marketing the user interacted with before the signing up
signup_app |
first_device_type |
first_browser |
country_destination | this is the target variable you are to predict
action |
action_type |
action_detail |
device_type |
secs_elapsed |

## 1.1. Data dimensions

In [85]:
print(f'Number of rows: {df.count()}\nNumber of columns: {len(df.columns)}')

Number of rows: 5537957
Number of columns: 21


## 1.2. Duplicate data

In [86]:
if df.count() > df.dropDuplicates().count():
    print(f'There were {df.count() - df.dropDuplicates().count()} duplicate records.')
    df = df.dropDuplicates()
else:
    print('There is not duplicate data.')



There were 149501 duplicate records.


                                                                                

## 1.3. Missing data

In [95]:
# check if there is null values
df.select([count(when(df[x].isNull(), x)).alias(x) for x in df.columns]).show()



+---+--------------------+----------------------+------------------+------+-------+-------------+-----------+--------+-----------------+------------------+-----------------------+----------+-----------------+-------------+-------------------+------+-----------+-------------+-----------+------------+
| id|date_account_created|timestamp_first_active|date_first_booking|gender|    age|signup_method|signup_flow|language|affiliate_channel|affiliate_provider|first_affiliate_tracked|signup_app|first_device_type|first_browser|country_destination|action|action_type|action_detail|device_type|secs_elapsed|
+---+--------------------+----------------------+------------------+------+-------+-------------+-----------+--------+-----------------+------------------+-----------------------+----------+-----------------+-------------+-------------------+------+-----------+-------------+-----------+------------+
|  0|                   0|                     0|           2976349|     0|5388456|            0|



In [None]:
# date_first_booking 2_976_349
# age 5_388_456

## 1.4. Outliers

## 1.5. Imbalance

## 1.6. Descriptive statistics

### 1.6.1. Numerical attributes

### 1.6.2. Categorical attributes