# Airbnb Models

Alex Bass (ujb3bu)



## 5.10 Final Project Ungraded Assignment
At this point in the course, you should be training and evaluating models. Please create a Jupyter Notebook containing a concise summary of your dataset (described in submission instructions).  

At a minimum, the file should include a summary containing:

- Number of records
- Number of columns
- Statistical summary of response variable
- Statistical summary of potential predictor variables (if there are a large number of predictors, select the top 10)
    - Note: Summarize categorical variables with counts and percentages for each level and summarize numerical variables with mean/quantiles/standard deviation.
- Include up to five helpful graphs

In [None]:
# Imports
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
import pyspark.sql.functions as fn

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Start the Spark session (getOrCreate returns the existing one if there is one)
spark = SparkSession.builder.getOrCreate()


### Users Dataset

In [None]:
# Explicit schema for the users CSV; every field is declared nullable
_user_fields = [
    ("id", StringType()),
    ("date_account_created", StringType()),
    ("timestamp_first_active", DoubleType()),
    ("date_first_booking", StringType()),
    ("gender", StringType()),
    ("age", DoubleType()),
    ("signup_method", StringType()),
    ("signup_flow", IntegerType()),
    ("language", StringType()),
    ("affiliate_channel", StringType()),
    ("affiliate_provider", StringType()),
    ("first_affiliate_tracked", StringType()),
    ("signup_app", StringType()),
    ("first_device_type", StringType()),
    ("first_browser", StringType()),
    ("country_destination", StringType()),
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _user_fields]
)

In [None]:
# Column groupings for the users table
id_col, response_col = "id", "country_destination"

date_cols = ["date_account_created", "date_first_booking"]

numeric_cols = ["timestamp_first_active", "age", "signup_flow"]

categorical_cols = [
    "gender",
    "signup_method",
    "language",
    "affiliate_channel",
    "affiliate_provider",
    "first_affiliate_tracked",
    "signup_app",
    "first_device_type",
    "first_browser",
]


In [None]:
# Load the users CSV (first row is the header) with the explicit schema
df = spark.read.csv("./data/train_users_2.csv", schema=schema, header=True)


### Sessions Data

In [None]:
# Explicit schema for the sessions CSV; every field is declared nullable
_session_fields = [
    ("user_id", StringType()),
    ("action", StringType()),
    ("action_type", StringType()),
    ("action_detail", StringType()),
    ("device_type", StringType()),
    ("secs_elapsed", DoubleType()),
]
schema_sessions = StructType(
    [StructField(name, dtype, True) for name, dtype in _session_fields]
)

In [None]:
# Load the sessions CSV (first row is the header) with the explicit schema
df_sessions = spark.read.csv(
    "./data/sessions.csv",
    schema=schema_sessions,
    header=True,
)


In [None]:
# Roll the session log up to one row per user_id
_session_metrics = [
    fn.sum("secs_elapsed").alias("total_time_elapsed"),
    fn.count("action").alias("total_num_actions"),
    fn.countDistinct("action").alias("num_unique_actions"),
]
session_agg = df_sessions.groupBy("user_id").agg(*_session_metrics)

In [None]:
# Attach the per-user session aggregates to the users table.
# An inner join is used (users without any session rows are dropped) because
# a high % of train users have no session data, while a low % of test users
# are missing it.
users_match_sessions = df["id"] == session_agg["user_id"]
df = df.join(session_agg, on=users_match_sessions, how="inner")

In [None]:
# Fraction of missing (null) values in each column
missing_exprs = [
    (1 - fn.count(name) / fn.count("*")).alias(f"{name}_missing")
    for name in df.columns
]
df.agg(*missing_exprs).show(vertical=True)

In [None]:
# Columns used by the initial model
features = [
    "age",
    "gender",
    "signup_method",
    "language",
    "signup_app",
    "total_time_elapsed",
    "total_num_actions",
    "first_device_type",
    "date_account_created",
]

# Keep the features plus the response column and the user id
df = df.select(*features, "country_destination", "id")

## Response Variable Distribution

In [None]:
# Count and percentage of rows for each country_destination value
n_rows = df.count()
destination_counts = (
    df.groupBy("country_destination")
    .count()
    .withColumnRenamed("count", "Count")
)
destination_counts.withColumn(
    "Percent", fn.round((fn.col("Count") / n_rows) * 100, 2)
).show()

In [None]:
# Binary response (booked a place or did not):
# rows whose destination is 'NDF' get 0, every other row gets 1
is_ndf = fn.col("country_destination") == "NDF"
df = df.withColumn("booked", fn.when(is_ndf, 0).otherwise(1))

In [None]:
# Count and percentage of rows for each value of the binary response
n_rows = df.count()
booked_counts = (
    df.groupBy("booked")
    .count()
    .withColumnRenamed("count", "Count")
)
booked_counts.withColumn(
    "Percent", fn.round((fn.col("Count") / n_rows) * 100, 2)
).show()

In [None]:
# Fraction of missing (null) values in each column, split by booked = 0 / 1
missing_exprs = [
    (1 - fn.count(name) / fn.count("*")).alias(f"{name}_missing")
    for name in df.columns
]
df.groupBy("booked").agg(*missing_exprs).show(vertical=True)

Clean up age column

In [None]:
# Keep ages in the reasonable range 16-100 (inclusive); anything outside
# that range, or already missing, becomes null in age_new
age = fn.col("age")
plausible_age = (age >= 16) & (age <= 100)
df = df.withColumn("age_new", fn.when(plausible_age, age))

In [None]:
# Indicator column: 1 when age_new is null, 0 otherwise.
# The % missing is higher for users who didn't book, so this may be a proxy
# for how much effort a user has put into updating their profile.
df = df.withColumn("age_missing", fn.col("age_new").isNull().cast("int"))

In [None]:
# Row counts for each value of the age_missing indicator
df.groupby("age_missing").count().show()

In [None]:
# List the current column names
df.columns

In [None]:
## ALEX ADDITION: Time Elapsed Since Account Creation
# Check the row count for each first_device_type level
df.groupby("first_device_type").count().show()


In [None]:
# Some of the `other` device categories are a bit small, so every value
# outside the six device types listed here is folded into a single "Other" level
kept_devices = [
    "Android Tablet",
    "iPad",
    "iPhone",
    "Windows Desktop",
    "Android Phone",
    "Mac Desktop",
]
device = fn.col("first_device_type")
df = df.withColumn(
    "first_device_type",
    fn.when(device.isin(kept_devices), device).otherwise("Other"),
)

In [None]:
# Re-check the row count for each first_device_type level
df.groupby("first_device_type").count().show()

In [None]:
# Number of accounts created in each calendar year
created_year = fn.year(fn.col("date_account_created")).alias("year")
accounts_by_year = df.select(fn.col("date_account_created"), created_year)
accounts_by_year.groupby("year").count().show()

It looks like all accounts were created in 2014, so I am going to use the month instead. I expected more variance here.

In [None]:
# Histogram of the month in which each account was created.
# (A time-elapsed variable to capture longer-standing customers was also
# planned; it is not created in this cell.)
import seaborn as sns

created_month = fn.month(fn.col("date_account_created")).alias("month")
temp_data = df.select(fn.col("date_account_created"), created_month).toPandas()

sns.histplot(data=temp_data, x="month")
plt.title('Months Accounts are created in Year 2014')

In [None]:
# Replace the creation date with its month number. It can be treated as a
# numeric variable since all months are in 2014. The column keeps its
# original name, date_account_created.
df = df.withColumn("date_account_created", fn.month("date_account_created"))

In [None]:
# Copy 'booked' into a column named 'label', the label column the
# classifiers below are configured with ('booked' itself is kept)
df = df.withColumn("label", fn.col("booked"))

In [None]:
# 70/30 random train/test split, seeded so it is repeatable
seed = 123
train, test = df.randomSplit(weights=[.7, .3], seed=seed)

In [None]:
# One hot encoding for gender, signup method, language, signup app, device type
from pyspark.ml.feature import OneHotEncoder, StringIndexer


def _indexer_and_encoder(column):
    """Return a (StringIndexer, OneHotEncoder) pair for one categorical column.

    The indexer writes `<column>_idx` and the encoder writes `<column>_vec`.
    handleInvalid="keep" sends categories that were not seen when the indexer
    was fitted to an extra index instead of raising. Without it, a rare level
    that lands only in a cross-validation hold-out fold aborts the whole fit.
    """
    indexer = StringIndexer(
        inputCol=column,
        outputCol=f"{column}_idx",
        handleInvalid="keep",
    )
    encoder = OneHotEncoder(
        inputCol=f"{column}_idx",
        outputCol=f"{column}_vec",
    )
    return indexer, encoder


# Gender
gender_idx, gender_ohe = _indexer_and_encoder("gender")

# Signup method
signup_method_idx, signup_method_ohe = _indexer_and_encoder("signup_method")

# Language
language_idx, language_ohe = _indexer_and_encoder("language")

# Signup app
signup_app_idx, signup_app_ohe = _indexer_and_encoder("signup_app")

### Alex Additions ###
# First Device Type - perhaps people browsing on their computer are more
# serious about buying than people casually browsing on a phone
device_idx, device_ohe = _indexer_and_encoder("first_device_type")

In [None]:
from pyspark.ml.feature import Imputer

# Fill missing values with the column median

# Age
imputer_age = Imputer(
    strategy="median",
    inputCol="age_new",
    outputCol="age_new_imputed",
)

# Total time elapsed
imputer_total_elapsed = Imputer(
    strategy="median",
    inputCol="total_time_elapsed",
    outputCol="total_time_elapsed_imputed",
)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Model inputs: imputed numerics, the age-missing flag, the one-hot vectors
# and the account-creation month
features = [
    "age_new_imputed",
    "age_missing",
    "gender_vec",
    "signup_method_vec",
    "language_vec",
    "signup_app_vec",
    "total_time_elapsed_imputed",
    "total_num_actions",
    "first_device_type_vec",
    "date_account_created",
]

# Concatenate them into a single vector column named "fts"
assembler = VectorAssembler(outputCol="fts", inputCols=features)

In [None]:
from pyspark.ml.feature import MaxAbsScaler

# Scale every assembled feature; MaxAbsScaler is used because some of the
# one-hot-encoded features are sparse
scaler = MaxAbsScaler(
    outputCol="features",
    inputCol="fts",
)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the scaled feature vector, capped at 10 iterations
max_iterations = 10
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    maxIter=max_iterations,
)

In [None]:
from pyspark.ml import Pipeline

# Pipeline order: index + one-hot encode the categoricals, impute the
# numerics, assemble, scale, then fit the logistic regression
lr_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    language_idx, language_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    imputer_age, imputer_total_elapsed,
    assembler, scaler,
    lr,
]
pipeline = Pipeline(stages=lr_stages)

In [None]:
import time

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Grid over the elastic-net mixing parameter, from 1 (first entry) to 0 (last).
# Floats are used because elasticNetParam is a floating-point parameter.
# NOTE(review): `lr` is built without a regParam. If it stays at Spark's
# default of 0.0, no penalty is applied and every grid point fits the same
# unregularised model - confirm whether regParam should be set or tuned too.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [1.0, 0.5, 0.1, 0.01, 0.0]) \
    .build()

print('-'*30)
print('paramGrid', paramGrid, '\n')
print('len(paramGrid): {}'.format(len(paramGrid)))
print('-'*30)

# 3-fold cross-validation, scored with BinaryClassificationEvaluator's defaults
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=1)

# NOTE(review): the search is fitted on the full `df`, not on the `train`
# split created earlier - confirm this is intended.
t0 = time.time()
cv_model_lr = crossval.fit(df)
print("train time:", time.time() - t0)
print('-'*30)
# avgMetrics follows the grid order, so the label spans 1 down to 0
print("elastic net 1", "to elastic net 0")
print(cv_model_lr.avgMetrics)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the scaled feature vector
rf = RandomForestClassifier(labelCol='label', featuresCol='features')

In [None]:
# Same preprocessing stages as the logistic-regression pipeline, ending in
# the random forest
rf_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    language_idx, language_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    imputer_age, imputer_total_elapsed,
    assembler, scaler,
    rf,
]
pipeline_rf = Pipeline(stages=rf_stages)

# 3 x 3 grid over the number of trees and the maximum tree depth
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.numTrees, [5, 20, 50])
grid_builder = grid_builder.addGrid(rf.maxDepth, [3, 5, 6])
paramGrid = grid_builder.build()

separator = '-' * 30
print(separator)
print('paramGrid', paramGrid, '\n')
print(f'len(paramGrid): {len(paramGrid)}')
print(separator)

# 3-fold cross-validation, scored with BinaryClassificationEvaluator's defaults
crossval = CrossValidator(
    estimator=pipeline_rf,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    seed=1,
)

# Time the full grid search
t0 = time.time()
cv_model_rf = crossval.fit(df)
print("train time:", time.time() - t0)
print(separator)
print(cv_model_rf.avgMetrics)

In [None]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes on the scaled feature vector
nb = NaiveBayes(labelCol='label', featuresCol='features')

# Same preprocessing stages as the other pipelines, ending in naive Bayes
bayes_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    language_idx, language_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    imputer_age, imputer_total_elapsed,
    assembler, scaler,
    nb,
]
pipeline_bayes = Pipeline(stages=bayes_stages)

# Grid over the smoothing parameter
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(nb.smoothing, [0, 0.5, 1, 5])
paramGrid = grid_builder.build()

separator = '-' * 30
print(separator)
print('paramGrid', paramGrid, '\n')
print(f'len(paramGrid): {len(paramGrid)}')
print(separator)

# 3-fold cross-validation, scored with BinaryClassificationEvaluator's defaults
crossval = CrossValidator(
    estimator=pipeline_bayes,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    seed=1,
)

# Time the full grid search
t0 = time.time()
cv_model_bayes = crossval.fit(df)
print("train time:", time.time() - t0)
print(separator)
print(cv_model_bayes.avgMetrics)

In [None]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees on the scaled feature vector, limited to 5 boosting
# iterations
gbt = GBTClassifier(
    featuresCol = 'features',
    labelCol = 'label',
    maxIter = 5
)

# Same preprocessing stages as the other pipelines, ending in the GBT model
pipeline_gbt = Pipeline(stages=[gender_idx, gender_ohe,
                           signup_method_idx, signup_method_ohe,
                           language_idx, language_ohe,
                           signup_app_idx, signup_app_ohe, device_idx, device_ohe,
                           imputer_age, imputer_total_elapsed,
                           assembler, scaler,
                           gbt])

# Set up the parameter grid.
# The tree-depth param is `maxDepth`; `gbt.max_depth` does not exist and
# raised AttributeError before the grid could be built.
# minWeightFractionPerNode is a floating-point param, so floats are used.
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [3, 5, 6]) \
    .addGrid(gbt.minWeightFractionPerNode, [0.0, 0.01, 0.1]) \
    .build()

print('-'*30)
print('paramGrid', paramGrid, '\n')
print('len(paramGrid): {}'.format(len(paramGrid)))
print('-'*30)

# 3-fold cross-validation, scored with BinaryClassificationEvaluator's defaults
crossval = CrossValidator(estimator=pipeline_gbt,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=1)

# Time the full grid search
t0 = time.time()
cv_model_gbt = crossval.fit(df)
print("train time:", time.time() - t0)
print('-'*30)
print(cv_model_gbt.avgMetrics)

In [None]:
import pandas as pd

# Side-by-side comparison of the cross-validated metric for each model.
# avgMetrics is ordered like the parameter grid. The logistic-regression grid
# runs from elasticNetParam = 1 (first entry) to 0 (last entry), so those two
# entries are reported as "Lasso" and "Ridge". The other models report their
# best grid point.
lr_metrics = cv_model_lr.avgMetrics

pd.DataFrame({
    "Model" : [
        "Lasso",
        "Ridge",
        "Naive Bayes",
        "Random Forest",
        "GBT"
    ],
    "AUC" : [
        lr_metrics[0],
        # was avgMetrics[max(len(...))], which raises TypeError
        lr_metrics[-1],
        # the naive Bayes model is stored as cv_model_bayes; cv_model_nb was undefined
        max(cv_model_bayes.avgMetrics),
        max(cv_model_rf.avgMetrics),
        max(cv_model_gbt.avgMetrics)
    ]
})