# Airbnb Models

Alex Bass (ujb3bu)



## 5.10 Final Project Ungraded Assignment
At this point in the course, you should be training and evaluating models. Please create a Jupyter Notebook containing a concise summary of your dataset (described in submission instructions).  

At a minimum, the file should include a summary containing:

- Number of records
- Number of columns
- Statistical summary of response variable
- Statistical summary of potential predictor variables (if there are a large number of predictors, select the top 10)
    - Note: Summarize categorical variables with counts and percentages for each level and summarize numerical variables with mean/quantiles/standard deviation.
- Include up to five helpful graphs

In [19]:
# Imports
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
import pyspark.sql.functions as fn

In [20]:
%matplotlib inline
import matplotlib.pyplot as plt

In [21]:
# Obtain the Spark session for this notebook: getOrCreate() reuses an
# already-active session and only builds a new one when none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)


### Reading in Parquet Files
*Note: A schema does not need to be supplied, because it is stored in the Parquet file and applied automatically when the data is loaded.*

In [22]:
# Load the preprocessed train/test splits from Parquet.
# Parquet files carry their own schema, so no reader options are needed; the
# "header" option that used to be set here is a CSV option and does not apply
# to Parquet input.
train = spark.read.parquet("./data_preprocessed/train_data.parquet")
test = spark.read.parquet("./data_preprocessed/test_data.parquet")

### Quick edits before training

In [23]:
# Row count per language code in the training split (print up to 30 rows)
train.groupBy("language").count().show(n=30)

+--------+------+
|language| count|
+--------+------+
|      en|144481|
|      pl|    41|
|      pt|   157|
|      ko|   532|
|      cs|    20|
|      tr|    48|
|      de|   523|
|      is|     4|
|      es|   646|
|      hr|     1|
|      el|    19|
|      it|   355|
|      sv|    90|
|      nl|    72|
|      hu|    13|
|      ca|     5|
|      ru|   276|
|      th|    21|
|      no|    20|
|      zh|  1137|
|      fr|   817|
|      ja|   148|
|      id|    17|
|      da|    42|
|      fi|    11|
+--------+------+



In [24]:
# Row count per language code in the test split (print up to 30 rows)
test.groupBy("language").count().show(n=30)

+--------+-----+
|language|count|
+--------+-----+
|      en|61833|
|      pl|   13|
|      pt|   83|
|      ko|  215|
|      cs|   12|
|      tr|   16|
|      de|  209|
|      is|    1|
|      es|  269|
|      hr|    1|
|      el|    5|
|      it|  159|
|      sv|   32|
|      nl|   25|
|      hu|    5|
|      ru|  113|
|      th|    3|
|      no|   10|
|      zh|  495|
|      fr|  355|
|      ja|   77|
|      id|    5|
|      da|   16|
|      fi|    3|
+--------+-----+



Several languages have very few records, and some do not appear in both datasets (e.g. `ca` is present in the training data but absent from the test data). To handle this, we create a new language variable in which the most common languages keep their own category and all remaining languages are grouped together as `other`.

In [25]:
# Languages that keep their own category; every other value is folded into a
# single "other" level.
MAJOR_LANGUAGES = ("en", "es", "zh", "fr", "de", "ko", "it", "ru")


def collapse_rare_languages(df, keep=MAJOR_LANGUAGES):
    """Return `df` with `language` reduced to the `keep` codes plus "other".

    Args:
        df: Spark DataFrame that has a `language` column.
        keep: language codes that retain their own category.

    Returns:
        A new DataFrame whose `language` column holds either one of the
        `keep` codes or the literal "other". Null languages fall through to
        the `otherwise` branch and therefore also become "other".
    """
    language = fn.col("language")
    return df.withColumn(
        "language",
        fn.when(language.isin(*keep), language).otherwise("other"),
    )


# Apply the identical mapping to both splits so their categories cannot drift.
train = collapse_rare_languages(train)
test = collapse_rare_languages(test)

In [26]:
# Derive the binary target column `label` from `country_destination`:
# "NDF" -> 0, every other value -> 1.
label_expr = fn.when(fn.col("country_destination") == "NDF", 0).otherwise(1)

train = train.withColumn("label", label_expr)
test = test.withColumn("label", label_expr)

In [34]:
# One hot encoding for gender, signup method, language, signup app
from pyspark.ml.feature import OneHotEncoder, StringIndexer

def _index_then_encode(column):
    """Build the (StringIndexer, OneHotEncoder) pair for one categorical column.

    The indexer writes `<column>_idx`; the encoder reads that column and
    writes `<column>_vec`.
    """
    index_col = f"{column}_idx"
    indexer = StringIndexer(inputCol=column, outputCol=index_col)
    encoder = OneHotEncoder(inputCol=index_col, outputCol=f"{column}_vec")
    return indexer, encoder


# NOTE(review): StringIndexer is left at its documented default
# handleInvalid="error", so a category unseen during fit raises at transform
# time — confirm the test split contains no new levels.

# Gender
gender_idx, gender_ohe = _index_then_encode("gender")

# Signup method
signup_method_idx, signup_method_ohe = _index_then_encode("signup_method")

# Language
language_idx, language_ohe = _index_then_encode("language")

# Signup app
signup_app_idx, signup_app_ohe = _index_then_encode("signup_app")

# First Device Type - Perhaps people browsing on their computer may be more
# serious about buying vs casual browsing on phone
device_idx, device_ohe = _index_then_encode("first_device_type")

In [35]:
# Impute missing values with median
from pyspark.ml.feature import Imputer

# Age: missing entries are replaced with the median of the observed values
imputer_age = Imputer(
    strategy="median",
    inputCol="age_new",
    outputCol="age_new_imputed",
)

# Total time elapsed: missing entries are replaced with the median
imputer_total_elapsed = Imputer(
    strategy="median",
    inputCol="total_time_elapsed",
    outputCol="total_time_elapsed_imputed",
)

# Total actions: keeps the default strategy and declares 0 as the missing value.
# NOTE(review): per the Imputer documentation the default strategy is "mean"
# and `missingValue` is the placeholder that is treated as missing, so this
# replaces zeros (and nulls) with the column mean rather than filling gaps
# with 0 — confirm that this is the intended behaviour.
imputer_total_num_actions = Imputer(
    missingValue=0,
    inputCol="total_num_actions",
    outputCol="total_num_actions_imputed",
)

In [36]:
from pyspark.ml.feature import VectorAssembler

# Columns fed to the model, in the order they are laid out in the assembled
# vector: imputed numeric columns, `age_missing`, and the one-hot vectors.
features = [
    "age_new_imputed",
    "age_missing",
    "gender_vec",
    "signup_method_vec",
    "language_vec",
    "signup_app_vec",
    "total_time_elapsed_imputed",
    "total_num_actions_imputed",
    "first_device_type_vec",
]

# Concatenate the selected columns into the single vector column `fts`
assembler = VectorAssembler(outputCol="fts", inputCols=features)

In [37]:
# Scale all features
from pyspark.ml.feature import MaxAbsScaler
# MaxAbsScaler is used because some of the one-hot encoded features are
# sparse; it reads the assembled `fts` vector and writes `features`.
scaler = MaxAbsScaler(outputCol="features", inputCol="fts")

In [38]:
# Setup L1-regularised (lasso) logistic regression model
max_iterations = 10

# LogisticRegression's regParam defaults to 0.0, and with a zero penalty the
# elasticNetParam mixing value has no effect: the model is then plain
# unregularised logistic regression. A non-zero regParam is required for the
# L1 term to apply. 0.01 is only a starting value and should be tuned.
reg_param = 0.01

from pyspark.ml.classification import LogisticRegression

lasso = LogisticRegression(
    maxIter=max_iterations,
    regParam=reg_param,
    elasticNetParam=1.0,  # 1.0 -> pure L1 penalty
    featuresCol="features",
    labelCol="label",
)

In [39]:
# Configure pipeline
from pyspark.ml import Pipeline

# Stage order: index + one-hot encode each categorical column, impute the
# numeric columns, assemble and scale the feature vector, then the classifier.
lasso_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    language_idx, language_ohe,
    imputer_age, imputer_total_elapsed, imputer_total_num_actions,
    assembler,
    scaler,
    lasso,
]
pipeline_lasso = Pipeline(stages=lasso_stages)

In [40]:
import time
# Fit the whole lasso pipeline on the training split and report wall-clock time
fit_start = time.time()
model_lasso = pipeline_lasso.fit(train)
print("train time:", time.time() - fit_start)

train time: 4.809465169906616


In [41]:
# Setup L2-regularised (ridge) logistic regression model
max_iterations = 10

# regParam defaults to 0.0, which disables the penalty entirely and makes
# elasticNetParam irrelevant; a non-zero value is needed for the L2 term to
# apply. 0.01 is only a starting value and should be tuned.
reg_param = 0.01

from pyspark.ml.classification import LogisticRegression

ridge = LogisticRegression(
    maxIter=max_iterations,
    regParam=reg_param,
    elasticNetParam=0.0,  # 0.0 -> pure L2 penalty
    featuresCol="features",
    labelCol="label",
)

# Configure pipeline
from pyspark.ml import Pipeline

# Same preprocessing stages as the lasso pipeline. The final stage must be
# `ridge`: it was previously `lasso`, so the "ridge" model was a second lasso
# fit.
pipeline_ridge = Pipeline(stages=[
    gender_idx,
    gender_ohe,
    signup_method_idx,
    signup_method_ohe,
    signup_app_idx,
    signup_app_ohe,
    device_idx,
    device_ohe,
    language_idx,
    language_ohe,
    imputer_age,
    imputer_total_elapsed,
    imputer_total_num_actions,
    assembler,
    scaler,
    ridge,
])

# Fit on the training split and report wall-clock time
t0 = time.time()
model_ridge = pipeline_ridge.fit(train)
print("train time:", time.time() - t0)

train time: 4.415439128875732


In [42]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier

# Random forest classifier reading the `features` vector and the `label`
# column; numTrees and maxDepth are varied by the parameter grid built for
# cross-validation.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

In [44]:
# Preprocessing stages followed by the random forest classifier
rf_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    language_idx, language_ohe,
    imputer_age, imputer_total_elapsed, imputer_total_num_actions,
    assembler,
    scaler,
    rf,
]
pipeline_rf = Pipeline(stages=rf_stages)

# Parameter grid: every combination of numTrees and maxDepth below
rf_grid_builder = ParamGridBuilder()
rf_grid_builder = rf_grid_builder.addGrid(rf.numTrees, [5, 20, 50])
rf_grid_builder = rf_grid_builder.addGrid(rf.maxDepth, [3, 5, 6])
paramGrid = rf_grid_builder.build()

separator = '-' * 30
print(separator)
print('paramGrid', paramGrid, '\n')
print(f'len(paramGrid): {len(paramGrid)}')
print(separator)

# 3-fold cross-validation over the grid, scored with the evaluator's default
# metric; the seed fixes the fold assignment.
crossval = CrossValidator(
    estimator=pipeline_rf,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    seed=1,
)

cv_start = time.time()
cv_model_rf = crossval.fit(train)
print("train time:", time.time() - cv_start)
print(separator)
# One averaged metric per grid entry, in grid order
print(cv_model_rf.avgMetrics)

------------------------------
paramGrid [{Param(parent='RandomForestClassifier_4637e8ee2521', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_4637e8ee2521', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 3}, {Param(parent='RandomForestClassifier_4637e8ee2521', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_4637e8ee2521', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 5}, {Param(parent='RandomForestClassifier_4637e8ee2521', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_4637e8ee2521', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 6}, {Param(parent='RandomForestClassif

In [45]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier reading the `features` vector and the `label` column
nb = NaiveBayes(labelCol="label", featuresCol="features")

# Preprocessing stages followed by the Naive Bayes classifier
bayes_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    language_idx, language_ohe,
    imputer_age, imputer_total_elapsed, imputer_total_num_actions,
    assembler,
    scaler,
    nb,
]
pipeline_bayes = Pipeline(stages=bayes_stages)

# Parameter grid: candidate values for the smoothing parameter
bayes_grid_builder = ParamGridBuilder()
bayes_grid_builder = bayes_grid_builder.addGrid(nb.smoothing, [0, 0.5, 1, 5])
paramGrid = bayes_grid_builder.build()

separator = '-' * 30
print(separator)
print('paramGrid', paramGrid, '\n')
print(f'len(paramGrid): {len(paramGrid)}')
print(separator)

# 3-fold cross-validation over the grid, scored with the evaluator's default
# metric; the seed fixes the fold assignment.
crossval = CrossValidator(
    estimator=pipeline_bayes,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    seed=1,
)

cv_start = time.time()
cv_model_bayes = crossval.fit(train)
print("train time:", time.time() - cv_start)
print(separator)
# One averaged metric per grid entry, in grid order
print(cv_model_bayes.avgMetrics)

------------------------------
paramGrid [{Param(parent='NaiveBayes_669eef610a45', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 0.0}, {Param(parent='NaiveBayes_669eef610a45', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 0.5}, {Param(parent='NaiveBayes_669eef610a45', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 1.0}, {Param(parent='NaiveBayes_669eef610a45', name='smoothing', doc='The smoothing parameter, should be >= 0, default is 1.0'): 5.0}] 

len(paramGrid): 4
------------------------------
train time: 37.93347787857056
------------------------------
[0.6743650970573876, 0.6743822631119618, 0.6743786795652509, 0.674403451482228]


In [None]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees classifier, limited to 5 boosting iterations
gbt = GBTClassifier(maxIter=5, labelCol="label", featuresCol="features")

# Preprocessing stages followed by the GBT classifier
gbt_stages = [
    gender_idx, gender_ohe,
    signup_method_idx, signup_method_ohe,
    signup_app_idx, signup_app_ohe,
    device_idx, device_ohe,
    language_idx, language_ohe,
    imputer_age, imputer_total_elapsed, imputer_total_num_actions,
    assembler,
    scaler,
    gbt,
]
pipeline_gbt = Pipeline(stages=gbt_stages)

# Parameter grid: every combination of maxDepth and minWeightFractionPerNode
gbt_grid_builder = ParamGridBuilder()
gbt_grid_builder = gbt_grid_builder.addGrid(gbt.maxDepth, [3, 5, 6])
gbt_grid_builder = gbt_grid_builder.addGrid(
    gbt.minWeightFractionPerNode, [0, 0.01, 0.1]
)
paramGrid = gbt_grid_builder.build()

separator = '-' * 30
print(separator)
print('paramGrid', paramGrid, '\n')
print(f'len(paramGrid): {len(paramGrid)}')
print(separator)

# 3-fold cross-validation over the grid, scored with the evaluator's default
# metric; the seed fixes the fold assignment.
crossval = CrossValidator(
    estimator=pipeline_gbt,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    seed=1,
)

cv_start = time.time()
cv_model_gbt = crossval.fit(train)
print("train time:", time.time() - cv_start)
print(separator)
# One averaged metric per grid entry, in grid order
print(cv_model_gbt.avgMetrics)

------------------------------
paramGrid [{Param(parent='GBTClassifier_00811aebe68c', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 3, Param(parent='GBTClassifier_00811aebe68c', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in the left or right child to be less than minWeightFractionPerNode, the split will be discarded as invalid. Should be in interval [0.0, 0.5).'): 0.0}, {Param(parent='GBTClassifier_00811aebe68c', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 3, Param(parent='GBTClassifier_00811aebe68c', name='minWeightFractionPerNode', doc='Minimum fraction of the weighted sample count that each child must have after split. If a split causes the fraction of the total weight in

In [None]:
import pandas as pd

# Compare the fitted models by AUC (area under the ROC curve).
# - Lasso and Ridge were each fit once on `train`, so their value is read from
#   the training summary of the final (logistic regression) pipeline stage.
# - The other models were tuned with CrossValidator: `avgMetrics` holds one
#   fold-averaged metric per grid entry (areaUnderROC is the default metric of
#   BinaryClassificationEvaluator), and the best entry is reported.
# The earlier version referenced an undefined `cv_model_lr`, supplied four
# metrics for five models, and sorted by a column name that did not exist.
model_auc = {
    "Lasso": model_lasso.stages[-1].summary.areaUnderROC,
    "Ridge": model_ridge.stages[-1].summary.areaUnderROC,
    "Naive Bayes": max(cv_model_bayes.avgMetrics),
    "Random Forest": max(cv_model_rf.avgMetrics),
    "GBT": max(cv_model_gbt.avgMetrics),
}

pd.DataFrame({
    "Model": list(model_auc),
    "Training AUC": list(model_auc.values()),
}).sort_values("Training AUC", ascending=False)

In [None]:
# Persist each fitted model under ./models/.
# NOTE(review): the writer is not put in overwrite mode, so re-running this
# cell is expected to fail if the target paths already exist — confirm.
model_ridge.write().save('./models/ridge.model')
model_lasso.write().save('./models/lasso.model')
cv_model_bayes.write().save('./models/naive_bayes.model')
cv_model_rf.write().save('./models/random_forest.model')
cv_model_gbt.write().save('./models/gbt.model')