# Sparkify Project Workspace
This workspace contains a tiny subset (128MB) of the full dataset available (12GB). Feel free to use this workspace to build your project, or to explore a smaller subset with Spark before deploying your cluster on the cloud. Instructions for setting up your Spark cluster is included in the last lesson of the Extracurricular Spark Course content.

You can follow the steps below to guide your data analysis and model building portion of this project.

In [1]:
import os

import findspark

# findspark.init('/home/brunowdev/spark-2.4.5-bin-hadoop2.6/')

findspark.init('/home/bruno/LIBS/spark')

import evaluators

In [65]:
from datetime import datetime

import numpy as np

from sklearn.metrics import confusion_matrix

from pyspark.sql import SparkSession

from pyspark.sql.functions import min as smin, max as smax, sum as ssum, round as sround, abs as sabs, pow as spow
from pyspark.sql.functions import isnan, isnull, when, first, avg, sqrt, last, count, countDistinct, col, lag, lead, coalesce, lit, split, trim

from pyspark.sql.window import Window
from pyspark.sql.functions import to_date, date_format, from_unixtime, to_timestamp

from pyspark.sql.types import DateType, TimestampType, IntegerType
 
import jupyter_utils as j

from pyspark import SparkContext

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler, MaxAbsScaler, Normalizer, MinMaxScaler, StringIndexer, VectorAssembler


from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
 
from pyspark.ml.evaluation import BinaryClassificationEvaluator, Evaluator
from pyspark import since, keyword_only

In [None]:
def setup_spark():
     
    SparkContext.setSystemProperty('spark.logConf', 'True')
    SparkContext.setSystemProperty('spark.default.parallelism', '16')
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '8g')
    SparkContext.setSystemProperty('spark.reducer.maxSizeInFlight', '96m')
    SparkContext.setSystemProperty('spark.shuffle.consolidateFiles', 'True') 
    SparkContext.setSystemProperty('spark.shuffle.service.index.cache.size', '500m')

    SparkContext.setSystemProperty('spark.driver.extraJavaOptions', '-server -Xmx8G')
    # SparkContext.setSystemProperty('spark.executor.extraJavaOptions', '-server -Xmx8G -XX:+UseG1GC')

    SparkContext.setSystemProperty('spark.executor.extraJavaOptions', '-server -XX:+UseG1GC')

In [34]:
j.reload(j)

In [3]:
filepath = 'sparkify_full_csv_data.csv'
# filepath = 'medium_sparkify_event_data.json'

spark = SparkSession \
    .builder \
    .appName("Sparkify") \
    .getOrCreate()

spark.sparkContext.setLogLevel('INFO')

In [4]:
spark.sparkContext.getConf().getAll()

[('spark.driver.port', '39281'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.driver.memory', '6g'),
 ('spark.app.id', 'local-1588530423385'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.name', 'Sparkify'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', '192.168.0.102'),
 ('spark.ui.showConsoleProgress', 'true')]

# Load and Clean Dataset
In this workspace, the mini-dataset file is `mini_sparkify_event_data.json`. Load and clean the dataset, checking for invalid or missing data - for example, records without userids or sessionids. 

In [5]:
df = spark.read.option("inferSchema", "true").option("header", "true").option("encoding", "utf-8").csv(filepath)
# df = spark.read.option("inferSchema", "true").option("header", "true").option("encoding", "utf-8").json(filepath)

In [8]:
df.cache()

DataFrame[gender: string, length: double, level: string, registration: double, userId: int, ts: bigint, page: string, sessionId: int, itemInSession: int]

In [50]:
df.where(df.userId == 100010).show()

+------+------+-----+------------+------+---+----+---------+-------------+
|gender|length|level|registration|userId| ts|page|sessionId|itemInSession|
+------+------+-----+------------+------+---+----+---------+-------------+
+------+------+-----+------------+------+---+----+---------+-------------+



In [6]:
log4jLogger = spark.sparkContext._jvm.org.apache.log4j

LOGGER = log4jLogger.LogManager.getLogger('driver_logger')

def info(message, print_on_notebook = True):
    LOGGER.info(message)
    
    if print_on_notebook:
        print(message)
    
info('Logger instance created')

Logger instance created


In [9]:
from pyspark import StorageLevel

def set_storage_on_memory():
    info(df.storageLevel)
    df.persist(StorageLevel.MEMORY_ONLY)
    info(df.storageLevel)
    
set_storage_on_memory()

In [16]:
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- userId: integer (nullable = true)
 |-- ts: long (nullable = true)
 |-- page: string (nullable = true)
 |-- sessionId: integer (nullable = true)
 |-- itemInSession: integer (nullable = true)



In [17]:
CHURN_CANCELLATION_PAGE = 'Cancellation Confirmation'
REGISTRATION_PAGE = 'Submit Registration'
milliseconds_to_hours = 3600 * 1000
minutes_to_hours = 60 * 60
TRUE = 1
FALSE = 0

def clean_dataframe(df):
    
    info('Starting data cleaning...')
    
    total_before = df.count()
    
    # Keep only logged records
    # df = df.where(df.auth.isin(['Logged In', 'Cancelled']))
    
    # Records without userId
    df = df.where(col('userId').isNotNull())
    
    # Create a date column for the event
    df = df.withColumn('date', from_unixtime(col('ts') / 1000).cast(DateType()))
    
    # Location
    # df = df.withColumn('state', trim(split((split('location', ',').getItem(1)), '-').getItem(0)))
    
    # Relevant windows
    w_session = Window.partitionBy('sessionId').orderBy('ts')
    w_user_session = Window.partitionBy('sessionId', 'userId').orderBy('ts').rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    w_user = Window.partitionBy('userId').orderBy('ts').rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    
    # Create features
    df = df.withColumn('previous_page', lag(df.page).over(w_session))
    df = df.withColumn('last_event_ts', last(col('ts')).over(w_user))
    df = df.withColumn('last_page', last(col('page')).over(w_user))
    df = df.withColumn('register_page', first(col('previous_page')).over(w_user))
    df = df.withColumn('first_ts', first(col('ts')).over(w_user))
    df = df.withColumn('ts_elapsed', last(df.ts).over(w_session) - first(df.ts).over(w_user_session))
    df = df.withColumn('session_duration', smax(df.ts_elapsed).over(w_user_session))
     
    info('Finished data cleaning...')
    info(f'Number of removed rows: {total_before - df.count()}')
    
    return df

In [18]:
df = clean_dataframe(df)

Starting data cleaning...
Finished data cleaning...
Number of removed rows: 0


# Exploratory Data Analysis
When you're working with the full dataset, perform EDA by loading a small subset of the data and doing basic manipulations within Spark. In this workspace, you are already provided a small subset of data you can explore.

### Define Churn

Once you've done some preliminary analysis, create a column `Churn` to use as the label for your model. I suggest using the `Cancellation Confirmation` events to define your churn, which happen for both paid and free users. As a bonus task, you can also look into the `Downgrade` events.

### Explore Data
Once you've defined churn, perform some exploratory data analysis to observe the behavior for users who stayed vs users who churned. You can start by exploring aggregates on these two groups of users, observing how much of a specific action they experienced per a certain time unit or number of songs played.

In [10]:
df.groupBy('page').count().orderBy('count', ascending = False).show(50)

+--------------------+--------+
|                page|   count|
+--------------------+--------+
|            NextSong|20850272|
|                Home| 1343102|
|           Thumbs Up| 1151465|
|     Add to Playlist|  597921|
|         Roll Advert|  385212|
|          Add Friend|  381664|
|               Login|  296350|
|              Logout|  296005|
|         Thumbs Down|  239212|
|           Downgrade|  184240|
|                Help|  155100|
|            Settings|  147074|
|               About|   92759|
|             Upgrade|   50507|
|       Save Settings|   29516|
|               Error|   25962|
|      Submit Upgrade|   15135|
|    Submit Downgrade|    6494|
|Cancellation Conf...|    5003|
|              Cancel|    5003|
|            Register|     802|
| Submit Registration|     401|
+--------------------+--------+



Some questions about the data:

- Are errors related to downgrading canceling the service?
- Having a certain number of friends or a sense of community can decrease the churn?
- Thumbs down are related to churn? (could the quality of the songs catalog affect the churn)
- The advertising is not annoying the users?
- Users with stay connected for more time have less change to churn?
- Is the home page relevant?
- Users, who access the downgrade page are how much more willing to churn?

In [None]:
df.groupBy('status').count().orderBy('count', ascending = False).show(20)

In [None]:
df.filter('userId = 92').groupBy('page').count().orderBy('count', ascending = False).show(50)

In [None]:
df.filter('userId = 92').groupBy('page').count().orderBy('count', ascending = False).show(50)

In [None]:
df.filter('userId = 92').groupBy('userAgent').count().orderBy('count', ascending = False).show(50, False)

In [None]:
df.filter('userId = 92 and song != \'null\' ').groupBy('song').count().orderBy('count', ascending = False).show(50, False)

# Feature Engineering
Once you've familiarized yourself with the data, build out the features you find promising to train your model on. To work with the full dataset, you can follow the following steps.
- Write a script to extract the necessary features from the smaller subset of data
- Ensure that your script is scalable, using the best practices discussed in Lesson 3
- Try your script on the full data set, debugging your script if necessary

If you are working in the classroom workspace, you can just extract features based on the small subset of data contained here. Be sure to transfer over this work to the larger dataset when you work on your Spark cluster.

In [19]:
def create_session_dimension(df):
    
    # sessions from the user
    df_sessions = df.orderBy(df.sessionId).groupBy('sessionId', 'userId').agg(
        smax(df.ts).alias('max_event_ts'),
        smin(df.ts).alias('min_event_ts'),
        ssum(df.length).alias('session_n_total_playback'), # Based on songs length
        count(when(df.page == 'Thumbs Up', True)).alias("session_n_likes"),
        count(when(df.page == 'Thumbs Down', True)).alias("session_n_dislikes"),
        count(when(df.page == 'NextSong', True)).alias("session_n_songs"),
        count(when(df.page == 'Add Friend', True)).alias("session_n_friends"),
        count(when(df.page == 'Add to Playlist', True)).alias("session_n_add_playlist"),
        count(when(df.page == 'Home', True)).alias("session_n_home"),
        count(when(df.page == 'Roll Advert', True)).alias("session_n_ads"),
        count(when(df.page == 'Help', True)).alias("session_n_help"),
        count(when(df.page == 'Error', True)).alias("session_n_error"),
        count(when(df.page == 'Settings', True)).alias("session_n_sets"),
        count(col('page')).alias('session_n_actions'),
        first(col('session_duration')).alias('session_duration')
    ) 
    
    # Calculate the interval until the next session
    w_user_sessions_interval = Window.partitionBy('userId').orderBy('min_event_ts')
    df_sessions = df_sessions.withColumn('interval_to_session', col('min_event_ts') - lag(col('max_event_ts')).over(w_user_sessions_interval))
    
    # Calculate average time in hours for each session
    df_session_time = df_sessions.groupBy('userId').agg(
       (avg(df_sessions.session_duration) / milliseconds_to_hours).alias('session_hours')
    )
    df_sessions = df_sessions.join(df_session_time, on = 'userId')
    
    # We should remove the null lines before count/group to not account 2 times the mean interval
    df_sessions = df_sessions.groupBy('userId').agg(  
        (avg(df_sessions.interval_to_session) / milliseconds_to_hours).alias('session_avg_time_away'),
        ((avg(df_sessions.session_n_total_playback) / minutes_to_hours) / first(col('session_hours'))).alias('session_avg_playback'), 
        (avg(df_sessions.session_n_likes) / first(col('session_hours'))).alias('session_avg_likes'),
        (avg(df_sessions.session_n_dislikes) / first(col('session_hours'))).alias('session_avg_dislikes'),
        (avg(df_sessions.session_n_songs) / first(col('session_hours'))).alias('session_avg_songs'),
        (avg(df_sessions.session_n_friends) / first(col('session_hours'))).alias('session_avg_friends'),
        (avg(df_sessions.session_n_add_playlist) / first(col('session_hours'))).alias('session_avg_added_playlist'),
        (avg(df_sessions.session_n_home) / first(col('session_hours'))).alias('session_avg_home'),
        (avg(df_sessions.session_n_ads) / first(col('session_hours'))).alias('session_avg_ads'),
        (avg(df_sessions.session_n_help) / first(col('session_hours'))).alias('session_avg_help'),
        (avg(df_sessions.session_n_error) / first(col('session_hours'))).alias('session_avg_errors'),
        (avg(df_sessions.session_n_sets) / first(col('session_hours'))).alias('session_avg_settings'),
        (avg(df_sessions.session_n_actions) / first(col('session_hours'))).alias('session_avg_actions')
    )
    
    return df_sessions

def create_user_dimension(df):
    
    df_user_profile = df.groupby('userId')\
        .agg( 

            # first(col('state')).alias('state'),
            first(when(col('gender') == 'M', TRUE).otherwise(FALSE)).alias('male'),

            smin(col('first_ts')).alias('ts_start'),
            smax(col('last_event_ts')).alias('ts_end'),        
        
            ((smax(col('last_event_ts')) - smin(col('first_ts'))) / milliseconds_to_hours).alias('time_window'),
        
            # Subscription
            count(when(col('page') == 'Submit Downgrade', True)).alias('n_downgrades'),
            count(when(col('page') == 'Submit Upgrade', True)).alias('n_upgrades'),
            last(when(col('level') == 'paid', TRUE).otherwise(FALSE)).alias('paid'),
            first(when(col('last_page') == CHURN_CANCELLATION_PAGE, TRUE).otherwise(FALSE)).alias('canceled'),

            # Streaming
            count(when(col('page') == 'NextSong', True)).alias('n_songs'),
            count(when(col('page') == 'Thumbs Up', True)).alias('n_likes'),
            count(when(col('page') == 'Thumbs Down', True)).alias('n_dislikes'),
            countDistinct(col('sessionId')).alias('n_sess'),
            (avg(col('session_duration')) / milliseconds_to_hours).alias('avg_session_duration'),

            # Community
            count(when(col('page') == 'Add Friend', True)).alias('n_friends'),
            count(when(col('page') == 'Add to Playlist', True)).alias('n_added_to_playlist'),

            # Other
            count(when(col('page') == 'Home', True)).alias('n_home'),
            count(when(col('page') == 'Roll Advert', True)).alias('n_ads'),
            count(when(col('page') == 'Help', True)).alias('n_help'),
            count(when(col('page') == 'Error', True)).alias('n_errors'),
            count(when(col('page') == 'Settings', True)).alias('n_settings'),
            count(col('page')).alias('n_actions')
        )
    
    
    # Location
    # states = list(map(lambda c: c[0].strip(), df.select(['state']).distinct().rdd.collect()))
    # for state in states:
    #    df_user_profile = df_user_profile.withColumn(state.lower(), when(df_user_profile.state == state, 1).otherwise(0))
    
    return df_user_profile

def create_days_dimension(df):
    
    df_unique_days = df.groupby('userId').agg(countDistinct('date').alias('n_days'))
    
    df_daily_actions = df.groupby('userId', 'date').agg(count('page').alias('total'))
    df_daily_actions = df_daily_actions.groupby('userId').agg(avg('total').alias('avg_daily_actions')) 

    df_days = df_unique_days.join(df_daily_actions, df_unique_days.userId == df_daily_actions.userId)
    
    # Remove duplicated column after join
    df_days = df_days.drop(df_daily_actions.userId)
    
    return df_days

def sort_features(df, columns_order):
    _columns = df.columns
    _columns.sort()
    
    for _idx, _val in list(enumerate(columns_order)):
        _columns.pop(_columns.index(_val))
        _columns.insert(_idx, _val)
        
    assert len(_columns) == len(df.columns)

    return _columns

In [20]:
binary_features = [ 'paid', 'male' ]

numeric_features = [
    'avg_daily_actions',
    'avg_session_duration', 
    'n_actions',
    'n_added_to_playlist',
    'n_ads',
    'n_days',
    'n_dislikes',
    'n_downgrades',
    'n_errors',
    'n_friends',
    'n_help',
    'n_home',
    'n_likes',
    'n_sess',
    'n_settings',
    'n_songs',
    'n_upgrades', 
    'session_avg_actions',
    'session_avg_added_playlist',
    'session_avg_ads',
    'session_avg_dislikes',
    'session_avg_errors',
    'session_avg_friends',
    'session_avg_help',
    'session_avg_home',
    'session_avg_likes',
    'session_avg_playback',
    'session_avg_settings',
    'session_avg_songs',
    'session_avg_time_away',
    'time_window'
]

columns_all = [
    'canceled',
    'male',
    'paid',
    'avg_daily_actions',
    'avg_session_duration', 
    'n_actions',
    'n_added_to_playlist',
    'n_ads',
    'n_days',
    'n_dislikes',
    'n_downgrades',
    'n_errors',
    'n_friends',
    'n_help',
    'n_home',
    'n_likes',
    'n_sess',
    'n_settings',
    'n_songs',
    'n_upgrades', 
    'session_avg_actions',
    'session_avg_added_playlist',
    'session_avg_ads',
    'session_avg_dislikes',
    'session_avg_errors',
    'session_avg_friends',
    'session_avg_help',
    'session_avg_home',
    'session_avg_likes',
    'session_avg_playback',
    'session_avg_settings',
    'session_avg_songs',
    'session_avg_time_away',
    'time_window'
]

columns_to_train = [
    'male',
    'paid',
    'avg_daily_actions',
    'avg_session_duration', 
    'n_actions',
    'n_added_to_playlist',
    'n_ads',
    'n_days',
    'n_dislikes',
    'n_downgrades',
    'n_errors',
    'n_friends',
    'n_help',
    'n_home',
    'n_likes',
    'n_sess',
    'n_settings',
    'n_songs',
    'n_upgrades', 
    'session_avg_actions',
    'session_avg_added_playlist',
    'session_avg_ads',
    'session_avg_dislikes',
    'session_avg_errors',
    'session_avg_friends',
    'session_avg_help',
    'session_avg_home',
    'session_avg_likes',
    'session_avg_playback',
    'session_avg_settings',
    'session_avg_songs',
    'session_avg_time_away',
    'time_window'
]

#### Transform the data - create a unique row per user

In [21]:
df_sessions = create_session_dimension(df)
df_days = create_days_dimension(df)

df_users = create_user_dimension(df)
df_users = df_users.orderBy(df_users.userId).join(df_days, on = 'userId')

_columns = sort_features(df_users, [ 'userId', 'male', 'paid', 'canceled'])
_columns = list(set(df_users.schema.names + df_sessions.schema.names) - set(['ts_start', 'ts_end', 'state']))

df_users = df_users.orderBy(df_users.userId).join(df_sessions, on = 'userId').select(_columns) 

In [22]:
# Create the new dataframe
df_users = df_users.select(columns_all).fillna(0)

In [23]:
df_users.printSchema()

root
 |-- canceled: integer (nullable = true)
 |-- male: integer (nullable = true)
 |-- paid: integer (nullable = true)
 |-- avg_daily_actions: double (nullable = false)
 |-- avg_session_duration: double (nullable = false)
 |-- n_actions: long (nullable = false)
 |-- n_added_to_playlist: long (nullable = false)
 |-- n_ads: long (nullable = false)
 |-- n_days: long (nullable = false)
 |-- n_dislikes: long (nullable = false)
 |-- n_downgrades: long (nullable = false)
 |-- n_errors: long (nullable = false)
 |-- n_friends: long (nullable = false)
 |-- n_help: long (nullable = false)
 |-- n_home: long (nullable = false)
 |-- n_likes: long (nullable = false)
 |-- n_sess: long (nullable = false)
 |-- n_settings: long (nullable = false)
 |-- n_songs: long (nullable = false)
 |-- n_upgrades: long (nullable = false)
 |-- session_avg_actions: double (nullable = false)
 |-- session_avg_added_playlist: double (nullable = false)
 |-- session_avg_ads: double (nullable = false)
 |-- session_avg_dislik

In [24]:
df_users.cache()

DataFrame[canceled: int, male: int, paid: int, avg_daily_actions: double, avg_session_duration: double, n_actions: bigint, n_added_to_playlist: bigint, n_ads: bigint, n_days: bigint, n_dislikes: bigint, n_downgrades: bigint, n_errors: bigint, n_friends: bigint, n_help: bigint, n_home: bigint, n_likes: bigint, n_sess: bigint, n_settings: bigint, n_songs: bigint, n_upgrades: bigint, session_avg_actions: double, session_avg_added_playlist: double, session_avg_ads: double, session_avg_dislikes: double, session_avg_errors: double, session_avg_friends: double, session_avg_help: double, session_avg_home: double, session_avg_likes: double, session_avg_playback: double, session_avg_settings: double, session_avg_songs: double, session_avg_time_away: double, time_window: double]

In [25]:
df_users.show(1, True, vertical = True)

-RECORD 0-----------------------------------------
 canceled                   | 0                   
 male                       | 1                   
 paid                       | 1                   
 avg_daily_actions          | 56.833333333333336  
 avg_session_duration       | 8.064825676115998   
 n_actions                  | 682                 
 n_added_to_playlist        | 20                  
 n_ads                      | 3                   
 n_days                     | 12                  
 n_dislikes                 | 10                  
 n_downgrades               | 1                   
 n_errors                   | 3                   
 n_friends                  | 12                  
 n_help                     | 5                   
 n_home                     | 27                  
 n_likes                    | 25                  
 n_sess                     | 9                   
 n_settings                 | 4                   
 n_songs                    | 5

In [26]:
from pyspark.sql.window import Window

import sys

w = Window().partitionBy()

def z_score(col, w):
    _avg_ = avg(col).over(w)
    avg_sq = avg(spow(col, 2)).over(w)
    sd_ = sqrt(avg_sq - spow(_avg_, 2))
    return sabs((col - _avg_) / sd_)

In [27]:
 _columns_to_check_outliers = [ 'avg_daily_actions', 'avg_session_duration',  'session_avg_actions', 'session_avg_added_playlist', 'session_avg_ads', 'session_avg_dislikes', 'session_avg_errors', 'session_avg_friends', 'session_avg_help', 'session_avg_home', 'session_avg_likes', 'session_avg_playback', 'session_avg_settings', 'session_avg_songs', 'session_avg_time_away', 'time_window']

In [28]:
for c in _columns_to_check_outliers:
    df_users = df_users.withColumn(f'zscore_{c}', z_score(col(c), w))

In [29]:
zscore_columns = []

for c in _columns_to_check_outliers:
    zscore_columns.append(f'zscore_{c}')

In [30]:
_query = ''
_threshold = 3

for c in zscore_columns:
    _begin = ' and ' if len(_query) > 0 else ''
    _query += f'{_begin}{c} < {_threshold}'

In [71]:
_query

'zscore_avg_daily_actions < 3 and zscore_avg_session_duration < 3 and zscore_session_avg_actions < 3 and zscore_session_avg_added_playlist < 3 and zscore_session_avg_ads < 3 and zscore_session_avg_dislikes < 3 and zscore_session_avg_errors < 3 and zscore_session_avg_friends < 3 and zscore_session_avg_help < 3 and zscore_session_avg_home < 3 and zscore_session_avg_likes < 3 and zscore_session_avg_playback < 3 and zscore_session_avg_settings < 3 and zscore_session_avg_songs < 3 and zscore_session_avg_time_away < 3 and zscore_time_window < 3'

In [72]:
df_users.count()

22278

In [31]:
df_users = df_users.filter(_query)

In [32]:
df_users.show(1, True, vertical = True)

-RECORD 0-------------------------------------------------
 canceled                          | 0                    
 male                              | 1                    
 paid                              | 1                    
 avg_daily_actions                 | 56.833333333333336   
 avg_session_duration              | 8.064825676115998    
 n_actions                         | 682                  
 n_added_to_playlist               | 20                   
 n_ads                             | 3                    
 n_days                            | 12                   
 n_dislikes                        | 10                   
 n_downgrades                      | 1                    
 n_errors                          | 3                    
 n_friends                         | 12                   
 n_help                            | 5                    
 n_home                            | 27                   
 n_likes                           | 25                 

In [21]:
df_users.cache()

DataFrame[canceled: int, male: int, paid: int, avg_daily_actions: double, avg_session_duration: double, n_actions: bigint, n_added_to_playlist: bigint, n_ads: bigint, n_days: bigint, n_dislikes: bigint, n_downgrades: bigint, n_errors: bigint, n_friends: bigint, n_help: bigint, n_home: bigint, n_likes: bigint, n_sess: bigint, n_settings: bigint, n_songs: bigint, n_upgrades: bigint, session_avg_actions: double, session_avg_added_playlist: double, session_avg_ads: double, session_avg_dislikes: double, session_avg_errors: double, session_avg_friends: double, session_avg_help: double, session_avg_home: double, session_avg_likes: double, session_avg_playback: double, session_avg_settings: double, session_avg_songs: double, session_avg_time_away: double, time_window: double]

In [24]:
df_users.count()

22278

In [None]:
_df_filtered = df[(np.abs(stats.zscore(df[_columns_to_check_outliers])) < 3).all(axis=1)]

In [39]:
df_users.where(df_users.userId == int(200002)).show(2, True, vertical = True)

-RECORD 0------------------------------------------
 n_dislikes                 | 5                    
 session_avg_settings       | 0.09123393902531743  
 n_help                     | 1                    
 userId                     | 200002               
 n_actions                  | 395                  
 session_avg_help           | 0.045616969512658714 
 n_settings                 | 2                    
 session_avg_playback       | 0.997184759370486    
 n_sess                     | 5                    
 avg_session_duration       | 5.75821870604782     
 session_avg_home           | 1.0491902987911503   
 n_downgrades               | 0                    
 canceled                   | 1                    
 session_avg_likes          | 0.6842545426898806   
 n_likes                    | 15                   
 session_avg_dislikes       | 0.22808484756329356  
 n_songs                    | 310                  
 n_upgrades                 | 1                    
 n_ads      

In [12]:
### WARN: Only round to display
# Enforces the order for some columns
df_users.select([sround(c, 0).cast(dataType = IntegerType()).alias(c) for c in columns_all]).show(2, True, vertical = True)

-RECORD 0--------------------------
 canceled                   | 0    
 male                       | 1    
 paid                       | 1    
 avg_daily_actions          | 57   
 avg_session_duration       | 8    
 n_actions                  | 682  
 n_added_to_playlist        | 20   
 n_ads                      | 3    
 n_days                     | 12   
 n_dislikes                 | 10   
 n_downgrades               | 1    
 n_errors                   | 3    
 n_friends                  | 12   
 n_help                     | 5    
 n_home                     | 27   
 n_likes                    | 25   
 n_sess                     | 9    
 n_settings                 | 4    
 n_songs                    | 557  
 n_upgrades                 | 0    
 session_avg_actions        | 18   
 session_avg_added_playlist | 1    
 session_avg_ads            | 0    
 session_avg_dislikes       | 0    
 session_avg_errors         | 0    
 session_avg_friends        | 0    
 session_avg_help           

#### OPTIONAL: Save the final dataset to a CSV file

In [21]:
df_users.select(columns_all).fillna(0).toPandas().to_csv('sparkify_data_full_dataset_final.csv', index = False)

In [None]:
df.agg(countDistinct(df.userId).alias('unique_users')).show()

In [None]:
df_users.orderBy(df_users.userId).join(df_sessions, on = 'userId').select(_columns).count()

In [None]:
df_users.orderBy(df_users.userId).join(df_sessions, on = 'userId').select(_columns).groupBy('canceled').agg(count(df_users.canceled).alias('total')).show()

- Advertises number (per session and all)
    - The user **100010** returned after some idle time and received a considerable amount of advertises;
    - Also, after thumbs down, I received two advertisements on four sounds. Then canceled the service.
- Number of sessions
- Paid subscription time
- Avg songs before an ad
- Number of skipped songs

In [None]:
df.schema.names

In [None]:
to_date(df.ts.cast(dataType=TimestampType()))

In [None]:
df.where(df.userId == user_id).select(['artist',
 'auth',
 'firstName',
 'gender',
 'itemInSession',
 'lastName',
 'length',
 'level', 
 'page',
 'sessionId',
 'song', 
 'ts', 
 'userId']).orderBy('sessionId', 'itemInSession').withColumn('datetime', date_format((df.ts/1000).cast(dataType=TimestampType()), 'HH:mm:ss dd-MM-YYYY')).show(350, True)

# Modeling
Split the full dataset into train, test, and validation sets. Test out several of the machine learning methods you learned. Evaluate the accuracy of the various models, tuning parameters as necessary. Determine your winning model based on test accuracy and report results on the validation set. Since the churned users are a fairly small subset, I suggest using F1 score as the metric to optimize.

In [77]:
columns_to_exclude = set(['userId'] + zscore_columns)

columns_to_use = list(set(df_users.columns) - columns_to_exclude)

columns_to_train = list(set(columns_to_use) - set(['canc']))

columns_to_use.sort()
columns_to_train.sort()

print(f'Columns: {columns_to_use}\n')
print(f'Columns to train: {columns_to_train}')

Columns: ['avg_daily_actions', 'avg_session_duration', 'canceled', 'male', 'n_actions', 'n_added_to_playlist', 'n_ads', 'n_days', 'n_dislikes', 'n_downgrades', 'n_errors', 'n_friends', 'n_help', 'n_home', 'n_likes', 'n_sess', 'n_settings', 'n_songs', 'n_upgrades', 'paid', 'session_avg_actions', 'session_avg_added_playlist', 'session_avg_ads', 'session_avg_dislikes', 'session_avg_errors', 'session_avg_friends', 'session_avg_help', 'session_avg_home', 'session_avg_likes', 'session_avg_playback', 'session_avg_settings', 'session_avg_songs', 'session_avg_time_away', 'time_window']

Columns to train: ['avg_daily_actions', 'avg_session_duration', 'canceled', 'male', 'n_actions', 'n_added_to_playlist', 'n_ads', 'n_days', 'n_dislikes', 'n_downgrades', 'n_errors', 'n_friends', 'n_help', 'n_home', 'n_likes', 'n_sess', 'n_settings', 'n_songs', 'n_upgrades', 'paid', 'session_avg_actions', 'session_avg_added_playlist', 'session_avg_ads', 'session_avg_dislikes', 'session_avg_errors', 'session_avg_fr

In [214]:
CHURN_LABEL = 'canceled'
TRAIN_SPLIT_RATIO = .8
TEST_SPLIT_RATIO = .2

SPLIT_RATIO = [TRAIN_SPLIT_RATIO, TEST_SPLIT_RATIO]

In [269]:
def plot_confusion_matrix(y_test, y_predictions):
    
    # auc = roc_auc_score(y_test, y_predictions)
    cm = confusion_matrix(y_test, y_predictions, labels = [1, 0])
    
    tn = cm[1, 1]
    tp = cm[0, 0]
    fp = cm[1, 0]
    fn = cm[0, 1]
    
    total = np.sum(cm) # tn + tp + fn + fp
    accuracy = (tp + tn) / total
    precision = (tp) / (tp + fp)
    recall = (tp) / (tp + fn) 
    
    print(cm)

def evaluate_multiclass_classifier(predictions, columns):
    metrics_to_evaluate = [ 'accuracy', 'f1', 'weightedPrecision', 'weightedRecall', 'recall']
    
    result = {}
    for metric in metrics_to_evaluate:
        evaluator = MulticlassClassificationEvaluator(labelCol = columns[0], predictionCol = columns[1], metricName = metric)
        value = evaluator.evaluate(predictions)
        result[metric] = value
        print(f'{metric}: {value}') 
    
    return result

def train_random_forest_classifier(data, columns, train_cloumns):
    
    # Split train/test
    (train_df, test_df) = data.randomSplit(SPLIT_RATIO, seed = 42)
    
    # Create the indexer for labels
    l_indexer = StringIndexer(inputCol = CHURN_LABEL, outputCol = 'idx_labels')
    f_binaries = VectorAssembler(inputCols = binary_features, outputCol = 'bin_features')
    f_numeric = VectorAssembler(inputCols = numeric_features, outputCol = 'num_features')
    
    # f_scaler = MaxAbsScaler(inputCol="num_features", outputCol="num_features_escaled")
    # f_scaler = Normalizer(inputCol = "num_features", outputCol = "num_features_escaled", p = 3)
    # f_scaler = MinMaxScaler(inputCol = 'num_features', outputCol = 'num_features_escaled', )
    f_scaler = StandardScaler(inputCol = 'num_features', outputCol = 'num_features_escaled', withStd = True, withMean = True)
    
    f_all = VectorAssembler(inputCols = [ 'bin_features' , 'num_features_escaled' ], outputCol = 'features')
    
    l_translator = IndexToString(inputCol = 'prediction', outputCol = 'predictedLabel', labels = [ 'Not churn', 'Churn' ])
    
    # rf_classifier = RandomForestClassifier(labelCol = 'idx_labels', featuresCol = 'features', numTrees = 10, maxBins = 5, impurity = 'entropy', minInstancesPerNode = 3, seed = 42)
    rf_classifier = RandomForestClassifier(labelCol = 'idx_labels', featuresCol = 'features', seed = 42)
    
    pipeline = Pipeline(stages = [ l_indexer, f_binaries, f_numeric, f_scaler, f_all, rf_classifier, l_translator ])
    
    # Train the model
    model = pipeline.fit(train_df)

    # Test the model
    predictions = model.transform(test_df)

    return model.stages[2], predictions


from pyspark.ml.classification import LogisticRegression

def train_logistic_regression(data, columns, train_cloumns):
    
    # Split train/test
    (train_df, test_df) = data.randomSplit(SPLIT_RATIO, seed = 42)
    
    # Create the indexer for labels
    l_indexer = StringIndexer(inputCol = CHURN_LABEL, outputCol = 'idx_labels')
    f_binaries = VectorAssembler(inputCols = binary_features, outputCol = 'bin_features')
    f_numeric = VectorAssembler(inputCols = numeric_features, outputCol = 'num_features')
    
    f_scaler = StandardScaler(inputCol = 'num_features', outputCol = 'num_features_escaled', withStd = True, withMean = True)
    
    f_all = VectorAssembler(inputCols = [ 'bin_features' , 'num_features_escaled' ], outputCol = 'features')
    
    l_translator = IndexToString(inputCol = 'prediction', outputCol = 'predictedLabel', labels = [ 'Not churn', 'Churn' ])
    
    lr = LogisticRegression(featuresCol = 'features', labelCol = 'idx_labels', maxIter = 1000, regParam = 0, elasticNetParam = 0, family = 'binomial', aggregationDepth = 15) 
    
    pipeline = Pipeline(stages = [ l_indexer, f_binaries, f_numeric, f_scaler, f_all, lr, l_translator ])
    
    # Train the model
    model = pipeline.fit(train_df)

    # Test the model
    predictions = model.transform(test_df)

    return model.stages[2], predictions
    

def create_pipeline(model):
    
    l_indexer = StringIndexer(inputCol = CHURN_LABEL, outputCol = 'idx_labels')
    f_binaries = VectorAssembler(inputCols = binary_features, outputCol = 'bin_features')
    f_numeric = VectorAssembler(inputCols = numeric_features, outputCol = 'num_features')
    f_scaler = StandardScaler(inputCol = 'num_features', outputCol = 'num_features_escaled', withStd = True, withMean = True)
    f_all = VectorAssembler(inputCols = [ 'bin_features' , 'num_features_escaled' ], outputCol = 'features')
    pipeline = Pipeline(stages = [ l_indexer, f_binaries, f_numeric, f_scaler, f_all, model ])
    return pipeline

def create_random_forest_pipeline():
    rf_classifier = RandomForestClassifier(labelCol = 'canceled', featuresCol = 'features', seed = 42)
    return create_pipeline(rf_classifier)

def create_gradient_boost_pipeline():
    gbt_classifier = GBTClassifier(labelCol = 'canceled', maxDepth = 5, maxIter = 100, seed = 42)
    return create_pipeline(gbt_classifier)

def create_logistic_regression_pipeline():
    lr_classifier = LogisticRegression(featuresCol = 'features', labelCol = 'idx_labels', weightCol = 'class_weights') 
    return create_pipeline(lr_classifier)

In [262]:
model, predictions = train_logistic_regression(df_users, columns_all, columns_to_train)

In [263]:
df_results = predictions.select(['canceled', 'prediction']).toPandas()
df_results['prediction'] = df_results.prediction.apply(int)

In [264]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 537  394]
 [ 179 3036]]


In [95]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 912  690]
 [ 313 5305]]


In [91]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 875  727]
 [ 345 5273]]


In [83]:
model, predictions = train_random_forest_classifier(df_users, columns_all, columns_to_train)

In [84]:
df_results = predictions.select(['canceled', 'prediction']).toPandas()
df_results['prediction'] = df_results.prediction.apply(int)

In [85]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 824  778]
 [ 116 5502]]


In [78]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 737  666]
 [ 123 4712]]


In [50]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 734  669]
 [ 133 4702]]


In [27]:
model, predictions = train_random_forest_classifier(df_users, columns_all, columns_to_train)

In [82]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

In [84]:
df_r = predictions.select('canceled', 'prediction').toPandas()
df_r['prediction'] = df_r.prediction.apply(int)

In [145]:
import uuid

from pyspark.ml.evaluation import Evaluator

class Recall(Evaluator):
    
    def __init__(self, predictionCol = 'prediction', labelCol = 'label'):
        self.predictionCol = predictionCol
        self.labelCol = labelCol
        self.uid = str(uuid.uuid4())

    def evaluate(self, dataset):
        
        tp = dataset.where((dataset[self.labelCol] == 1) & (dataset[self.predictionCol] == 1)).count()
        fp = dataset.where((dataset[self.labelCol] == 0) & (dataset[self.predictionCol] == 1)).count()
        tn = dataset.where((dataset[self.labelCol] == 0) & (dataset[self.predictionCol] == 0)).count()
        fn = dataset.where((dataset[self.labelCol] == 1) & (dataset[self.predictionCol] == 0)).count()
        
        # fnr = fn / (1 if (tp + fn) == 0 else (tp + fn))
        
        return (100 / (tp + fn )) * tp

    def isLargerBetter(self):
        return True

In [270]:
_eval = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'canceled', metricName = 'areaUnderROC') # Recall(labelCol = 'canceled')

def create_grid_search(pipeline, param_grid):
    
    return CrossValidator(estimator = pipeline, estimatorParamMaps = param_grid, evaluator = _eval, numFolds = 5, parallelism = 16, seed = 42)
    # return CrossValidator(estimator = pipeline, estimatorParamMaps = param_grid, evaluator = _eval, numFolds = 3, parallelism = 16, seed = 42)

def random_forest_grid_search(pipeline):
    
    model = pipeline.getStages()[-1]

    grid_rf = ParamGridBuilder().addGrid(model.maxDepth, [5, 10, 15, 20, 25]) 
    grid_rf = grid_rf.addGrid(model.impurity, ['gini']) 
    grid_rf = grid_rf.addGrid(model.maxBins, [5, 10, 15, 20, 25, 30, 35, 40])
    grid_rf = grid_rf.addGrid(model.numTrees, [10, 20, 40, 60, 70])
    grid_rf = grid_rf.build()
    
    print(f'Number of models to train: {len(grid_rf)}')
        
    return create_grid_search(pipeline, grid_rf)

def gradient_boost_grid_search(pipeline):
    
    model = pipeline.getStages()[-1]

    grid_gbt = ParamGridBuilder().addGrid(model.maxDepth, [2, 4, 6, 8, 10])
    grid_gbt = grid_gbt.addGrid(model.maxIter, [20, 25, 40, 50, 100])
    grid_gbt = grid_gbt.addGrid(model.maxBins, [2])
    grid_gbt = grid_gbt.addGrid(model.subsamplingRate, [.5, .8, 1])
    grid_gbt = grid_gbt.build()
    
    print(f'Number of models to train: {len(grid_gbt)}')
   
    return create_grid_search(pipeline, grid_gbt)

def logistic_regression_grid_search(pipeline):
    
    model = pipeline.getStages()[-1]

    grid_lr = ParamGridBuilder().addGrid(model.aggregationDepth, [2, 5, 10])
    grid_lr = grid_lr.addGrid(model.elasticNetParam, [.0, .5, 1.0])
    grid_lr = grid_lr.addGrid(model.fitIntercept, [ True, False ])
    grid_lr = grid_lr.addGrid(model.standardization, [ True, False ])
    grid_lr = grid_lr.addGrid(model.maxIter, [10, 100, 1000])
    grid_lr = grid_lr.addGrid(model.regParam, [.0, .01, .5, 2.0])
    grid_lr = grid_lr.addGrid(model.tol, [.0001])
    grid_lr = grid_lr.addGrid(model.weightCol, [ 'class_weights' ])

    grid_lr = grid_lr.build()
    
    print(f'Number of models to train: {len(grid_lr)}')
   
    return create_grid_search(pipeline, grid_lr)

In [271]:
SPLIT_RATIO

[0.8, 0.2]

In [217]:
# Create the new dataframe
# data = df_users.select(columns_to_use).fillna(0)

# Split train/test
(train_df, test_df) = df_users.randomSplit(SPLIT_RATIO, seed = 42)

In [218]:
train_df.cache()
test_df.cache()

DataFrame[canceled: int, male: int, paid: int, avg_daily_actions: double, avg_session_duration: double, n_actions: bigint, n_added_to_playlist: bigint, n_ads: bigint, n_days: bigint, n_dislikes: bigint, n_downgrades: bigint, n_errors: bigint, n_friends: bigint, n_help: bigint, n_home: bigint, n_likes: bigint, n_sess: bigint, n_settings: bigint, n_songs: bigint, n_upgrades: bigint, session_avg_actions: double, session_avg_added_playlist: double, session_avg_ads: double, session_avg_dislikes: double, session_avg_errors: double, session_avg_friends: double, session_avg_help: double, session_avg_home: double, session_avg_likes: double, session_avg_playback: double, session_avg_settings: double, session_avg_songs: double, session_avg_time_away: double, time_window: double, zscore_avg_daily_actions: double, zscore_avg_session_duration: double, zscore_session_avg_actions: double, zscore_session_avg_added_playlist: double, zscore_session_avg_ads: double, zscore_session_avg_dislikes: double, zs

In [219]:
train_df.count() + test_df.count()

20875

In [266]:
balancing_ratio = train_df.filter(train_df.canceled == 0).count() / train_df.count()

train_df = train_df.withColumn('class_weights', when(train_df.canceled == 1, balancing_ratio).otherwise(1 - balancing_ratio))

In [272]:
train_df.show(1, vertical = True)

-RECORD 0------------------------------------------------
 canceled                          | 0                   
 male                              | 0                   
 paid                              | 0                   
 avg_daily_actions                 | 1.0                 
 avg_session_duration              | 0.0                 
 n_actions                         | 1                   
 n_added_to_playlist               | 0                   
 n_ads                             | 0                   
 n_days                            | 1                   
 n_dislikes                        | 0                   
 n_downgrades                      | 0                   
 n_errors                          | 0                   
 n_friends                         | 0                   
 n_help                            | 0                   
 n_home                            | 0                   
 n_likes                           | 0                   
 n_sess       

#### Logistic Regression

In [273]:
pipeline = create_logistic_regression_pipeline()
lr = logistic_regression_grid_search(pipeline)
rf_results = lr.fit(train_df)

Number of models to train: 432


In [160]:
pipeline = create_logistic_regression_pipeline()
lr = logistic_regression_grid_search(pipeline)
rf_results = lr.fit(train_df)

Number of models to train: 216


In [274]:
predictions = rf_results.bestModel.transform(test_df)
# predictions = rf_results.bestModel.transform(train_df)

In [275]:
df_results = predictions.select(['canceled', 'prediction']).toPandas()
df_results['prediction'] = df_results.prediction.apply(int)

In [276]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 723  208]
 [ 443 2772]]


In [223]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 596  335]
 [ 340 2875]]


In [209]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 733  440]
 [ 423 3643]]


In [212]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 2327  1237]
 [ 1268 10804]]


In [196]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[2048 1087]
 [1081 9439]]


In [187]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[1016  586]
 [ 610 5008]]


In [163]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[1016  586]
 [ 610 5008]]


In [277]:
max(rf_results.avgMetrics)

0.8321030434677803

In [278]:
list(list(zip(rf_results.avgMetrics, rf_results.getEstimatorParamMaps())))[-1]

(0.5,
 {Param(parent='LogisticRegression_ef68b4aed01a', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 10,
  Param(parent='LogisticRegression_ef68b4aed01a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 1.0,
  Param(parent='LogisticRegression_ef68b4aed01a', name='fitIntercept', doc='whether to fit an intercept term.'): False,
  Param(parent='LogisticRegression_ef68b4aed01a', name='standardization', doc='whether to standardize the training features before fitting the model.'): False,
  Param(parent='LogisticRegression_ef68b4aed01a', name='maxIter', doc='max number of iterations (>= 0).'): 1000,
  Param(parent='LogisticRegression_ef68b4aed01a', name='regParam', doc='regularization parameter (>= 0).'): 2.0,
  Param(parent='LogisticRegression_ef68b4aed01a', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 0.0001,
  Pa

In [176]:
rf_results.bestModel.stages[-1].extractParamMap()

{Param(parent='LogisticRegression_1f4e8dd33901', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2,
 Param(parent='LogisticRegression_1f4e8dd33901', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0,
 Param(parent='LogisticRegression_1f4e8dd33901', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial.'): 'auto',
 Param(parent='LogisticRegression_1f4e8dd33901', name='featuresCol', doc='features column name'): 'features',
 Param(parent='LogisticRegression_1f4e8dd33901', name='fitIntercept', doc='whether to fit an intercept term'): False,
 Param(parent='LogisticRegression_1f4e8dd33901', name='labelCol', doc='label column name'): 'idx_labels',
 Param(parent='LogisticRegression_1f4e8dd33901', name='maxIter', doc='maximum number of iterations (>= 

In [193]:
rf_results.avgMetrics

[0.7569201743883416,
 0.7458524531641905,
 0.5065533019305485,
 0.5,
 0.7656400367053511,
 0.7451379672521136,
 0.5065533019305485,
 0.5,
 0.7686666415584142,
 0.7451379672521136,
 0.5065533019305485,
 0.5,
 0.7598541367196125,
 0.754917110340065,
 0.6789049923471427,
 0.6442362144668135,
 0.768159789539252,
 0.7569633105158089,
 0.6789049923471427,
 0.6442362144668135,
 0.7735699410985479,
 0.7569633105158089,
 0.6789049923471427,
 0.6442362144668135,
 0.7569201743883416,
 0.7362262441393639,
 0.5,
 0.5,
 0.7656400367053511,
 0.7403064173269568,
 0.5,
 0.5,
 0.7686666415584142,
 0.7403064173269568,
 0.5,
 0.5,
 0.7598541367196125,
 0.7559669757940859,
 0.5,
 0.5,
 0.768159789539252,
 0.7558486268810651,
 0.5,
 0.5,
 0.7735699410985479,
 0.7558486268810651,
 0.5,
 0.5,
 0.7569201743883416,
 0.734959860174886,
 0.5,
 0.5,
 0.7656400367053511,
 0.7353038442202462,
 0.5,
 0.5,
 0.7686666415584142,
 0.7353038442202462,
 0.5,
 0.5,
 0.7598541367196125,
 0.7533653046580704,
 0.5,
 0.5,
 0.76

#### Random Forest

In [109]:
pipeline = create_random_forest_pipeline()
cv_rf = random_forest_grid_search(pipeline)

Number of models to train: 200


In [110]:
cv_rf_results = cv_rf.fit(train_df)

In [111]:
predictions = cv_rf_results.bestModel.transform(test_df)

In [112]:
df_results = predictions.select(['canceled', 'prediction']).toPandas()
df_results['prediction'] = df_results.prediction.apply(int)

In [113]:
plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 605  326]
 [ 120 3095]]


In [82]:
list(list(zip(cv_rf_results.avgMetrics, cv_rf_results.getEstimatorParamMaps())))[-1]

(64.35829817256004,
 {Param(parent='RandomForestClassifier_5b6344e9b3f5', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 25,
  Param(parent='RandomForestClassifier_5b6344e9b3f5', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'gini',
  Param(parent='RandomForestClassifier_5b6344e9b3f5', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
  Param(parent='RandomForestClassifier_5b6344e9b3f5', name='numTrees', doc='Number of trees to train (>= 1).'): 70})

#### Gradient Boost

In [114]:
pipeline = create_gradient_boost_pipeline()
cv_gbt = gradient_boost_grid_search(pipeline)
_results = cv_gbt.fit(train_df)

Number of models to train: 75


In [None]:
pipeline = create_gradient_boost_pipeline()
cv_gbt = gradient_boost_grid_search(pipeline)
_results = cv_gbt.fit(train_df)

In [115]:
predictions = _results.bestModel.transform(test_df)

df_results = predictions.select(['canceled', 'prediction']).toPandas()
df_results['prediction'] = df_results.prediction.apply(int)

plot_confusion_matrix(df_results['canceled'], df_results['prediction'])

[[ 637  294]
 [ 274 2941]]


In [144]:
_results.bestModel.stages[-1].extractParamMap()

{Param(parent='GBTClassifier_ba2654d5e488', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='GBTClassifier_ba2654d5e488', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='GBTClassifier_ba2654d5e488', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'all',
 Param(parent='GBTClassifier_ba2654d5e488', name='featuresCol', doc='features column name'): 'features',
 Param(parent='GBTClassifier_ba2654d5e488', name='labelCol', doc='label column name'): 'ca

##### Estimator params and score

In [None]:
import pandas as pd

scores = cv_gbt_results.avgMetrics
params = [{p.name: v for p, v in m.items()} for m in cv_gbt.getEstimatorParamMaps()]
params_pd = pd.DataFrame(params)
params_pd['score'] = scores
params_pd

# Final Steps
Clean up your code, adding comments and renaming variables to make the code easier to read and maintain. Refer to the Spark Project Overview page and Data Scientist Capstone Project Rubric to make sure you are including all components of the capstone project and meet all expectations. Remember, this includes thorough documentation in a README file in a Github repository, as well as a web app or blog post.