# Feature Engineering on cluster for Downgraded users

##### The feature is marked as candidate feature if

      |Δ| > 0.10

In [1]:
print("Welcome to my EMR Notebook!")

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
49,application_1597998755054_0085,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Welcome to my EMR Notebook!

In [2]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, \
                                  avg, from_unixtime, split, min, max, lit, mean, col
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import abs as Fabs
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

import datetime
from pyspark.sql.types import IntegerType, TimestampType, FloatType
from pyspark.sql.functions import to_date, year, month, dayofmonth, dayofweek, hour, date_format, substring

import numpy as np
import time

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Create spark session
spark = SparkSession \
    .builder \
    .appName("Sparkify") \
    .getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Set time parser policy
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Read in full sparkify dataset
event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"
data = spark.read.json(event_data)
data.head()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(artist='Popol Vuh', auth='Logged In', firstName='Shlok', gender='M', itemInSession=278, lastName='Johnson', length=524.32934, level='paid', location='Dallas-Fort Worth-Arlington, TX', method='PUT', page='NextSong', registration=1533734541000, sessionId=22683, song='Ich mache einen Spiegel - Dream Part 4', status=200, ts=1538352001000, userAgent='"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='1749042')

### Prepare data

In [5]:
# Function that returns all users of a specified churn group
def get_users(churn):
    """Returns all distinct users of a specified churn group.
    
    Args:
        churn (int): A specified churn group - 1 for churned and 0 for non-churned users.
        
    Returns:
        DataFrame: A dataframe query with filtered users.
    """
    return data.where(data.churn == 1).select('userId').dropDuplicates()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Count all users
data.select('userId').dropDuplicates().count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

22278

In [7]:
# Remove rows with missing users
data = data.where(~((col('userId').isNull()) | (col('userId') == '')))

# Exclude non-relevant columns
data = data.drop('firstName')
data = data.drop('lastName')

# Add tsDate and date column
data = data.withColumn('tsDate', (col('ts') / 1000.0).cast(TimestampType()))
data = data.withColumn('date', date_format(col('tsDate'), 'yyyy-MM-dd').alias('date').cast('date'))

# Label churned users using Submit Downgrade event
query_churn_by_cc = data.where(data.page == 'Submit Downgrade')
canceled = query_churn_by_cc.select('userId').dropDuplicates().select('userId')
canceled_uids = [row.userId for row in canceled.collect()];
set_churn = udf(lambda x: 1 if x in canceled_uids else 0, IntegerType())
data = data.withColumn('churn', set_churn('userId'))

# Add [userRowId] column that assigns a 1-based index to every user's log ordered by [ts]
w =  Window.partitionBy(data.userId).orderBy('ts', 'itemInSession')
data = data.withColumn('userRowId', row_number().over(w))

# Add [userRowDescId] column that assigns a 1-based index to every user's log ordered by [ts] descending.
w =  Window.partitionBy(data.userId).orderBy(col('ts').desc(), col('itemInSession').desc())
data = data.withColumn('userRowDescId', row_number().over(w))

# Add last level column
last_levels = dict()
for row in data.where(data.userRowDescId == 1).select('userId', 'level').collect():
    last_levels[row.userId] = row.level
get_level = udf(lambda userId: last_levels[userId])
data = data.withColumn('lastLevel', get_level('userId'))

# Prepare labels
labels = data.select(col('churn').alias('label'), 'userId').dropDuplicates()

# Count churned users
print(f'Churned (downgraded) users: {get_users(1).count()}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Churned (downgraded) users: 5103

In [8]:
data.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- tsDate: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- churn: integer (nullable = true)
 |-- userRowId: integer (nullable = true)
 |-- userRowDescId: integer (nullable = true)
 |-- lastLevel: string (nullable = true)

### Global Variables

In [40]:
# Any feature whose delta fulfills this criterion will be automatically selected (by the show_delta function)
delta_threshold = 0.10

# The collection of all selected engineered features
selected_features = []

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Queries

In [10]:
# All unique users
users = data.select('userId').dropDuplicates()

# Pages without churn definition events
page_data = data.where(~data.page.isin(['Submit Downgrade', 'Upgrade', 'Submit Upgrade', 'Cancel', 'Cancellation Confirmation'])) \
    .select('churn', 'page', 'userId', 'sessionId', 'ts', 'date')

# Calc session duration (in hours)
session_hours = page_data \
    .groupby('userId', 'sessionId') \
    .agg(((max('ts') - min('ts'))/1000/3600).alias('sessionHours'))

# User interactions duration per user (in hours)
user_hours = page_data \
    .groupby('userId', 'sessionId') \
    .agg(((max('ts') - min('ts'))/1000/3600).alias('sessionHours')) \
    .groupby('userId') \
    .agg(Fsum('sessionHours').alias('hours'))  

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Functions

In [11]:
# We'll calculate the ratio difference (delta) and show it through this function.
# Note that we haven't incroporated the delta calculation in spark querying due to 
# the simplicity of a manual solution (for a very limited number of features though).
def show_delta(feature, v1, v0, force_selection=False):
    """Calculate delta and print it.
    
    If delta is greater than delta_threshold the feature is selected.
    
    Args:
        feature (string): The name of a feature.
        v1 (float): The statistical value of the churned users.
        v0 (float): The statistical value of the non-churned users.
        force_selection (bool): If True then the feature is selected without the threshold condition. 
    Returns:
        None
    """
    # Calc delta
    delta = (v1 - v0)/(v1 + v0)
    
    # Delete the feature if it already exists (to avoid duplicates):
    ix = None
    try:
        ix = [x['feature'] for x in selected_features].index(feature)
        del selected_features[ix]
    except ValueError:
        pass
    
    if (force_selection == True) or (abs(delta) > delta_threshold):
        selected_features.append({'feature': feature, 'delta': delta})
        print(f'Δ for {feature} feature: {round(delta, 4)} (SELECTED)')
    else:
        print(f'Δ for {feature} feature: {round(delta, 4)}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Count page logs
def page_count(page):
    return page_data \
        .where(data.page == page) \
        .groupby('userId') \
        .count() \
        .select('userId', col('count').alias(page.replace(' ', '') + 'Count'))

# Average page count per session hour
def page_session_hour(page):
    return page_data \
        .where(data.page == page) \
        .join(session_hours, ['userId', 'sessionId'], 'inner') \
        .groupby( 'userId', 'sessionId', 'sessionHours') \
        .agg((count('userId')/col('sessionHours')).alias('avgPerSession')) \
        .groupby('userId') \
        .agg(avg('avgPerSession').alias('avg')) \
        .select('userId', col('avg').alias(page.replace(' ', '') + 'PerSessionHour'))

# Average page count per hour
def page_hour(page):
    return page_data \
        .where(data.page == page) \
        .join(user_hours, 'userId', 'inner') \
        .groupby('userId', 'hours') \
        .agg((count('userId')/col('hours')).alias('avg')) \
        .select('userId', col('avg').alias(page.replace(' ', '') + 'PerHour'))

# Average page count per day
def page_day(page):
    return page_data \
        .where(data.page == page) \
        .groupby('userId', 'date') \
        .count() \
        .groupby('userId') \
        .agg(avg('count').alias(page.replace(' ', '') + 'PerDay'))


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Features

In [13]:
    f_Gender = data \
        .select('userId', 'gender') \
        .dropDuplicates() \
        .replace(['M', 'F'], ['0', '1'], 'gender') \
        .select('userId', col('gender').cast('int').alias('Gender'))

    f_LastLevel = data \
        .select('userId', 'lastLevel') \
        .dropDuplicates() \
        .replace(['free', 'paid'], ['0', '1'], 'lastLevel') \
        .select('userId', col('lastLevel').cast('int').alias('LastLevel'))

    f_LogCount = data \
        .groupby('userId') \
        .agg(count('userId').alias('LogCount'))

    f_SongCount = data \
        .where(data.page == 'NextSong') \
        .groupby('userId') \
        .agg(count('userId').alias('SongCount'))

    f_NonSongCount = data \
        .where(data.page != 'NextSong') \
        .groupby('userId') \
        .agg(count('userId').alias('NonSongCount'))

    f_AboutCount = page_count('About')

    f_ThumbsUpCount = page_count('Thumbs Up')

    f_RollAdvertCount = page_count('Roll Advert')

    f_SessionCount = data \
        .select('userId', 'sessionId') \
        .dropDuplicates() \
        .groupby('userId') \
        .agg(count('userId').alias('SessionCount'))

    f_AvgSessionLength = data \
       .groupby('userId', 'sessionId') \
       .agg(((max('ts') - min('ts'))/1000).alias('sessionLength')) \
       .groupby('userId') \
       .agg(avg('sessionLength').alias('AvgSessionLength')) \

    f_AvgSessionGap = data \
        .groupby('userId', 'sessionId') \
        .agg(min('ts').alias('startTime'), max('ts').alias('endTime')) \
        .groupby('userId') \
        .agg(count('userId').alias('sessionCount'), \
            ((max('endTime') - min('startTime'))/1000).alias('observationPeriodTime'), \
            (Fsum(col('endTime') - col('startTime'))/1000).alias('totalSessionTime')) \
        .where(col('sessionCount') > 1) \
        .join(users, 'userId', 'outer') \
        .fillna(0) \
        .select('userId', \
            (col('observationPeriodTime') - col('totalSessionTime')/(col('sessionCount') - 1)).alias('AvgSessionGap'))

    f_DowngradePerSessionHour = page_session_hour('Downgrade')

    f_ErrorPerSessionHour = page_session_hour('Error')

    f_SettingsPerSessionHour = page_session_hour('Settings')

    f_SaveSettingsPerSessionHour = page_session_hour('Save Settings')

    f_LogoutPerSessionHour = page_session_hour('Logout')

    f_SubmitDowngradePerSessionHour = page_session_hour('Submit Downgrade')

    f_RollAdvertPerHour = page_hour('Roll Advert')

    f_ThumbsDownPerHour = page_hour('Thumbs Down')

    f_UpgradePerHour = page_hour('Upgrade')

    f_SubmitUpgradePerHour = page_hour('Submit Upgrade')

    f_SessionsPerDay = data \
        .select('userId', 'date', 'sessionId') \
        .dropDuplicates() \
        .groupby('userId', 'date') \
        .count() \
        .groupby('userId') \
        .agg(avg('count').alias('SessionsPerDay'))

    f_AddFriendPerDay = page_day('Add Friend')

    f_RollAdvertPerDay = page_day('Roll Advert')

    f_ThumbsDownPerDay = page_day('Thumbs Down')

    f_ThumbsUpPerDay = page_day('Thumbs Up')

    f_TotalSongLength = data \
        .where(data.page == 'NextSong') \
        .select('userId', 'length') \
        .groupby('userId') \
        .agg(Fsum('length').alias('TotalSongLength'))

    f_UniqueSongCount = data \
        .where(data.page == 'NextSong') \
        .select('userId', 'song') \
        .dropDuplicates() \
        .groupby('userId') \
        .agg(count('userId').alias('UniqueSongCount'))

    f_UniqueSongShare = data \
        .where(data.page == 'NextSong') \
        .select('userId', 'song') \
        .dropDuplicates() \
        .groupby('userId') \
        .count() \
        .join(f_SongCount, on = ['userId'], how = 'inner') \
        .select('userId', (col('count')/col('SongCount')).alias('UniqueSongShare')) 


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Feature Check

### AvgSessionGap (YES)

In [13]:
labels.join(f_AvgSessionGap, 'userId', 'outer') \
    .groupby('label') \
    .agg(avg('AvgSessionGap')) \
    .show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+------------------+
|label|avg(AvgSessionGap)|
+-----+------------------+
|    1| 4244186.260620306|
|    0| 3304447.189541696|
+-----+------------------+

In [41]:
show_delta('AvgSessionGap', 4244186.260620306, 3304447.189541696)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for AvgSessionGap feature: 0.1245 (SELECTED)

### Per hour features

In [16]:
# Average page count per hour, per churn
page_data \
    .join(user_hours, 'userId', 'inner') \
    .groupby('churn', 'userId', 'page', 'hours') \
    .agg((count('userId')/col('hours')).alias('CountPerHour')) \
    .groupby('churn', 'page') \
    .agg(avg('CountPerHour').alias('AvgCountPerHour')) \
    .sort('page', 'churn') \
    .show(100)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+--------------------+
|churn|                page|     AvgCountPerHour|
+-----+--------------------+--------------------+
|    0|               About| 0.07974660299961202|
|    1|               About|0.038796560736421515|
|    0|          Add Friend|  0.3323238283105589|
|    1|          Add Friend| 0.26757162138438756|
|    0|     Add to Playlist|  0.4307492174289696|
|    1|     Add to Playlist|  0.4065724958388631|
|    0|              Cancel| 0.27058577119877714|
|    1|              Cancel|0.021713037494027587|
|    0|Cancellation Conf...| 0.27058577119877714|
|    1|Cancellation Conf...|0.021713037494027587|
|    0|           Downgrade| 0.15553974027070466|
|    1|           Downgrade| 0.14083388826729795|
|    0|               Error| 0.04366372583976314|
|    1|               Error|  0.0239287671949968|
|    0|                Help|   0.121046621770175|
|    1|                Help| 0.09276112061952985|
|    0|                Home|  1.3876360760280084|


##### Selected per hour features:

 - About
 - Home
 - Help
 - Logout
 - Roll Advert
 - Save Settings
 - Settings
 - Error
 - Add Friend

In [44]:
show_delta('AboutPerHour', 0.038796560736421515, 0.07974660299961202)
show_delta('HomePerHour', 0.6903134160710036, 1.3876360760280084)
show_delta('HelpPerHour', 0.09276112061952985, 0.121046621770175)
show_delta('LogoutPerHour', 0.21806502326821595, 1.4964197305552223)
show_delta('RollAdvertPerHour', 0.41589193128408114, 0.7885870813983161)
show_delta('SaveSettingsPerHour', 0.026714255097028877, 0.051154518301673335)
show_delta('SettingsPerHour', 0.10628656555573869, 0.15361547325206673)
show_delta('ErrorPerHour', 0.0239287671949968, 0.04366372583976314)
show_delta('AddFriendPerHour', 0.26757162138438756, 0.3323238283105589)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for AboutPerHour feature: -0.3454 (SELECTED)
Δ for HomePerHour feature: -0.3356 (SELECTED)
Δ for HelpPerHour feature: -0.1323 (SELECTED)
Δ for LogoutPerHour feature: -0.7456 (SELECTED)
Δ for RollAdvertPerHour feature: -0.3094 (SELECTED)
Δ for SaveSettingsPerHour feature: -0.3139 (SELECTED)
Δ for SettingsPerHour feature: -0.1821 (SELECTED)
Δ for ErrorPerHour feature: -0.292 (SELECTED)
Δ for AddFriendPerHour feature: -0.1079 (SELECTED)

### Per day features

In [17]:
# Average page count per day, per churn
page_data \
    .groupby('churn', 'userId', 'page', 'date') \
    .count() \
    .groupby('churn', 'userId', 'page') \
    .agg(avg('count').alias('AvgCountPerDay')) \
    .groupby('churn', 'page') \
    .agg(avg('AvgCountPerDay').alias('AvgCountPerDay')) \
    .sort('page', 'churn') \
    .show(100)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+------------------+
|churn|                page|    AvgCountPerDay|
+-----+--------------------+------------------+
|    0|               About|1.2359981623425653|
|    1|               About|1.1769044524425163|
|    0|          Add Friend|2.4145275202506804|
|    1|          Add Friend| 2.663828774404083|
|    0|     Add to Playlist|   2.5970465586921|
|    1|     Add to Playlist|3.0302791793537764|
|    0|              Cancel|               1.0|
|    1|              Cancel|               1.0|
|    0|Cancellation Conf...|               1.0|
|    1|Cancellation Conf...|               1.0|
|    0|           Downgrade|1.8330022091560143|
|    1|           Downgrade|1.9526806700667658|
|    0|               Error|1.0759669936571994|
|    1|               Error|1.0808030144777132|
|    0|                Help| 1.433259928460583|
|    1|                Help|1.4588501845064854|
|    0|                Home|  3.52374669352495|
|    1|                Home|3.8617029426

##### Selected per day features: 

 - NextSong


In [45]:
show_delta('NextSongPerDay', 74.89393349200786, 57.47187401184856)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for NextSongPerDay feature: 0.1316 (SELECTED)

### SongCount (YES)

In [18]:
labels.join(f_SongCount, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('SongCount')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+------------------+
|label|    avg(SongCount)|
+-----+------------------+
|    1|1748.1493239271017|
|    0|  694.583173216885|
+-----+------------------+

In [53]:
show_delta('SongCount', 1748.1493239271017, 694.583173216885)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for SongCount feature: 0.4313 (SELECTED)

### NonSongCount (YES)

In [20]:
labels.join(f_NonSongCount, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('NonSongCount')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-----------------+
|label|avg(NonSongCount)|
+-----+-----------------+
|    1| 386.071526553008|
|    0|200.2214847161572|
+-----+-----------------+

In [55]:
show_delta('NonSongCount', 386.071526553008, 200.2214847161572)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for NonSongCount feature: 0.317 (SELECTED)

### SessionCount (NO)

In [21]:
labels.join(f_SessionCount, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('SessionCount')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+------------------+
|label| avg(SessionCount)|
+-----+------------------+
|    1| 23.54830491867529|
|    0|19.505735080058223|
+-----+------------------+

In [22]:
show_delta('SessionCount', 23.54830491867529, 19.505735080058223)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for SessionCount feature: 0.0939

### AvgSessionLength (NO)

In [22]:
labels.join(f_AvgSessionLength, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('AvgSessionLength')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+---------------------+
|label|avg(AvgSessionLength)|
+-----+---------------------+
|    1|    18895.01848812512|
|    0|   15908.077736444633|
+-----+---------------------+

In [23]:
show_delta('AvgSessionLength', 18895.01848812512, 15908.077736444633)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for AvgSessionLength feature: 0.0858

### TotalSongLength (YES)

In [23]:
labels.join(f_TotalSongLength, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('TotalSongLength')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+
|label|avg(TotalSongLength)|
+-----+--------------------+
|    1|   434794.7743775052|
|    0|    172764.785853936|
+-----+--------------------+

In [56]:
show_delta('TotalSongLength', 434794.7743775052, 172764.785853936)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for TotalSongLength feature: 0.4313 (SELECTED)

### SessionsPerDay (NO)

In [24]:
labels.join(f_SessionsPerDay, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('SessionsPerDay')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-------------------+
|label|avg(SessionsPerDay)|
+-----+-------------------+
|    1| 1.2262613774098055|
|    0| 1.3028645682426432|
+-----+-------------------+

In [25]:
show_delta('SessionsPerDay', 1.2262613774098055, 1.3028645682426432)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for SessionsPerDay feature: -0.0303

### UniqueSongCount (YES)

In [25]:
labels.join(f_UniqueSongCount, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('UniqueSongCount')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+
|label|avg(UniqueSongCount)|
+-----+--------------------+
|    1|  1501.8208896727415|
|    0|   628.5352547307133|
+-----+--------------------+

In [57]:
show_delta('UniqueSongCount', 1501.8208896727415, 628.5352547307133)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for UniqueSongCount feature: 0.4099 (SELECTED)

### UniqueSongShare (NO)

In [26]:
labels.join(f_UniqueSongShare, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('UniqueSongShare')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------------------+
|label|avg(UniqueSongShare)|
+-----+--------------------+
|    1|   0.902333929373722|
|    0|  0.9504775922825538|
+-----+--------------------+

In [27]:
show_delta('UniqueSongShare', 0.902333929373722, 0.9504775922825538)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for UniqueSongShare feature: -0.026

### LogCount (YES)

In [27]:
labels.join(f_LogCount, 'userId', 'outer') \
    .fillna(0) \
    .groupby('label').agg(avg('LogCount')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+------------------+
|label|     avg(LogCount)|
+-----+------------------+
|    1|2134.2208504801097|
|    0| 894.8046579330422|
+-----+------------------+

In [58]:
show_delta('LogCount', 2134.2208504801097, 894.8046579330422)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for LogCount feature: 0.4092 (SELECTED)

### Per session-hour features

In [28]:
page_data \
    .join(session_hours, ['userId', 'sessionId'], 'inner') \
    .groupby('churn', 'page', 'userId', 'sessionId', 'sessionHours') \
    .agg((count('userId')/col('sessionHours')).alias('AvgPerSessionHour')) \
    .groupby('churn', 'page', 'userId') \
    .agg(avg('AvgPerSessionHour').alias('AvgPerSessionHour')) \
    .groupby('churn', 'page') \
    .agg(avg('AvgPerSessionHour').alias('AvgPerSessionHour')) \
    .sort('page', 'churn') \
    .show(100)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-------------------+-------------------+
|churn|               page|  AvgPerSessionHour|
+-----+-------------------+-------------------+
|    0|              About| 1.3444645752997937|
|    1|              About|  1.623732947278581|
|    0|         Add Friend| 11.101005878338443|
|    1|         Add Friend|  9.002988046530247|
|    0|    Add to Playlist| 0.8910132604671539|
|    1|    Add to Playlist| 0.8612317970890859|
|    0|          Downgrade| 0.7068792406841425|
|    1|          Downgrade| 0.5054197692132388|
|    0|              Error| 0.8401778912640101|
|    1|              Error|0.35152059539334196|
|    0|               Help| 1.2071021264084552|
|    1|               Help|  1.570043252059434|
|    0|               Home| 13.293379077967844|
|    1|               Home| 12.174322714941066|
|    0|              Login|   916.384243862892|
|    0|             Logout|  21.58056847479438|
|    1|             Logout| 19.548823872578662|
|    0|           NextSong| 28.963111939

##### Selected per session hour features:

 - Error
 - Roll Advert
 - Save Settings
 - Settings
 - Thumbs Down

In [47]:
show_delta('ErrorPerSessionHour', 0.35152059539334196, 0.8401778912640101)
show_delta('RollAdvertPerSessionHour', 2.099524786665363, 2.5716465393871033)
show_delta('SaveSettingsPerSessionHour', 1.572404364599302, 2.247808990781459)
show_delta('SettingsPerSessionHour', 1.0142441973565337, 1.8068410162538084)
show_delta('ThumbsDownPerSessionHour', 2.347093193861148, 3.473955622402571)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for ErrorPerSessionHour feature: -0.4101 (SELECTED)
Δ for RollAdvertPerSessionHour feature: -0.1011 (SELECTED)
Δ for SaveSettingsPerSessionHour feature: -0.1768 (SELECTED)
Δ for SettingsPerSessionHour feature: -0.281 (SELECTED)
Δ for ThumbsDownPerSessionHour feature: -0.1936 (SELECTED)

### Page count features

In [29]:
page_data \
    .groupby('churn', 'page', 'userId') \
    .count() \
    .groupby('churn', 'page') \
    .agg(avg('count').alias('AvgPageCount')) \
    .sort('page', 'churn') \
    .show(100)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-------------------+------------------+
|churn|               page|      AvgPageCount|
+-----+-------------------+------------------+
|    0|              About|7.1716640190627485|
|    1|              About| 4.676691729323308|
|    0|         Add Friend|14.336133902198885|
|    1|         Add Friend|32.199802761341225|
|    0|    Add to Playlist| 21.25598663449044|
|    1|    Add to Playlist| 49.89272406354187|
|    0|          Downgrade| 9.738670096972095|
|    1|          Downgrade|16.817754262198708|
|    0|              Error| 2.030910055717697|
|    1|              Error|2.8522088353413655|
|    0|               Help| 7.507071922961179|
|    1|               Help| 11.17043618739903|
|    0|               Home| 55.55833824395993|
|    1|               Home| 78.43954536547129|
|    0|              Login|          296350.0|
|    0|             Logout| 10.61476735506345|
|    1|             Logout|24.658143194335167|
|    0|           NextSong| 695.2713602984031|
|    1|      

##### Selected per count features:

 - About
 - Add Friend
 - Add to Playlist
 - Logout
 - Roll Advert
 - Save Settings
 - Settings

In [48]:
show_delta('AboutCount', 4.676691729323308, 7.1716640190627485)
show_delta('AddFriendCount', 32.199802761341225, 14.336133902198885)
show_delta('AddToPlaylistCount', 49.89272406354187, 21.25598663449044)
show_delta('LogoutCount', 24.658143194335167, 10.61476735506345)
show_delta('RollAdvertCount', 32.30518840007891, 14.764784318954597)
show_delta('SaveSettingsCount', 3.1942740286298568, 2.0444444444444443)
show_delta('SettingsCount', 12.566177947378991, 6.0561129425254405)
show_delta('ThumbsDownCount', 20.531373322809788, 9.032881106729933)
show_delta('ThumbsUpCount', 92.73235294117647, 40.796657046657046)
show_delta('ErrorCount', 2.8522088353413655, 2.030910055717697)
show_delta('HelpCount', 11.17043618739903, 7.507071922961179)
show_delta('HomeCount', 78.43954536547129, 55.55833824395993)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Δ for AboutCount feature: -0.2106 (SELECTED)
Δ for AddFriendCount feature: 0.3839 (SELECTED)
Δ for AddToPlaylistCount feature: 0.4025 (SELECTED)
Δ for LogoutCount feature: 0.3981 (SELECTED)
Δ for RollAdvertCount feature: 0.3726 (SELECTED)
Δ for SaveSettingsCount feature: 0.2195 (SELECTED)
Δ for SettingsCount feature: 0.3496 (SELECTED)
Δ for ThumbsDownCount feature: 0.3889 (SELECTED)
Δ for ThumbsUpCount feature: 0.3889 (SELECTED)
Δ for ErrorCount feature: 0.1682 (SELECTED)
Δ for HelpCount feature: 0.1961 (SELECTED)
Δ for HomeCount feature: 0.1708 (SELECTED)

## List of selected features (28 features)

In [60]:
spark.sparkContext.parallelize(selected_features).toDF(['delta', 'feature']) \
    .select('feature', 'delta', Fabs(col('delta'))) \
    .sort(desc('abs(delta)')) \
    .show(40, False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------------+--------------------+-------------------+
|feature                   |delta               |abs(delta)         |
+--------------------------+--------------------+-------------------+
|LogoutPerHour             |-0.7456203412927254 |0.7456203412927254 |
|SongCount                 |0.43130639639912827 |0.43130639639912827|
|TotalSongLength           |0.43128280036240824 |0.43128280036240824|
|ErrorPerSessionHour       |-0.41005111724302395|0.41005111724302395|
|UniqueSongCount           |0.40992471481174186 |0.40992471481174186|
|LogCount                  |0.4091798464900924  |0.4091798464900924 |
|AddToPlaylistCount        |0.40249130515647424 |0.40249130515647424|
|LogoutCount               |0.3981348751928029  |0.3981348751928029 |
|ThumbsUpCount             |0.3889469104822356  |0.3889469104822356 |
|ThumbsDownCount           |0.3889322574829049  |0.3889322574829049 |
|AddFriendCount            |0.3838682562317078  |0.3838682562317078 |
|RollAdvertCount    