# Customer Churn Prediction with PySpark on AWS EMR

In [None]:
!pip install pyspark

In [None]:
# import libraries
import time
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, LongType, TimestampType
from pyspark.ml.feature import StandardScaler, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# create the Spark session used by every cell below (an existing one is reused)
spark = (
    SparkSession.builder
    .appName('Sparkify Churn Prediction')
    .getOrCreate()
)

In [None]:
def load_data(path, sample_fraction=0.1, seed=42):
    '''
    Load the event log stored under a given path and reduce it to a working sample.

    Only the columns userId, registration, ts and page are kept. Rows whose
    userId is an empty string and exact duplicate rows are removed, and the
    number of remaining rows is printed.

    :param path: location of the JSON event data, passed to spark.read.json
    :param sample_fraction: share of the rows to keep, in (0, 1]; 1 keeps all rows
    :param seed: seed of the random sample
    :return: the cleaned DataFrame
    :raises ValueError: if sample_fraction is not in (0, 1]
    '''

    # validate before touching Spark so that a bad argument fails immediately
    if not 0 < sample_fraction <= 1:
        raise ValueError(f'sample_fraction must be in (0, 1], got {sample_fraction!r}')

    # read the complete event log
    df = spark.read.json(path)

    # keep only a random sample to work with; the remainder is discarded
    if sample_fraction < 1:
        df, _ = df.randomSplit([sample_fraction, 1 - sample_fraction], seed)

    # select features we are going to deal with
    df = df.select(['userId', 'registration', 'ts', 'page'])

    # drop rows with missing user id
    df = df.where(df.userId != '')

    # drop duplicate rows if any exist
    df = df.dropDuplicates()

    # count number of rows (this triggers a Spark job)
    print(f'Dataset have {df.count()} rows')

    return df

In [None]:
def feature_creation(df):
    '''
    Build one row of label and features per user from the event log.

    Label:
        churn          1 if the user has a 'Cancellation Confirmation' event, else 0
    Features:
        user_lifetime  days between registration and the end of the user's
                       observation window (the last event for churned users,
                       otherwise the latest event in the whole data set)
        page_*         number of visits per user for each page in a fixed list

    :param df: event-level DataFrame with the columns userId, registration, ts, page
    :return: DataFrame with userId, user_lifetime, churn and the page_* counts;
             users without any of the counted pages get 0 in those columns
    '''

    churn_page = 'Cancellation Confirmation'

    # use the Cancellation Confirmation event to define churn
    churned_users = df.filter(F.col('page') == churn_page)
    print(f'Number of churned users in dataset {churned_users.count()}')
    # built-in column expression instead of a Python UDF: same 0/1 integer flag,
    # but rows do not have to be shipped to Python workers
    df = df.withColumn('churn', F.when(F.col('page') == churn_page, 1).otherwise(0))

    # page-level features
    include_page = ['Roll Advert', 'Settings', 'Thumbs Down', 'Error', 'Upgrade', 'About']
    # 'Roll Advert' -> 'page_roll_advert'
    page_name = F.concat(F.lit('page_'), F.lower(F.regexp_replace(F.col('page'), ' ', '_')))
    # passing the pivot values explicitly saves the job that would otherwise collect
    # them and keeps the set of feature columns stable even when a page never occurs
    # in the sample; sorted to match the order an implicit pivot produces
    pivot_values = sorted('page_' + name.replace(' ', '_').lower() for name in include_page)
    page = (df.filter(F.col('page').isin(include_page))
              .withColumn('page', page_name)
              .groupBy('userId')
              .pivot('page', pivot_values)
              .agg(F.count('page'))
              .fillna(0))

    # convert Timestamps (ts, in milliseconds) to Datetime
    df = df.withColumn('reg_date', (F.col('registration') / 1000).cast(TimestampType()))
    df = df.withColumn('date', (F.col('ts') / 1000).cast(TimestampType()))

    # first and last event date of the whole data set, fetched in a single job
    bounds = df.agg(F.min('date').alias('min_date'), F.max('date').alias('max_date')).first()
    min_date, max_date = bounds['min_date'], bounds['max_date']

    # window over all events of a user, ordered by time; the unbounded frame makes
    # first()/last() look at the whole partition instead of the rows seen so far
    w = (Window.partitionBy('userId').orderBy('ts')
               .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))

    # get first log date
    df = df.withColumn('first_date', F.first('date').over(w))

    # infer observation start date
    df = df.withColumn('obs_start',
                       F.when(F.col('reg_date') < min_date, min_date)
                        .when(F.col('reg_date') < F.col('first_date'), F.col('reg_date'))
                        .otherwise(F.col('first_date')))

    # infer observation end date: the last event if it is the churn event,
    # otherwise the end of the data set
    df = df.withColumn('obs_end',
                       F.when(F.last('churn').over(w) == 1, F.last('date').over(w))
                        .otherwise(max_date))

    # aggregation by user
    df = df.groupby('userId').agg(
        F.max('churn').alias('churn'),
        F.first('reg_date').alias('reg_date'),
        F.first('obs_start').alias('obs_start'),
        F.first('obs_end').alias('obs_end'),
    )

    # user_lifetime in days; obs_start is not part of the returned columns
    df = (df.withColumn('user_lifetime', F.datediff('obs_end', 'reg_date'))
            .select(['userId', 'user_lifetime', 'churn']))

    # bringing all together
    df = df.join(page, ['userId'], how='outer').fillna(0)

    return df

In [None]:
def feature_scaling(df):
    '''
    Assemble all feature columns into one vector and rescale it with a MinMaxScaler.

    Every column except userId and churn is treated as a feature. The churn
    column is cast to float and renamed to label.

    :param df: user-level DataFrame containing userId, churn and the feature columns
    :return: DataFrame with the two columns ScaledFeatures and label
    '''

    # everything that is neither the id nor the target is a feature
    feature_cols = [name for name in df.columns if name not in ('userId', 'churn')]
    print(f'{len(feature_cols)} Features for Scaling:\n')
    print(feature_cols)

    # the target is handed to PySpark.ML as a numeric column called 'label'
    labelled = df.withColumn('label', F.col('churn').cast('float')).drop('churn')

    # assemble the features into one vector, then min-max scale that vector
    stages = [
        VectorAssembler(inputCols=feature_cols, outputCol='FeatureVector'),
        MinMaxScaler(inputCol='FeatureVector', outputCol='ScaledFeatures'),
    ]
    fitted_stages = Pipeline(stages=stages).fit(labelled)

    # just take scaled feature vector and labels
    return fitted_stages.transform(labelled).select('ScaledFeatures', 'label')

In [None]:
def train_test_split(scaled_df, ratio=0.9, seed=42):
    '''
    Split the data into training set and test set.

    The training set is sampled per label value with the same fraction for
    both classes; the test set is everything that was not sampled. The sizes
    of both sets are printed.

    :param scaled_df: DataFrame with a 'label' column holding 0 / 1
    :param ratio: fraction of each class that goes into the training set
    :param seed: seed of the random sample
    :return: tuple (train_set, test_set)
    '''

    # a userId column, if present, must not take part in the row comparison below
    data = scaled_df.drop('userId')

    # Train-test split
    train_set = data.sampleBy('label', fractions={0: ratio, 1: ratio}, seed=seed)

    # exceptAll keeps duplicates (EXCEPT ALL). subtract() is EXCEPT DISTINCT and
    # would de-duplicate the test set and drop every test row that happens to be
    # identical to some training row - likely here, since users are only
    # represented by their feature vector and label.
    test_set = data.exceptAll(train_set)

    # show number of rows in train and test sets
    print(train_set.count(), test_set.count())
    return train_set, test_set

In [None]:
def cross_validation(classifier, parameter_grid):
    '''
    Build a 3-fold CrossValidator that selects parameters by F1 score.

    :param classifier: estimator to tune
    :param parameter_grid: parameter maps to try
    :return: the configured, not yet fitted, CrossValidator
    '''

    # candidates are compared by their F1 score
    f1_evaluator = MulticlassClassificationEvaluator(metricName='f1')

    return CrossValidator(
        estimator=classifier,
        estimatorParamMaps=parameter_grid,
        evaluator=f1_evaluator,
        numFolds=3,
    )

In [None]:
def train_model(classifier, train_set, parameter_grid):
    '''
    Fit a classifier on the training set using cross validation.

    :param classifier: estimator to train
    :param train_set: training data
    :param parameter_grid: parameter maps searched during cross validation
    :return: the fitted cross-validation model
    '''

    # wrap the classifier in a cross validator and fit it in one go
    return cross_validation(classifier, parameter_grid).fit(train_set)

In [None]:
def evaluate_model(classifier, data):
    '''
    Score a trained classification model on a data set.

    :param classifier: fitted model whose transform() adds a 'prediction' column
    :param data: data with a 'label' column to predict and score
    :return: dict with the keys precision, recall, f1 and accuracy
    '''

    # predict
    scored = classifier.transform(data)

    # one evaluator is reused; only its metric name changes per call
    scorer = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label')

    # reported name -> metric name understood by the evaluator
    metric_names = {
        'precision': 'weightedPrecision',
        'recall': 'weightedRecall',
        'f1': 'f1',
        'accuracy': 'accuracy',
    }

    return {
        reported: scorer.evaluate(scored, {scorer.metricName: spark_metric})
        for reported, spark_metric in metric_names.items()
    }

In [None]:
def ml_pipeline(model, path, save_model=False):
    '''
    Machine Learning pipeline that brings all steps together:
    load data, create features, scale features, split into train and test set,
    build the classifier, tune and train it with cross validation, and
    evaluate it on the test set. Progress and results are printed.

    :param model: name of the classifier to train; only 'Random Forest' is supported
    :param path: location of the JSON event data
    :param save_model: if True, write the best model to disk, overwriting an
                       existing model with the same file name
    :return: the fitted cross-validation model
    :raises ValueError: if model is not a supported classifier name
    '''

    # fail before any Spark work is done instead of hitting an undefined
    # classifier after loading, feature creation and scaling have already run
    supported_models = ('Random Forest',)
    if model not in supported_models:
        raise ValueError(f'Unsupported model {model!r}; expected one of {supported_models}')

    print('\nModel: ' + model + '\n')

    print('----------------------------')
    print('1. Load Data')
    print('----------------------------')

    df = load_data(path)

    print('----------------------------')
    print('2. Feature Creation')
    print('----------------------------')

    df = feature_creation(df)

    print('----------------------------')
    print('3. Feature Scaling')
    print('----------------------------')

    scaled_df = feature_scaling(df)

    print('----------------------------')
    print('4. Train-Test Split')
    print('----------------------------')

    train_set, test_set = train_test_split(scaled_df)

    print('----------------------------')
    print('5. Build model')
    print('----------------------------')

    if model == 'Random Forest':
        classifier = RandomForestClassifier(labelCol='label', featuresCol='ScaledFeatures')
        file_name = 'cv_randomforest_cf.model'

    print('----------------------------')
    print('6. & 7. Cross Val. & Training')
    print('----------------------------')

    parameter_grid = ParamGridBuilder() \
                    .addGrid(classifier.numTrees, [5, 7, 10]).build()

    trained_classifier = train_model(classifier, train_set, parameter_grid)

    # Get best model
    bestModel = trained_classifier.bestModel
    print(bestModel)

    # persist the best model only on request
    if save_model:
        bestModel.write().overwrite().save(file_name)

    print('----------------------------')
    print('8. Model Evaluation')
    print('----------------------------')

    # transform() of the cross-validation model predicts with its best model
    metrics = evaluate_model(trained_classifier, test_set)
    print(metrics)

    return trained_classifier

In [None]:
# Run the complete pipeline with a Random Forest classifier on the Sparkify event log

cv_randomforest_cf = ml_pipeline(
    model='Random Forest',
    path='s3n://udacity-dsnd/sparkify/sparkify_event_data.json',
)