# Introduction: Improved Labeling

The purpose of this notebook is to demonstrate an improved labeling formulation that can be extended to many problems of a similar type. The objective is a general purpose framework for creating labels.

The general framework is based on two functions:

1. `customer_to_label_times(customer_id, transactions, **params)`
2. `make_label_times(transactions, **params)` 

As of now, this notebook is not general purpose due to the nature of the problem. My hope is to make it more extensible by removing the specifics to the current problem. 

In [1]:
import numpy as np
import pandas as pd

In [2]:
PARTITION = '100'
BASE_DIR = 's3://customer-churn-spark/'
PARTITION_DIR = BASE_DIR + 'p' + PARTITION

members = pd.read_csv(f'{PARTITION_DIR}/members.csv', 
                      parse_dates=['registration_init_time'], infer_datetime_format = True)
trans = pd.read_csv(f'{PARTITION_DIR}/transactions.csv',
                   parse_dates=['transaction_date', 'membership_expire_date'], infer_datetime_format = True)
logs = pd.read_csv(f'{PARTITION_DIR}/logs.csv', parse_dates = ['date'])

trans.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,G7TmHc9Gg2t8ovG/KFaB53We/0CQPELhZ5UUN2Ol3AQ=,39,30,149,149,1,2015-09-30,2015-11-13,0
1,LPbp8N7VRuqEISEVim8ppTaeYJG/rWS/t4g/dEFuWjw=,34,30,149,149,1,2016-02-29,2016-03-31,0
2,xvYqULBWzJvN8heyFtY3hbY3egyQNbXuDx0igtsoi00=,29,30,180,180,1,2017-01-31,2017-03-01,0
3,UR4iin4mAkajoa7o+AyTTmz5k3N2GR3/rZY8a4KwADI=,41,30,99,99,1,2017-01-31,2017-02-28,0
4,ax8CRhY8BMRA/ZvT1wI+2N/EdPXiSPGxa9y7bntA1Uc=,40,30,149,149,1,2016-05-04,2016-06-08,0


In [3]:
import s3fs

# Credentials
with open('/data/credentials.txt', 'r') as f:
    info = f.read().strip().split(',')
    key = info[0]
    secret = info[1]

fs = s3fs.S3FileSystem(key=key, secret=secret)

In [4]:
def customer_to_label_times(customer_id, transactions, prediction_freq, churn_days, return_trans = False):
    """Make label times for a single customer. Returns a dataframe of labels with times, the binary label, 
       and the number of days until the next churn."""
    
    assert(prediction_freq in ['MS', 'SMS']), "Prediction day must be either 'MS' or 'SMS'"
    assert(transactions['msno'].unique() == [customer_id]), "Transactions must be for only customer"
    
    transactions = transactions.copy()
    
    # Make sure to sort chronalogically
    transactions.sort_values(['transaction_date', 'membership_expire_date'], inplace = True)
    
    # Create next transaction day by shifting back one transaction
    transactions['next_transaction_date'] = transactions['transaction_date'].shift(-1)
    
    # Find number of days between transaction and next
    transactions['difference_days'] = (transactions['next_transaction_date'] - 
                                       transactions['membership_expire_date']).\
                                       dt.total_seconds() / (3600 * 24)
    
    # Determine which transactions are associated with a churn
    transactions['churn'] = transactions['difference_days'] > churn_days
    
    # Find date of churn
    transactions.loc[transactions['churn'] == True, 
                     'churn_date'] = transactions.loc[transactions['churn'] == True, 
                                                      'membership_expire_date'] + pd.Timedelta(value = churn_days,
                                                                                               units = 'days')
    
    # Range for label times is from first to last transaction
    first_transaction = transactions['transaction_date'].min()
    last_transaction = transactions['transaction_date'].max()
    start_date = pd.datetime(first_transaction.year, first_transaction.month, 1)
    end_date = pd.datetime(last_transaction.year, last_transaction.month, 1)
    
    # Make label times dataframe
    label_times = pd.DataFrame({'cutoff_time': pd.date_range(start_date, end_date, freq = prediction_freq),
                                'msno': customer_id
                               })
    # Needed for subsetting to create label time
    label_times['next_cutoff_time'] = label_times['cutoff_time'].shift(-1)
    
    # If no churns
    if (transactions['churn'] == False).all():
        label_times['label'] = 0
        label_times['days_to_churn'] = np.nan
        label_times['churn_date'] = np.nan
        if return_trans: 
            return label_times[['msno', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']], transactions
        
        return label_times[['msno', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']]
    
    # Keep track of last churn
    previous_churn_date = None
    
    # Iterate through the positive churns
    for i, row in transactions.loc[transactions['churn'] == True].iterrows():
        churn_date = row['churn_date']
        
        # Find label time associated with churn and assign label 1 and churn date
        label_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date) & 
                                    (label_times['next_cutoff_time'] > churn_date)].index
        label_times.loc[label_idx, 'label'] = 1
        label_times.loc[label_idx, 'churn_date'] = churn_date
        
        
        if not previous_churn_date:
            before_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date)].index
        else:
            before_idx = label_times.loc[(label_times['cutoff_time'] <= churn_date) & 
                                         (label_times['cutoff_time'] > previous_churn_date)].index
        # Calculate days to next churn for all label times
        label_times.loc[before_idx, 'days_to_churn'] = (churn_date - label_times.loc[before_idx, 
                                                                                     'cutoff_time']).\
                                                        dt.total_seconds() / (3600 * 24)
        previous_churn_date = churn_date
        
    if return_trans:
        return label_times[['msno', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']], transactions
    
    return label_times.fillna(0)[['msno', 'cutoff_time', 'label', 'days_to_churn', 'churn_date']]

In [5]:
CUSTOMER_ID = trans.iloc[8, 0]
customer_transactions = trans.loc[trans['msno'] == CUSTOMER_ID].copy()

label_times, altered_transactions = customer_to_label_times(CUSTOMER_ID, customer_transactions, 'MS', 1, True)
label_times.head(10)

Unnamed: 0,msno,cutoff_time,label,days_to_churn,churn_date
0,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2015-09-01,,166.0,NaT
1,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2015-10-01,,136.0,NaT
2,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2015-11-01,,105.0,NaT
3,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2015-12-01,,75.0,NaT
4,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2016-01-01,,44.0,NaT
5,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2016-02-01,1.0,13.0,2016-02-14 00:00:00.000000001
6,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2016-03-01,,,NaT
7,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2016-04-01,,,NaT
8,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2016-05-01,,,NaT
9,xDdNp65JYgYhw0J3MWVmc3o+WXBJrUUZTptR/O4Sa3Q=,2016-06-01,,,NaT


In [6]:
def make_label_times(transactions, prediction_freq, churn_days):
    """Make labels for an entire series of transactions. 
       Return is a dataframe with customer id, label time, label value, and days
       to next churn (for this problem)."""
    label_times = []
    
    # Iterate through each customer and find labels
    for customer_id, customer_transactions in transactions.groupby('msno'):
        label_times.append(customer_to_label_times(customer_id, customer_transactions,
                                                   prediction_freq, churn_days))
        
    # Concatenate into a single dataframe
    return pd.concat(label_times)

In [7]:
label_times = make_label_times(trans, 'MS', 30)
label_times.tail(10)

Unnamed: 0,msno,cutoff_time,label,days_to_churn,churn_date
17,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-06-01,0.0,,
18,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-07-01,0.0,,
19,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-08-01,0.0,,
20,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-09-01,0.0,,
21,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-10-01,0.0,,
22,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-11-01,0.0,,
23,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2016-12-01,0.0,,
24,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2017-01-01,0.0,,
25,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2017-02-01,0.0,,
26,zzm2UvJnzuTRkXaiaZHtbJwPG9jZQZkZxG0n4PYDTvw=,2017-03-01,0.0,,


In [8]:
def partition_to_labels(partition_number, label_types = ['MS', 'SMS'], churn_periods = [30, 14]):
    """Make labels for all customers in one partition
    Either for one month or twice a month
    
    Params
    --------
        partition (int): number of partition
        label_type (str): either 'monthly' for monthly labels or
                          'bimonthly' for twice a month labels
        churn_period (int): number of days required without a membership for a churn
    
    Returns
    --------
        None: saves the label dataframes with the appropriate name to the partition directory
    """
    partition_dir = BASE_DIR + 'p' + str(partition_number)
    
    # Read in data and filter anomalies
    trans = pd.read_csv(f'{partition_dir}/transactions.csv',
                        parse_dates=['transaction_date', 'membership_expire_date'], 
                        infer_datetime_format = True)
    trans = trans.loc[trans['membership_expire_date'] >= trans['transaction_date']]
    
    

    # Create both sets of lables
    for prediction_freq, churn_days in zip(label_types, churn_periods):
        
        cutoff_list = []
        
        cutoff_list.append(make_label_times(trans, prediction_freq = prediction_freq, 
                                            churn_days = churn_days))
        cutoff_times = pd.concat(cutoff_list)
        cutoff_times = cutoff_times.drop_duplicates()
        
        # Encode in order to write to s3
        bytes_to_write = cutoff_times.to_csv(None, index = False).encode()

        # Write cutoff times to S3
        with fs.open(f'{partition_dir}/{prediction_freq}-{churn_days}_labels.csv', 'wb') as f:
            f.write(bytes_to_write)

In [9]:
partition_to_labels(1)

In [10]:
import findspark
findspark.init('/usr/local/spark/')

import pyspark

In [11]:
conf = pyspark.SparkConf()

# Enable logging
conf.set('spark.eventLog.enabled', True);
conf.set('spark.eventLog.dir', '/data/churn/tmp/');

# Use all cores on all machines
conf.set('spark.num.executors', 1)
conf.set('spark.executor.memory', '24g')
conf.set('spark.executor.cores', 8)

sc = pyspark.SparkContext(master = 'spark://ip-172-31-23-133.ec2.internal:7077',
                          appName = 'labeling', conf = conf)

In [12]:
from timeit import default_timer as timer

start = timer()
sc.parallelize(list(range(1000)), numSlices=1000).\
   map(partition_to_labels).collect()
sc.stop()
end = timer()

In [13]:
print(f'{round(end - start)} seconds elapsed.')

12599 seconds elapsed.


In [15]:
labels = pd.read_csv(f's3://customer-churn-spark/p980/MS-30_labels.csv')
labels.head()

Unnamed: 0,msno,cutoff_time,label,days_to_churn,churn_date
0,+/ArcmV8FBEOABJP5zn7RH2S5lU1EdPN3ucEJjYRXVY=,2016-11-01,0.0,,
1,+/trjV54MKwcuhk/C6P0OSeEX7hDfLpXme/6EDOny6A=,2015-06-01,0.0,144.0,0.0
2,+/trjV54MKwcuhk/C6P0OSeEX7hDfLpXme/6EDOny6A=,2015-07-01,0.0,114.0,0.0
3,+/trjV54MKwcuhk/C6P0OSeEX7hDfLpXme/6EDOny6A=,2015-08-01,0.0,83.0,0.0
4,+/trjV54MKwcuhk/C6P0OSeEX7hDfLpXme/6EDOny6A=,2015-09-01,0.0,52.0,0.0
