In [1]:
# Core data-science stack
import numpy as np
import pandas as pd

# Automated feature engineering, plus its variable-type definitions
import featuretools as ft
import featuretools.variable_types as vtypes

# Show the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# Root S3 location; partition folders are addressed as BASE_DIR + 'p' + <partition number>
BASE_DIR = 's3://customer-churn-spark/'

In [10]:

def partition_to_feature_matrix(partition, 
                                cutoff_time_name = 'MS-30_labels.csv', write = False):
    """Build the feature matrix for one partition and optionally save it to Amazon S3.

    Reads the members, transactions, logs and cutoff-time CSV files found under
    `BASE_DIR + 'p' + str(partition)`, assembles them into a featuretools
    EntitySet (members as parent of transactions and logs) and runs deep feature
    synthesis with the `time_since_previous` transform primitive and the
    `time_since_last` aggregation primitive.

    Params
    --------
        partition (int): number of partition
        cutoff_time_name (str): name of cutoff time file inside the partition directory
        write (boolean): whether to write the feature matrix to S3. Defaults to False

    Return
    --------
        feature_matrix (dataframe): features calculated for the `members` entity at
            each cutoff time (the cutoff time is kept in the index). When `write` is
            True the same data is also saved as
            `<partition_dir>/<cutoff_spec>_feature_matrix.csv`.

    Raises
    --------
        NameError: if `write` is True but no module-level `fs` object is defined.

    """

    # The save step relies on a module-level `fs` filesystem object that is not
    # created inside this function. Check for it before the expensive
    # calculation instead of failing after the work has been done.
    # NOTE(review): `fs` is presumably an s3fs filesystem - confirm where it is created.
    if write and 'fs' not in globals():
        raise NameError("write=True requires a module-level `fs` filesystem object "
                        "(used as `fs.open(path, 'wb')`) to be defined")

    partition_dir = BASE_DIR + 'p' + str(partition)

    # Read in the data files
    members = pd.read_csv(f'{partition_dir}/members.csv', 
                      parse_dates=['registration_init_time'], 
                      infer_datetime_format = True, 
                      dtype = {'gender': 'category'})

    trans = pd.read_csv(f'{partition_dir}/transactions.csv',
                       parse_dates=['transaction_date', 'membership_expire_date'], 
                        infer_datetime_format = True)
    logs = pd.read_csv(f'{partition_dir}/logs.csv', parse_dates = ['date'])

    # Make sure to drop duplicates: keep one row per (member, cutoff time) pair
    cutoff_times = pd.read_csv(f'{partition_dir}/{cutoff_time_name}', parse_dates = ['cutoff_time'])
    cutoff_times = cutoff_times.drop_duplicates(subset = ['msno', 'cutoff_time'])

    # Needed for saving: the part of the file name before the first underscore
    # (e.g. 'MS-30' for 'MS-30_labels.csv') prefixes the output file name
    cutoff_spec = cutoff_time_name.split('_')[0]

    # Create empty entityset
    es = ft.EntitySet(id = 'customers')

    # Add the members parent table
    es.entity_from_dataframe(entity_id='members', dataframe=members,
                             index = 'msno', time_index = 'registration_init_time', 
                             variable_types = {'city': vtypes.Categorical,
                                               'registered_via': vtypes.Categorical})

    # Create new features in transactions
    trans['price_difference'] = trans['plan_list_price'] - trans['actual_amount_paid']
    trans['planned_daily_price'] = trans['plan_list_price'] / trans['payment_plan_days']
    trans['daily_price'] = trans['actual_amount_paid'] / trans['payment_plan_days']

    # Add the transactions child table (it has no natural key, so an index is generated)
    es.entity_from_dataframe(entity_id='transactions', dataframe=trans,
                             index = 'transactions_index', make_index = True,
                             time_index = 'transaction_date', 
                             variable_types = {'payment_method_id': vtypes.Categorical, 
                                               'is_auto_renew': vtypes.Boolean, 'is_cancel': vtypes.Boolean})

    # Add transactions interesting values
    es['transactions']['is_cancel'].interesting_values = [0, 1]
    es['transactions']['is_auto_renew'].interesting_values = [0, 1]

    # Create new features in logs
    logs['total'] = logs[['num_25', 'num_50', 'num_75', 'num_985', 'num_100']].sum(axis = 1)
    logs['percent_100'] = logs['num_100'] / logs['total']
    logs['percent_unique'] = logs['num_unq'] / logs['total']
    logs['seconds_per_song'] = logs['total_secs'] / logs['total'] 

    # Add the logs child table (index generated, as for transactions)
    es.entity_from_dataframe(entity_id='logs', dataframe=logs,
                         index = 'logs_index', make_index = True,
                         time_index = 'date')

    # Add the relationships: members is the parent of both child tables via `msno`
    r_member_transactions = ft.Relationship(es['members']['msno'], es['transactions']['msno'])
    r_member_logs = ft.Relationship(es['members']['msno'], es['logs']['msno'])
    es.add_relationships([r_member_transactions, r_member_logs])

    # Calculate the feature matrix; the returned feature definitions are not used
    feature_matrix, _ = ft.dfs(entityset=es, target_entity='members', trans_primitives = ['time_since_previous'],
                               agg_primitives = ['time_since_last'], verbose = 1,
                               cutoff_time=cutoff_times, cutoff_time_in_index = True,
                               chunk_size = 1000, n_jobs = 8)

    # Save before returning: placed after the return, this branch could never run
    if write:
        # Save to Amazon S3
        bytes_to_write = feature_matrix.to_csv(None).encode()

        with fs.open(f'{partition_dir}/{cutoff_spec}_feature_matrix.csv', 'wb') as f:
            f.write(bytes_to_write)

    return feature_matrix

In [11]:
# Build the feature matrix for partition 372 (nothing is written to S3 by default)
feature_matrix = partition_to_feature_matrix(partition = 372)

Built 6 features
EntitySet scattered to workers in 1.850 seconds
Elapsed: 00:04 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 28/28 chunks


In [32]:
# Rows labelled as churn whose time since the last transaction is under ten days (in seconds)
ten_days_in_seconds = 10 * 24 * 3600
since_last_transaction = feature_matrix['TIME_SINCE_LAST(transactions.transaction_date)']
labelled_churn = feature_matrix['label'] == 1
feature_matrix.loc[labelled_churn & (since_last_transaction < ten_days_in_seconds)]

Unnamed: 0,msno,time,bd,city,registered_via,gender,TIME_SINCE_LAST(logs.date),TIME_SINCE_LAST(transactions.transaction_date),label,days_to_churn,churn_date,DAYS
914,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-02-01,0.0,1.0,7.0,,,518400.0,1.0,25.0,2015-02-26,6.0
16230,SuBvN2LjNoibIDAnZbqQMeZEMwVyCGNrtbXuYGRiRE4=,2016-07-01,0.0,1.0,4.0,,19958400.0,604800.0,1.0,29.0,2016-07-30,7.0


In [17]:
# Load the raw transactions of partition 372, then order them by member and date
trans = pd.read_csv(
    's3://customer-churn-spark/p372/transactions.csv',
    parse_dates = ['transaction_date', 'membership_expire_date'],
)
trans = trans.sort_values(['msno', 'transaction_date'])
trans.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
6385,+/Jks8tmNXiN/eq2VFfpD7FM7ZO79rK6Uw4w+m5KXSE=,37,30,149,149,1,2015-05-30,2015-06-30,0
8712,+/Jks8tmNXiN/eq2VFfpD7FM7ZO79rK6Uw4w+m5KXSE=,37,30,149,149,1,2015-07-01,2015-07-31,0
5584,+/Jks8tmNXiN/eq2VFfpD7FM7ZO79rK6Uw4w+m5KXSE=,37,30,149,149,1,2015-08-01,2015-08-31,0
9647,+/Jks8tmNXiN/eq2VFfpD7FM7ZO79rK6Uw4w+m5KXSE=,37,30,149,149,1,2015-09-01,2015-09-30,0
1006,+/Jks8tmNXiN/eq2VFfpD7FM7ZO79rK6Uw4w+m5KXSE=,37,30,149,149,1,2015-10-01,2015-10-31,0


In [33]:
# Every transaction recorded for a single member
member_id = 'fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs='
trans.loc[trans['msno'] == member_id]

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
2141,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2015-01-26,2015-01-26,1
17063,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2015-11-02,2015-12-01,0
4010,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2015-12-01,2016-01-01,0
7678,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-01-01,2016-02-01,0
15944,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-02-01,2016-03-01,0
15248,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-03-01,2016-04-01,0
685,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-04-01,2016-05-01,0
7139,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-05-01,2016-06-01,0
3201,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-06-01,2016-07-01,0
2254,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,41,30,149,149,1,2016-06-29,2016-06-29,1


In [34]:
# Load the cutoff-time labels of partition 372 and show the rows of a single member
labels = pd.read_csv(
    's3://customer-churn-spark/p372/MS-30_labels.csv',
    parse_dates = ['cutoff_time'],
)
is_member = labels['msno'] == 'fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs='
labels.loc[is_member]

Unnamed: 0,msno,cutoff_time,label,days_to_churn,churn_date
17035,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-01-01,,56.0,
17036,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-02-01,1.0,25.0,2015-02-26
17037,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-03-01,,,
17038,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-04-01,,,
17039,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-05-01,,,
17040,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-06-01,,,
17041,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-07-01,,,
17042,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-08-01,,,
17043,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-09-01,,,
17044,fJfnR3hB6Y6I66UeeN3gN03/fuPg5Il3T/vt0w7uEqs=,2015-10-01,,,


In [36]:
# Number of days between two dates one calendar month apart (26 Jan -> 26 Feb).
# Uses pd.Timestamp because the `pd.datetime` alias is no longer provided by newer pandas.
(pd.Timestamp(2018, 2, 26) - pd.Timestamp(2018, 1, 26)).days

31

In [22]:
# Turn the index levels into ordinary columns so rows can be selected by `msno`.
# Guarded and non-mutating: re-running the cell must not reset the index a second
# time, which would insert a spurious `index` column.
if 'msno' not in feature_matrix.columns:
    feature_matrix = feature_matrix.reset_index()

In [30]:
# Express the seconds-valued time-since-last-transaction feature in days
seconds_per_day = 24 * 3600
feature_matrix['DAYS'] = feature_matrix['TIME_SINCE_LAST(transactions.transaction_date)'] / seconds_per_day

In [31]:
# Time since last transaction, label and day count for one member across cutoff times
columns_of_interest = ['TIME_SINCE_LAST(transactions.transaction_date)', 'label', 'DAYS']
for_member = feature_matrix['msno'] == 'g/1LRF0Gr/qtPKDZfUOiw3XpSOT8vkGJL9WIQMjzkws='
feature_matrix.loc[for_member, columns_of_interest]

Unnamed: 0,TIME_SINCE_LAST(transactions.transaction_date),label,DAYS
21014,,,
22112,2505600.0,1.0,29.0
23182,5184000.0,,60.0
24220,7862400.0,,91.0
25209,10281600.0,,119.0
