In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import os

import pandas as pd
import featuretools as ft
import featuretools.variable_types as vtypes

# Which data partition to process and which label file supplies the cutoff times
partition = 20
cutoff_times_file = 'monthly_labels_30.csv'
# Each partition is stored under its own p<N> prefix
directory = f's3://customer-churn-spark/partitions/p{partition}'
# Load the raw tables for this partition, parsing the date columns on read
members = pd.read_csv(
    f'{directory}/members.csv',
    dtype={'gender': 'category'},
    parse_dates=['registration_init_time'],
    infer_datetime_format=True,
)

trans = pd.read_csv(
    f'{directory}/transactions.csv',
    parse_dates=['transaction_date', 'membership_expire_date'],
    infer_datetime_format=True,
)

logs = pd.read_csv(f'{directory}/logs.csv', parse_dates=['date'])

# Cutoff-time table for this partition; exact duplicate rows are dropped
cutoff_times = pd.read_csv(
    f'{directory}/{cutoff_times_file}',
    parse_dates=['cutoff_time'],
).drop_duplicates()

# Start from an empty entityset
es = ft.EntitySet(id='customers')

# Columns of members that are declared as categorical
members_variable_types = {
    'city': vtypes.Categorical,
    'bd': vtypes.Categorical,
    'registered_via': vtypes.Categorical,
}

# Register members as the parent table: indexed by msno, with the
# registration timestamp as its time index
es.entity_from_dataframe(
    entity_id='members',
    dataframe=members,
    index='msno',
    time_index='registration_init_time',
    variable_types=members_variable_types,
)
# Create new features in transactions
trans['price_difference'] = trans['plan_list_price'] - trans['actual_amount_paid']

# Per-day prices. If payment_plan_days is 0 the division yields inf (or NaN
# for 0 / 0), and inf then contaminates every aggregate computed from these
# columns. Mask zero-length plans to NaN first so the ratio is simply missing.
plan_days = trans['payment_plan_days'].where(trans['payment_plan_days'] != 0)
trans['planned_daily_price'] = trans['plan_list_price'] / plan_days
trans['daily_price'] = trans['actual_amount_paid'] / plan_days

# Add the transactions child table. The frame has no index column of its own,
# so make_index=True asks featuretools to create 'transactions_index'.
es.entity_from_dataframe(entity_id='transactions', dataframe=trans,
                         index='transactions_index', make_index=True,
                         time_index='transaction_date',
                         variable_types={'payment_method_id': vtypes.Categorical,
                                         'is_auto_renew': vtypes.Boolean,
                                         'is_cancel': vtypes.Boolean})

# Add transactions interesting values for the two flag columns
es['transactions']['is_cancel'].interesting_values = [0, 1]
es['transactions']['is_auto_renew'].interesting_values = [0, 1]

# Derived columns on logs: the row-wise sum of the num_* columns listed below,
# plus two ratios against that sum
play_count_columns = ['num_25', 'num_50', 'num_75', 'num_985', 'num_100']
logs['total'] = logs[play_count_columns].sum(axis=1)
logs['percent_100'] = logs['num_100'].div(logs['total'])
logs['percent_unique'] = logs['num_unq'].div(logs['total'])

# Register logs as a child table; make_index=True has featuretools create the
# 'logs_index' column to serve as the index
es.entity_from_dataframe(
    entity_id='logs',
    dataframe=logs,
    index='logs_index',
    make_index=True,
    time_index='date',
)

# Link members to each child table through the msno column they share
r_member_transactions, r_member_logs = (
    ft.Relationship(es['members']['msno'], es[child_id]['msno'])
    for child_id in ('transactions', 'logs')
)
es.add_relationships([r_member_transactions, r_member_logs])

# Primitive names passed to ft.dfs through the keyword arguments of the same name
agg_primitives = [
    'sum', 'time_since_last', 'avg_time_between', 'all', 'mode', 'num_unique',
    'min', 'last', 'mean', 'percent_true', 'max', 'std', 'count',
]
trans_primitives = ['weekend', 'cum_sum', 'day', 'month', 'diff', 'time_since_previous']
where_primitives = ['sum', 'count', 'mean', 'percent_true', 'all', 'any']

# Settings for the feature-synthesis run, collected in one place
dfs_options = dict(
    cutoff_time=cutoff_times,
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    where_primitives=where_primitives,
    max_depth=2,
    features_only=False,
    chunk_size=100,
    n_jobs=1,
    verbose=1,
)

# Build features for the members entity; with features_only=False the call
# returns both the computed matrix and the feature definitions
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='members', **dfs_options)

Built 230 features


Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/344 chunks[A[A

Elapsed: 00:01 | Remaining: 08:42 | Progress:   0%|          | Calculated: 1/344 chunks[A[A

Elapsed: 00:02 | Remaining: 08:18 | Progress:   1%|          | Calculated: 2/344 chunks[A[A

Elapsed: 00:04 | Remaining: 08:17 | Progress:   1%|          | Calculated: 3/344 chunks[A[A

Elapsed: 00:05 | Remaining: 08:13 | Progress:   1%|          | Calculated: 4/344 chunks[A[A

Elapsed: 00:07 | Remaining: 08:09 | Progress:   1%|▏         | Calculated: 5/344 chunks[A[A

Elapsed: 00:08 | Remaining: 08:04 | Progress:   2%|▏         | Calculated: 6/344 chunks[A[A

Elapsed: 00:09 | Remaining: 08:00 | Progress:   2%|▏         | Calculated: 7/344 chunks[A[A

Elapsed: 00:11 | Remaining: 07:56 | Progress:   2%|▏         | Calculated: 8/344 chunks[A[A

Elapsed: 00:12 | Remaining: 07:54 | Progress:   3%|▎         | Calculated: 9/344 chunks[A[A

Elapsed: 00:14 | Remaining: 07:53

Elapsed: 01:58 | Remaining: 06:00 | Progress:  25%|██▍       | Calculated: 85/344 chunks[A[A

Elapsed: 01:59 | Remaining: 05:59 | Progress:  25%|██▌       | Calculated: 86/344 chunks[A[A

Elapsed: 02:01 | Remaining: 05:57 | Progress:  25%|██▌       | Calculated: 87/344 chunks[A[A

Elapsed: 02:02 | Remaining: 05:56 | Progress:  26%|██▌       | Calculated: 88/344 chunks[A[A

Elapsed: 02:03 | Remaining: 05:54 | Progress:  26%|██▌       | Calculated: 89/344 chunks[A[A

Elapsed: 02:05 | Remaining: 05:53 | Progress:  26%|██▌       | Calculated: 90/344 chunks[A[A

Elapsed: 02:06 | Remaining: 05:51 | Progress:  26%|██▋       | Calculated: 91/344 chunks[A[A

Elapsed: 02:07 | Remaining: 05:50 | Progress:  27%|██▋       | Calculated: 92/344 chunks[A[A

Elapsed: 02:09 | Remaining: 05:49 | Progress:  27%|██▋       | Calculated: 93/344 chunks[A[A

Elapsed: 02:10 | Remaining: 05:48 | Progress:  27%|██▋       | Calculated: 94/344 chunks[A[A

Elapsed: 02:12 | Remaining: 05:47 | Prog

Elapsed: 03:50 | Remaining: 03:59 | Progress:  49%|████▉     | Calculated: 169/344 chunks[A[A

Elapsed: 03:52 | Remaining: 03:57 | Progress:  49%|████▉     | Calculated: 170/344 chunks[A[A

Elapsed: 03:53 | Remaining: 03:56 | Progress:  50%|████▉     | Calculated: 171/344 chunks[A[A

Elapsed: 03:54 | Remaining: 03:54 | Progress:  50%|█████     | Calculated: 172/344 chunks[A[A

Elapsed: 03:56 | Remaining: 03:53 | Progress:  50%|█████     | Calculated: 173/344 chunks[A[A

Elapsed: 03:57 | Remaining: 03:51 | Progress:  51%|█████     | Calculated: 174/344 chunks[A[A

Elapsed: 03:58 | Remaining: 03:50 | Progress:  51%|█████     | Calculated: 175/344 chunks[A[A

Elapsed: 04:00 | Remaining: 03:49 | Progress:  51%|█████     | Calculated: 176/344 chunks[A[A

Elapsed: 04:01 | Remaining: 03:48 | Progress:  51%|█████▏    | Calculated: 177/344 chunks[A[A

Elapsed: 04:03 | Remaining: 03:46 | Progress:  52%|█████▏    | Calculated: 178/344 chunks[A[A

Elapsed: 04:04 | Remaining: 03

Elapsed: 05:42 | Remaining: 02:03 | Progress:  74%|███████▎  | Calculated: 253/344 chunks[A[A

Elapsed: 05:43 | Remaining: 02:01 | Progress:  74%|███████▍  | Calculated: 254/344 chunks[A[A

Elapsed: 05:45 | Remaining: 02:00 | Progress:  74%|███████▍  | Calculated: 255/344 chunks[A[A

Elapsed: 05:46 | Remaining: 01:59 | Progress:  74%|███████▍  | Calculated: 256/344 chunks[A[A

Elapsed: 05:47 | Remaining: 01:57 | Progress:  75%|███████▍  | Calculated: 257/344 chunks[A[A

Elapsed: 05:49 | Remaining: 01:56 | Progress:  75%|███████▌  | Calculated: 258/344 chunks[A[A

Elapsed: 05:50 | Remaining: 01:54 | Progress:  75%|███████▌  | Calculated: 259/344 chunks[A[A

Elapsed: 05:51 | Remaining: 01:53 | Progress:  76%|███████▌  | Calculated: 260/344 chunks[A[A

Elapsed: 05:52 | Remaining: 01:52 | Progress:  76%|███████▌  | Calculated: 261/344 chunks[A[A

Elapsed: 05:53 | Remaining: 01:50 | Progress:  76%|███████▌  | Calculated: 262/344 chunks[A[A

Elapsed: 05:55 | Remaining: 01

Elapsed: 07:29 | Remaining: 00:09 | Progress:  98%|█████████▊| Calculated: 337/344 chunks[A[A

Elapsed: 07:30 | Remaining: 00:07 | Progress:  98%|█████████▊| Calculated: 338/344 chunks[A[A

Elapsed: 07:31 | Remaining: 00:06 | Progress:  99%|█████████▊| Calculated: 339/344 chunks[A[A

Elapsed: 07:33 | Remaining: 00:05 | Progress:  99%|█████████▉| Calculated: 340/344 chunks[A[A

Elapsed: 07:34 | Remaining: 00:04 | Progress:  99%|█████████▉| Calculated: 341/344 chunks[A[A

Elapsed: 07:35 | Remaining: 00:02 | Progress:  99%|█████████▉| Calculated: 342/344 chunks[A[A

Elapsed: 07:37 | Remaining: 00:01 | Progress: 100%|█████████▉| Calculated: 343/344 chunks[A[A

Elapsed: 07:38 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 344/344 chunks[A[A

[A[A

In [None]:
# Preview a label file over HTTPS; the resulting frame is the cell's output.
# NOTE(review): this reads partition p0 / bimonthly_labels_14.csv, not the
# partition and label file configured earlier in the notebook. Confirm intended.
labels_url = 'https://s3.amazonaws.com/customer-churn-spark/partitions/p0/bimonthly_labels_14.csv'
pd.read_csv(labels_url)

In [None]:
# Save the entityset under the current user's Downloads folder. Expanding '~'
# avoids hard-coding one machine's home directory, so the cell also runs for
# other users.
es.to_pickle(os.path.expanduser('~/Downloads/es'))

In [None]:
# Write the feature definitions to a file, then read them back; the value
# returned by load_features is the cell's output
features_path = 'features.txt'
ft.save_features(feature_defs, features_path)
ft.load_features(features_path)

In [None]:
# Read a previously saved entityset from S3; the result is the cell's output
entityset_location = 's3://customer-churn-spark/es'
ft.read_entityset(entityset_location)