# Introduction: Feature Engineering with Spark

In this notebook, we will run the feature engineering using Spark. We'll start with a single machine.

In [1]:
import findspark
# Point findspark at the local Spark installation. This call is made before
# `import pyspark` below; presumably that is what makes pyspark importable in
# this kernel -- confirm against the findspark documentation.
findspark.init('/usr/local/spark-2.3.1-bin-hadoop2.7/')

import pyspark
import random

## Test Spark 

In [2]:
# Build the Spark configuration in a single pass. The pairs are applied in the
# order listed, so the resulting settings match setting each key one by one.
conf = pyspark.SparkConf()
conf.setAll([
    # Enable event logging and choose where the event logs are written
    ('spark.eventLog.enabled', True),
    ('spark.eventLog.dir', 'tmp/'),

    # Use all cores on all machines
    # NOTE(review): 'spark.num.executors' does not appear in the Spark
    # configuration reference (the documented key is
    # 'spark.executor.instances') -- confirm this setting has any effect.
    ('spark.num.executors', 3),
    ('spark.executor.memory', '12g'),
    ('spark.executor.cores', 4),

    # Set the master
    ('spark.master', 'spark://ip-172-31-8-174.ec2.internal:7077'),
])

# Display every configured (key, value) pair
conf.getAll()

dict_items([('spark.eventLog.enabled', 'True'), ('spark.eventLog.dir', 'tmp/'), ('spark.num.executors', '3'), ('spark.executor.memory', '12g'), ('spark.executor.cores', '4'), ('spark.master', 'spark://ip-172-31-8-174.ec2.internal:7077')])

In [None]:
# Start a SparkContext named "Pi" from the `conf` object, then display it.
sc = pyspark.SparkContext(conf=conf, appName="Pi")
sc

In [None]:
# Number of random points drawn for the Monte Carlo estimate of pi
num_samples = 100000000


def inside(p):
    """Report whether a fresh random point falls inside the unit quarter-circle.

    Params
    --------
        p: the RDD element being filtered. It is ignored; it only exists so
           the function can be passed to `RDD.filter`, which calls it once
           per element.

    Return
    --------
        bool: True when x*x + y*y < 1 for x, y drawn from `random.random()`.

    """
    x, y = random.random(), random.random()
    return x*x + y*y < 1


try:
    # The fraction of points inside the quarter-circle approximates pi / 4
    count = sc.parallelize(range(0, num_samples)).filter(inside).count()
    pi = 4 * count / num_samples
    print(pi)
finally:
    # Always stop the context, even if the job fails: a later cell creates a
    # new SparkContext, and PySpark refuses to start one while another is
    # still active in the same process.
    sc.stop()

## Make Features Once

We'll run deep feature synthesis once to produce the feature definitions so we don't have to recalculate them every time. Once they are saved, it is also possible to reload the feature definitions from disk.

Before running this code, make sure to authenticate with Amazon Web Services from the command line to access your files in S3. Run `aws configure` and then input the appropriate information. 

In [None]:
import pandas as pd
import featuretools as ft
import featuretools.variable_types as vtypes

# Partition used for the one-off feature-definition run
partition = 20
directory = 's3://customer-churn-spark/partitions/p' + str(partition)
cutoff_times_file = 'monthly_labels_30.csv'

# ---------------------------------------------------------------------------
# 1. Load the raw tables for this partition
# ---------------------------------------------------------------------------
members = pd.read_csv(
    f'{directory}/members.csv',
    parse_dates=['registration_init_time'],
    infer_datetime_format=True,
    dtype={'gender': 'category'},
)

trans = pd.read_csv(
    f'{directory}/transactions.csv',
    parse_dates=['transaction_date', 'membership_expire_date'],
    infer_datetime_format=True,
)

logs = pd.read_csv(f'{directory}/logs.csv', parse_dates=['date'])

# Cutoff times with exact duplicate rows removed
cutoff_times = pd.read_csv(
    f'{directory}/{cutoff_times_file}',
    parse_dates=['cutoff_time'],
).drop_duplicates()

# ---------------------------------------------------------------------------
# 2. Derive extra columns on the child tables
# ---------------------------------------------------------------------------
# NOTE(review): rows with payment_plan_days == 0 would produce inf/NaN in the
# two daily-price columns -- confirm whether such rows occur in the data.
trans['price_difference'] = trans['plan_list_price'] - trans['actual_amount_paid']
trans['planned_daily_price'] = trans['plan_list_price'] / trans['payment_plan_days']
trans['daily_price'] = trans['actual_amount_paid'] / trans['payment_plan_days']

# NOTE(review): a log row whose five play counts sum to 0 would likewise give
# inf/NaN percentages -- confirm.
logs['total'] = logs[['num_25', 'num_50', 'num_75', 'num_985', 'num_100']].sum(axis=1)
logs['percent_100'] = logs['num_100'] / logs['total']
logs['percent_unique'] = logs['num_unq'] / logs['total']

# ---------------------------------------------------------------------------
# 3. Assemble the entityset: members (parent) -> transactions, logs (children)
# ---------------------------------------------------------------------------
es = ft.EntitySet(id='customers')

es.entity_from_dataframe(
    entity_id='members',
    dataframe=members,
    index='msno',
    time_index='registration_init_time',
    variable_types={
        'city': vtypes.Categorical,
        'bd': vtypes.Categorical,
        'registered_via': vtypes.Categorical,
    },
)

# The child tables get a generated index column (make_index=True)
es.entity_from_dataframe(
    entity_id='transactions',
    dataframe=trans,
    index='transactions_index',
    make_index=True,
    time_index='transaction_date',
    variable_types={
        'payment_method_id': vtypes.Categorical,
        'is_auto_renew': vtypes.Boolean,
        'is_cancel': vtypes.Boolean,
    },
)

# Interesting values on the two transaction flags
es['transactions']['is_cancel'].interesting_values = [0, 1]
es['transactions']['is_auto_renew'].interesting_values = [0, 1]

es.entity_from_dataframe(
    entity_id='logs',
    dataframe=logs,
    index='logs_index',
    make_index=True,
    time_index='date',
)

# Both child tables link to members through the `msno` column
r_member_transactions = ft.Relationship(es['members']['msno'], es['transactions']['msno'])
r_member_logs = ft.Relationship(es['members']['msno'], es['logs']['msno'])
es.add_relationships([r_member_transactions, r_member_logs])

# ---------------------------------------------------------------------------
# 4. Run deep feature synthesis
# ---------------------------------------------------------------------------
agg_primitives = [
    'sum', 'time_since_last', 'avg_time_between', 'all', 'mode', 'num_unique',
    'min', 'last', 'mean', 'percent_true', 'max', 'std', 'count',
]
trans_primitives = ['weekend', 'cum_sum', 'day', 'month', 'diff', 'time_since_previous']
where_primitives = ['sum', 'count', 'mean', 'percent_true', 'all', 'any']

# features_only=False: compute the feature matrix as well as the definitions
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='members',
    cutoff_time=cutoff_times,
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    where_primitives=where_primitives,
    max_depth=2,
    features_only=False,
    chunk_size=100,
    n_jobs=1,
    verbose=1,
)

In [None]:
# Persist the feature definitions so a later session can reload them with
# ft.load_features instead of re-running deep feature synthesis.
ft.save_features(feature_defs, '/data/churn/features.txt')

In [None]:
import featuretools as ft
import featuretools.variable_types as vtypes 

# Reload the saved feature definitions from disk and report how many there are.
feature_defs = ft.load_features('/data/churn/features.txt')
print(f'There are {len(feature_defs)} features.')

## Writing Feature Matrix to S3 

In order to save each feature matrix, we'll want to write it to S3. For this we can use the `s3fs` (S3 file system) Python library. We first have to authenticate with AWS, and then we can upload our CSV in much the same way as we would write any other CSV file.

In [None]:
import s3fs

# The credentials file is read as comma-separated text: key first, secret second.
# Each field is stripped so a file written as "KEY, SECRET" or with a trailing
# newline still yields clean values.
with open('/data/credentials.txt', 'r') as f:
    info = [field.strip() for field in f.read().split(',')]

# Fail with a clear message (without echoing the contents) instead of a bare
# IndexError or an authentication failure later on.
if len(info) < 2 or not info[0] or not info[1]:
    raise ValueError("Expected '<key>,<secret>' in /data/credentials.txt")
key, secret = info[0], info[1]

fs = s3fs.S3FileSystem(key=key, secret=secret)

# `partition` and `feature_matrix` come from the cell that ran ft.dfs
directory = 's3://customer-churn-spark/partitions/p' + str(partition)

# Encode in order to write to s3 (the file is opened in binary mode)
bytes_to_write = feature_matrix.to_csv(None).encode()

# Write to s3
with fs.open(f'{directory}/feature_matrix.csv', 'wb') as f:
    f.write(bytes_to_write)

# Partition to Feature Matrix

This is the function that will be parallelized. For a single partition, we want to define the entityset, add the entities and relationships, calculate the feature matrix, and save the feature matrix to the partition directory.

In [None]:
# Total number of partitions; also used to report progress
N_PARTITIONS = 1000

def partition_to_feature_matrix(partition, cutoff_times_file, feature_defs=feature_defs):
    """Take in a partition number, create a feature matrix, and save it to S3

    Reads the members, transactions, logs and cutoff-time CSVs from the
    partition's S3 directory, keeps only customers that have a cutoff time,
    builds the entityset, calculates `feature_defs` and writes the result to
    `feature_matrix.csv` in the same directory.

    Relies on the module-level names `pd`, `ft`, `vtypes`, `fs` (an open
    s3fs file system) and `N_PARTITIONS`.

    Params
    --------
        partition (int): number of partition
        cutoff_times_file (str): name of cutoff time file
        feature_defs (list of ft features): features to make for the partition

    Return
    --------
        None: saves the feature matrix to the partition directory on S3

    """
    directory = 's3://customer-churn-spark/partitions/p' + str(partition)

    # Read in the data files
    members = pd.read_csv(f'{directory}/members.csv',
                          parse_dates=['registration_init_time'],
                          infer_datetime_format=True,
                          dtype={'gender': 'category'})

    trans = pd.read_csv(f'{directory}/transactions.csv',
                        parse_dates=['transaction_date', 'membership_expire_date'],
                        infer_datetime_format=True)

    logs = pd.read_csv(f'{directory}/logs.csv', parse_dates=['date'])

    cutoff_times = pd.read_csv(f'{directory}/{cutoff_times_file}', parse_dates=['cutoff_time'])
    cutoff_times = cutoff_times.drop_duplicates()

    labeled_customers = set(cutoff_times['msno'])

    # Subset to only customers with labels. `trans` and `logs` get new columns
    # below, so take explicit copies: assigning to a boolean-filtered frame
    # raises pandas' SettingWithCopyWarning.
    members = members[members['msno'].isin(labeled_customers)]
    trans = trans[trans['msno'].isin(labeled_customers)].copy()
    logs = logs[logs['msno'].isin(labeled_customers)].copy()

    # Create empty entityset
    es = ft.EntitySet(id='customers')

    # Add the members parent table
    es.entity_from_dataframe(entity_id='members', dataframe=members,
                             index='msno', time_index='registration_init_time',
                             variable_types={'city': vtypes.Categorical, 'bd': vtypes.Categorical,
                                             'registered_via': vtypes.Categorical})

    # Create new features in transactions
    trans['price_difference'] = trans['plan_list_price'] - trans['actual_amount_paid']
    trans['planned_daily_price'] = trans['plan_list_price'] / trans['payment_plan_days']
    trans['daily_price'] = trans['actual_amount_paid'] / trans['payment_plan_days']

    # Add the transactions child table
    es.entity_from_dataframe(entity_id='transactions', dataframe=trans,
                             index='transactions_index', make_index=True,
                             time_index='transaction_date',
                             variable_types={'payment_method_id': vtypes.Categorical,
                                             'is_auto_renew': vtypes.Boolean, 'is_cancel': vtypes.Boolean})

    # Add transactions interesting values
    es['transactions']['is_cancel'].interesting_values = [0, 1]
    es['transactions']['is_auto_renew'].interesting_values = [0, 1]

    # Create new features in logs
    logs['total'] = logs[['num_25', 'num_50', 'num_75', 'num_985', 'num_100']].sum(axis=1)
    logs['percent_100'] = logs['num_100'] / logs['total']
    logs['percent_unique'] = logs['num_unq'] / logs['total']

    # Add the logs child table
    es.entity_from_dataframe(entity_id='logs', dataframe=logs,
                             index='logs_index', make_index=True,
                             time_index='date')

    # Add the relationships
    r_member_transactions = ft.Relationship(es['members']['msno'], es['transactions']['msno'])
    r_member_logs = ft.Relationship(es['members']['msno'], es['logs']['msno'])
    es.add_relationships([r_member_transactions, r_member_logs])

    # Calculate the feature matrix, with chunk_size set to the number of member rows
    feature_matrix = ft.calculate_feature_matrix(entityset=es,
                                                 features=feature_defs,
                                                 cutoff_time=cutoff_times,
                                                 chunk_size=len(es['members'].df))

    # Encode in order to write to s3
    bytes_to_write = feature_matrix.to_csv(None).encode()

    # Write to s3
    with fs.open(f'{directory}/feature_matrix.csv', 'wb') as f:
        f.write(bytes_to_write)

    # Report progress every 10th of number of partitions. The step is an
    # integer of at least 1 so the check also works when N_PARTITIONS is small
    # or not a multiple of 10.
    progress_step = max(N_PARTITIONS // 10, 1)
    if partition % progress_step == 0:
        # Scale to a percentage before rounding; rounding the raw fraction
        # first can only ever give 0 or 1.
        print(f'{round(100 * partition / N_PARTITIONS)}% complete.', end='\r')

### Test Function

Let's give the function a test with 2 different partitions.

In [None]:
from timeit import default_timer as timer

# Time one end-to-end run of the function on partition 950
start = timer()
partition_to_feature_matrix(
    partition=950,
    cutoff_times_file='monthly_labels_30.csv',
    feature_defs=feature_defs,
)
end = timer()
print(f'{round(end - start)} seconds elapsed.')

In [None]:
# Time one end-to-end run of the function on partition 530
start = timer()
partition_to_feature_matrix(
    partition=530,
    cutoff_times_file='monthly_labels_30.csv',
    feature_defs=feature_defs,
)
end = timer()
print(f'{round(end - start)} seconds elapsed.')

# Run with Spark

The next cell runs all of the partitions with Spark. We have 3 workers, each of which has 4 cores. 

In [None]:
from timeit import default_timer as timer

# One work item per partition number
partitions = list(range(N_PARTITIONS))

start = timer()
# NOTE(review): this context is created from the master URL only; the `conf`
# object built for the Pi test is not passed here, so its executor settings
# are not supplied by this call -- confirm that is intended.
sc = pyspark.SparkContext(master = 'spark://ip-172-31-8-174.ec2.internal:7077',
                          appName = 'featuretools')
try:
    # One slice per partition, derived from the list itself so it stays correct
    # if N_PARTITIONS changes. The function returns None, so `r` is a list of
    # None with one entry per partition; collect() is what triggers the work.
    r = sc.parallelize(partitions,
                       numSlices=max(len(partitions), 1)).map(
        lambda x: partition_to_feature_matrix(x, 'monthly_labels_30.csv', feature_defs)
    ).collect()
finally:
    # Stop the context even when the job fails; otherwise it stays active and
    # the next attempt to create a SparkContext in this kernel is rejected.
    sc.stop()
end = timer()

In [None]:
# `start` and `end` are the timer readings taken around the Spark job.
print(f'Complete run took {round(end - start)} seconds.')

While the run is going on, we can look at the status of the cluster at localhost:8080 and the state of the particular job at localhost:4040. 