# Introduction: Feature Engineering with Spark

In this notebook, we run the feature engineering with Spark. We start on a single machine before distributing the work across a cluster.

In [1]:
import findspark
findspark.init('/usr/local/spark')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pyspark
import random
sc = pyspark.SparkContext(appName="Pi")
num_samples = 100000000
def inside(p):
    """Return True when a random point in the unit square falls inside the unit quarter circle.

    `p` is the element handed over by the RDD filter; it is not used.
    """
    x, y = random.random(), random.random()
    return x*x + y*y < 1
try:
    # Monte Carlo estimate: the fraction of points inside the quarter circle approximates pi / 4
    count = sc.parallelize(range(0, num_samples)).filter(inside).count()
    pi = 4 * count / num_samples
    print(pi)
finally:
    # Always release the context, even if the job fails: a context left running
    # prevents the SparkContext created later in this notebook from starting.
    sc.stop()

3.14128384


In [2]:
import featuretools as ft
import featuretools.variable_types as vtypes 

# Load the feature definitions from file; they are passed to
# partition_to_feature_matrix to compute the feature matrix of each partition.
feature_defs = ft.load_features('/data/churn/features.txt')
print(f'There are {len(feature_defs)} features.')

import pandas as pd
import numpy as np

# Number of data partitions; partition_to_feature_matrix uses it for progress reporting
N_PARTITIONS = 1000

from timeit import default_timer as timer

There are 230 features.


# Partition to Feature Matrix

This is the function that will be parallelized. For a single partition, we want to define the entityset, add the entities and relationships, calculate the feature matrix, and save the feature matrix to the partition directory.

In [6]:
def partition_to_feature_matrix(partition, cutoff_times_file, feature_defs=feature_defs):
    """Take in a partition number, create a feature matrix, and save to disk

    Reads the members, transactions, logs and cutoff time files of the partition,
    keeps only the customers that appear in the cutoff times, builds a featuretools
    EntitySet from the three tables, calculates `feature_defs` at the cutoff times
    and writes the result to `feature_matrix.csv` in the partition directory.

    Params
    --------
        partition (int): number of partition
        cutoff_times_file (str): name of cutoff time file
        feature_defs (list of ft features): features to make for the partition

    Return
    --------
        None: saves the feature matrix to disk

    """
    directory = '/data/churn/partitions/p' + str(partition)

    # Read in the data files
    members = pd.read_csv(f'{directory}/members.csv',
                          parse_dates=['registration_init_time'],
                          infer_datetime_format = True,
                          dtype = {'gender': 'category'})

    trans = pd.read_csv(f'{directory}/transactions.csv',
                        parse_dates=['transaction_date', 'membership_expire_date'],
                        infer_datetime_format = True)

    logs = pd.read_csv(f'{directory}/logs.csv', parse_dates = ['date'])

    cutoff_times = pd.read_csv(f'{directory}/{cutoff_times_file}', parse_dates = ['cutoff_time'])
    cutoff_times = cutoff_times.drop_duplicates()

    labeled_customers = set(cutoff_times['msno'])

    # Subset to only customers with labels.
    # trans and logs get new columns below, so take explicit copies: assigning
    # columns to a filtered frame otherwise raises SettingWithCopyWarning.
    members = members[members['msno'].isin(labeled_customers)]
    trans = trans[trans['msno'].isin(labeled_customers)].copy()
    logs = logs[logs['msno'].isin(labeled_customers)].copy()

    # Create empty entityset
    es = ft.EntitySet(id = 'customers')

    # Add the members parent table
    es.entity_from_dataframe(entity_id='members', dataframe=members,
                             index = 'msno', time_index = 'registration_init_time',
                             variable_types = {'city': vtypes.Categorical, 'bd': vtypes.Categorical,
                                               'registered_via': vtypes.Categorical})

    # Create new features in transactions
    trans['price_difference'] = trans['plan_list_price'] - trans['actual_amount_paid']
    trans['planned_daily_price'] = trans['plan_list_price'] / trans['payment_plan_days']
    trans['daily_price'] = trans['actual_amount_paid'] / trans['payment_plan_days']

    # Add the transactions child table (no natural key, so an index column is created)
    es.entity_from_dataframe(entity_id='transactions', dataframe=trans,
                             index = 'transactions_index', make_index = True,
                             time_index = 'transaction_date',
                             variable_types = {'payment_method_id': vtypes.Categorical,
                                               'is_auto_renew': vtypes.Boolean, 'is_cancel': vtypes.Boolean})

    # Add transactions interesting values
    es['transactions']['is_cancel'].interesting_values = [0, 1]
    es['transactions']['is_auto_renew'].interesting_values = [0, 1]

    # Create new features in logs
    logs['total'] = logs[['num_25', 'num_50', 'num_75', 'num_985', 'num_100']].sum(axis = 1)
    logs['percent_100'] = logs['num_100'] / logs['total']
    logs['percent_unique'] = logs['num_unq'] / logs['total']

    # Add the logs child table (no natural key, so an index column is created)
    es.entity_from_dataframe(entity_id='logs', dataframe=logs,
                             index = 'logs_index', make_index = True,
                             time_index = 'date')

    # Add the relationships: members is the parent of both transactions and logs
    r_member_transactions = ft.Relationship(es['members']['msno'], es['transactions']['msno'])
    r_member_logs = ft.Relationship(es['members']['msno'], es['logs']['msno'])
    es.add_relationships([r_member_transactions, r_member_logs])

    # Calculate and save the feature matrix.
    # chunk_size is set to the number of rows in the members table.
    feature_matrix = ft.calculate_feature_matrix(entityset=es,
                                                 features=feature_defs,
                                                 cutoff_time=cutoff_times,
                                                 chunk_size = len(es['members'].df))

    feature_matrix.to_csv(f'{directory}/feature_matrix.csv')

    # Report progress every 10th of number of partitions.
    # Integer step (at least 1) keeps the modulo exact and avoids dividing by zero
    # when there are fewer than 10 partitions.
    progress_step = max(1, N_PARTITIONS // 10)
    if partition % progress_step == 0:
        # Scale to a percentage before rounding; rounding the raw fraction first
        # can only ever produce 0 or 100.
        print(f'{round(100 * partition / N_PARTITIONS)}% complete.', end = '\r')

In [None]:
from timeit import default_timer as timer

# Time the feature matrix calculation for a single partition (number 350)
start = timer()
partition_to_feature_matrix(350, 'monthly_labels_30.csv', feature_defs)
end = timer()
print(f'{round(end - start)} seconds elapsed.')

In [5]:
# Time the feature matrix calculation for a single partition (number 20)
start = timer()
partition_to_feature_matrix(20, 'monthly_labels_30.csv', feature_defs)
end = timer()
print(f'{round(end - start)} seconds elapsed.')

202 seconds elapsed.


In [7]:
# Spark configuration: enable event logging and write the logs to /usr/local/spark/tmp.
# The setters are chained (each returns the SparkConf itself) instead of being
# separate expression statements: with ast_node_interactivity = 'all' a trailing
# ';' on an inner line does not hide its repr, so the cell would print the
# SparkConf object once per call.
conf = (pyspark.SparkConf()
        .set('spark.eventLog.enabled', True)
        .set('spark.eventLog.dir', '/usr/local/spark/tmp'))
conf.getAll()

<pyspark.conf.SparkConf at 0x7fd87df2ce80>

<pyspark.conf.SparkConf at 0x7fd87df2ce80>

[('spark.eventLog.dir', '/usr/local/spark/tmp'),
 ('spark.eventLog.enabled', 'True'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]

# Run with Spark

The next cell runs the code with Spark. Each core computes the feature matrix for one partition at a time.

In [None]:
from timeit import default_timer as timer

start = timer()

# Partition numbers to process, one per partition directory.
# NOTE(review): assumes partitions are numbered 0 .. N_PARTITIONS - 1
# (directories p0, p1, ...) — confirm against the partitioning step.
partitions = list(range(N_PARTITIONS))

sc = pyspark.SparkContext(master = 'spark://ip-172-31-23-133.ec2.internal:7077',
                          appName = 'featuretools', conf = conf)
try:
    # One slice per partition so that each task builds exactly one feature matrix.
    # The function returns None, so r is only a list of placeholders; the real
    # output is the feature_matrix.csv written in each partition directory.
    r = sc.parallelize(partitions,
                       numSlices=len(partitions)).map(lambda x: partition_to_feature_matrix(x, 'monthly_labels_30.csv',
                                                                                            feature_defs)).collect()
finally:
    # Release the cluster resources even when a task fails
    sc.stop()
end = timer()