# Introduction: Partition Pipeline

In this notebook, we will work with a single partition to develop a pipeline for processing the data. The end goal is code that can take a partition on disk and generate a feature matrix from it. The pipeline will then be parallelized with Spark through PySpark.

In [31]:
import os

import numpy as np
import pandas as pd

import featuretools as ft

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [32]:
# Develop against a single partition directory (p0) and list the files it contains.
directory = '/data/churn/partitions/p0'
os.listdir(directory)

['transactions.csv', 'members.csv', 'test.csv', 'train.csv', 'logs.csv']

In [33]:
# Count the entries under the partitions root directory.
partitions_root = '/data/churn/partitions/'
all_partitions = os.listdir(partitions_root)
len(all_partitions)

711

In [11]:
# Read the five CSV tables of this partition into DataFrames, parsing the
# date columns while loading so that datetime accessors work on them later.
# NOTE(review): the previews show an 'Unnamed: 0' column in every table, which
# looks like a saved index -- confirm whether it should be dropped.
members = pd.read_csv(
    f'{directory}/members.csv',
    parse_dates=['registration_init_time'],
    infer_datetime_format=True,
)
trans = pd.read_csv(
    f'{directory}/transactions.csv',
    parse_dates=['transaction_date', 'membership_expire_date'],
    infer_datetime_format=True,
)
logs = pd.read_csv(f'{directory}/logs.csv', parse_dates=['date'])
train = pd.read_csv(f'{directory}/train.csv')
test = pd.read_csv(f'{directory}/test.csv')

In [12]:
# Preview the first rows of every table. All five previews are rendered (not
# just the last one) because the notebook sets
# InteractiveShell.ast_node_interactivity = "all".
members.head()
trans.head()
logs.head()
train.head()
test.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,7Bx3zw0JFFwCXjkWgqhMxnQQLzy7Idmc+J1qXU8GhAY=,13,25,female,9,2014-11-30
1,CxKIZcJY75vsiUlfeO9/J1Ld8fdAi3Y3JEK+ig7sXGQ=,1,0,,4,2015-11-21
2,w2O0vchmncaBEQLoC5Pn7qRoSPiJ6X2x5O+woVFDhnY=,13,0,,9,2014-06-05
3,Ad/jjTPiR09TQn3FDC2AxdoCrg2/N+umBYjGHt5GKMk=,1,0,,4,2016-12-25
4,7xNkNoVMi3Sm40SjyZ8IA3OHP6/+KJUEIBY7Zfeul18=,13,31,male,9,2015-09-24


Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,mg7wWEZpN18hS9cUpx7BgHx0GsA77IcfE4YiXxnMxsw=,33,30,149,149,1,2016-05-31,2016-06-30,0
1,v1LhGZzRFxYwWjW98zEG47ZGHfxiOK885mFyaRH2+CE=,32,180,536,536,0,2016-03-22,2016-09-18,0
2,2nuseI+OuZjgpRFuctjjK52kQDf6fEaXLvHMZ+/oUg4=,38,30,149,149,0,2016-10-20,2016-11-19,0
3,delh/z8YrqHIQWtzbQF8TU4dNaX4IXQgvkFYLwCuUAQ=,33,30,149,149,1,2015-11-30,2016-02-03,0
4,pfbBzgY9yynsa13gaAlrdoLjCvYnPXgoCiOIpbWLhPs=,41,30,149,149,1,2015-03-31,2015-04-30,0


Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,/LQ2vrNFkNhrsKQu9K22YKq65yidZzQJmq7fD8KYqXI=,2015-04-05,9,8,8,1,9,29,4664.321
1,/LQ2vrNFkNhrsKQu9K22YKq65yidZzQJmq7fD8KYqXI=,2015-07-18,10,1,0,0,24,20,6464.364
2,/LQ2vrNFkNhrsKQu9K22YKq65yidZzQJmq7fD8KYqXI=,2015-11-21,6,2,0,1,15,13,4293.397
3,/LQ2vrNFkNhrsKQu9K22YKq65yidZzQJmq7fD8KYqXI=,2016-03-29,2,0,1,0,2,4,628.291
4,/LQ2vrNFkNhrsKQu9K22YKq65yidZzQJmq7fD8KYqXI=,2016-12-17,0,1,0,0,12,4,3892.04


Unnamed: 0,msno,is_churn
0,40hbsrJVUFV4tTR4lZbNhdQ2SYfXdSjbNyRI3rwyCpM=,1
1,lnVIfviVfvaaubDQmGAhOK6oDJmBdVgIvlJuRMvWajM=,1
2,fgYoXPN3yAl6/frLAlp7c5z6Em1xVwwXcCV5RmNcSzs=,1
3,4JH4w5jFRxt4unMPhQX6yEr6d6Xw1vAauqUkX2MBtoo=,1
4,RSZE3ePxlJ2/Pndl7AmNxaKltkFIwPXE1tVk8ujQTWU=,1


Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,0
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,0
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,0
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,0


In [21]:
# Number of distinct churn labels recorded for each member in the training labels.
labels_per_member = train.groupby('msno')['is_churn'].nunique()
labels_per_member.head()

msno
+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=    2
+2rgJpEKJWYFwVkHKnSzQUnieMwfLMrHiJzCxK9AhGo=    2
+3KltBa/1dUuXwOzDKksw11Nwdwf7/pXv47sDv4mInY=    2
+4lC2x3ltrVTmmT3CgS+vuFD/1yzi97C6icTr7hFuRY=    2
+6KgKovFigr5lk3+G8srZUoUHhPS8a+rTa/N2Vg1wsg=    2
Name: is_churn, dtype: int64

In [15]:
# Per-member count of non-null values in each transactions column. Sorting
# ascending by the plan_list_price count and taking the tail shows the members
# with the most transaction rows.
per_member_counts = trans.groupby('msno').count()
per_member_counts.sort_values('plan_list_price').tail()

Unnamed: 0_level_0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
prvwhLDR5WjSgche9r0XTDuF2lbr9KpPKyqBPAAUwio=,41,41,41,41,41,41,41,41
8O9UXcStlZzME2YTyJDJf4m4WnfBV1FNOOBI1hoeueE=,42,42,42,42,42,42,42,42
AKZNktiVbiDxV6Un0C84th9hKQPn/NwGXNffbVbR9Ps=,42,42,42,42,42,42,42,42
zwhR3q2j/NM4e56g3ekekoyx/8s0Ghomij/3/BsSBWs=,44,44,44,44,44,44,44,44
fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,54,54,54,54,54,54,54,54


In [24]:
# Pull out every transaction of one example member (the member with the highest
# transaction count in the table above) and order the rows chronologically.
example_member = 'fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA='
belongs_to_example = trans['msno'] == example_member
ex = trans[belongs_to_example].copy().sort_values(['transaction_date', 'membership_expire_date'])
ex.head(10)

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
23433,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-01-08,2016-08-10,0
537,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,119,1,2015-01-08,2016-09-10,0
23080,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-01-11,2016-10-11,0
8397,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-02-08,2016-11-08,0
10817,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,119,1,2015-02-08,2016-12-06,0
7868,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-02-11,2017-01-03,0
11836,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-03-08,2017-02-03,0
24714,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,119,1,2015-03-08,2017-03-06,0
27308,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-03-11,2017-04-06,0
26136,fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA=,41,30,149,149,1,2015-04-08,2017-05-06,0


In [25]:
# Registration date of the example member. The boolean .loc selection keeps the
# result as a Series (a single row here) rather than a scalar.
is_example_member = members['msno'] == 'fItJlEs671EQOapBdqMZ/9zJe0/Mzzt6A5wCp4Iu/wA='
ex_start = members.loc[is_example_member, 'registration_init_time']
ex_start

1692   2013-10-09
Name: registration_init_time, dtype: datetime64[ns]

In [26]:
# Build the month-end timestamps from the member's registration month up to the
# start of 2018.
# `ex_start` is a one-row Series, so take the scalar Timestamp out of it
# explicitly instead of relying on implicit Series -> int coercion, and use
# pd.Timestamp because the `pd.datetime` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
registration_date = ex_start.iloc[0]
months = pd.date_range(pd.Timestamp(year=registration_date.year, month=registration_date.month, day=1),
                       pd.Timestamp(year=2018, month=1, day=1), freq = 'M')
len(months)

51

In [27]:
# Spot-check one element of the range: with freq='M' the entries are month-end dates.
months[1]

Timestamp('2013-11-30 00:00:00', freq='M')

In [30]:
statuses = []
is_subscribed = True

for month in months:
    if month < (ex['transaction_date'].min() - pd.Timedelta(30, 'D')):
        statuses.append(np.nan)
    else:
        status = 0 
        subset = ex.loc[(ex['transaction_date'].dt.year == month.year) & (ex['transaction_date'].dt.month == month.month)].copy()
        
        if any(subset['is_cancel'] == 1):
            is_subscribed = 0
                