## Calculation of payment plan days (looping through groupby)

In [31]:
# refine number of days to match plans
def match_plan(df_in):
    """Snap near-monthly ``delta_days`` values to a 30-day plan.

    Every row whose ``delta_days`` lies in the closed interval [28, 32] is
    overwritten with 30. Rows outside that interval (including NaN) are left
    as they are.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame holding a ``delta_days`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # between() is inclusive on both ends by default
    near_monthly = df_in['delta_days'].between(28, 32)
    df_in.loc[near_monthly, 'delta_days'] = 30
    return df_in

In [32]:
def compute_plan_days(df_current):
    """Compute ``delta_days`` (length of each paid period) for one user's rows.

    The rows are processed in their current order, so the caller is expected
    to pass them already sorted chronologically.

    Steps:

    1. ``prior_expiration`` of row *i* is set to ``membership_expire_date`` of
       row *i - 1*; the first row keeps whatever value it already had.
    2. For non-cancellation rows whose ``transaction_date`` is on or before
       ``prior_expiration``, ``delta_days`` is the number of whole days
       between ``prior_expiration`` and ``membership_expire_date``.
    3. For the remaining non-cancellation rows still missing ``delta_days``,
       it is the number of whole days between ``transaction_date`` and
       ``membership_expire_date``.
    4. Anything still missing (cancellation rows) is forward-filled from the
       previous row; a missing value in the very first row stays NaN.

    Parameters
    ----------
    df_current : pandas.DataFrame
        Frame with ``transaction_date``, ``membership_expire_date`` (both
        datetime-like) and ``is_cancel`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``prior_expiration`` and
        ``delta_days`` filled.
    """
    one_day = pd.Timedelta('1 day')

    # offset date by one row (must use .values otherwise pandas would align on
    # the index labels and undo the shift)
    df_current.loc[df_current.index[1:], 'prior_expiration'] = \
        df_current.loc[df_current.index[:-1], 'membership_expire_date'].values

    # compute delta days when transaction date is anterior to prior expiration
    # date and no active cancellation.
    # Dividing by a one-day Timedelta replaces .astype('timedelta64[D]'), which
    # pandas >= 2.0 rejects (only s/ms/us/ns resolutions are supported).
    crit_mem = (df_current.transaction_date <= df_current.prior_expiration) & (df_current.is_cancel == False)
    df_current.loc[crit_mem, 'delta_days'] = np.floor(
        (df_current.loc[crit_mem, 'membership_expire_date']
         - df_current.loc[crit_mem, 'prior_expiration']) / one_day)

    # take care of one time occurrence when transaction date and expiration
    # date do not overlap
    crit_unq = (df_current.delta_days.isnull()) & (df_current.is_cancel == False)
    df_current.loc[crit_unq, 'delta_days'] = np.floor(
        (df_current.loc[crit_unq, 'membership_expire_date']
         - df_current.loc[crit_unq, 'transaction_date']) / one_day)

    # forward fill for active cancellation (all remaining NaN except if first
    # record in current df); .ffill() replaces the deprecated
    # fillna(method='ffill')
    df_current['delta_days'] = df_current['delta_days'].ffill()

    return df_current

In [157]:
# 2,363,626 users
# df_group.groups.keys()
import time

In [54]:
# Group the transactions per `msno` so each group can be processed on its own;
# sort=False asks pandas not to sort the group keys.
df_group = df_transac.groupby('msno', sort=False)

In [57]:
# Timing notes from successive versions of this loop:
# - 345 s for 100 groups => 3.45 s per group, i.e. 92 days of running time: not acceptable.
# - Not dropping any records and keeping track of them instead: 28.5 s for 100 => 0.285 s per group, i.e. 7 days.
# - In addition not making changes and keeping track of them: 11 s for 100 => 0.11 s per group, i.e. 3 days
#   (1.7 days if msno is categorical).
# get start time of timer for processing time
start_time = time.time()
counter = 0
# index labels of the rows flagged for removal, accumulated over all groups
record2remove = []

for user_name, user_df in df_group:
    # get transactions from user, in chronological order, and compute their delta days
    df_current = compute_plan_days(user_df.sort_values(['transaction_date', 'membership_expire_date']))
    
    # flag any remaining NaN and non-positive (<= 0) delta days for removal
    crit_rm = (df_current.delta_days.isnull())  | ( df_current.delta_days <= 0 )
    
    # record them instead of dropping them from the master dataframe right away
#     df_transac.drop(df_current[crit_rm].index, inplace = True)
    record2remove.extend(df_current[crit_rm].index)
    # filter the local copy only after crit_rm has been used to collect the index labels
    df_current = df_current[~crit_rm].copy()

    # adjust days to match monthly plan when applicable
    df_current = match_plan(df_current)

    # locate 0-days payment plans, which are to be replaced by delta days
    crit_rpl = df_current.payment_plan_days == 0
    # df_current.loc[crit_rpl, 'payment_plan_days'] = df_current.loc[crit_rpl, 'delta_days'].astype('int64')
    # index labels of the rows to change in the master dataframe
    # NOTE(review): index2change is currently unused because the assignment below is commented out
    index2change = df_current[crit_rpl].index
#     df_transac.loc[index2change, 'payment_plan_days'] = df_current.loc[crit_rpl, 'delta_days'].astype('int64')
    
    # benchmark cap: stop early instead of looping over every group
    # NOTE(review): counter is incremented before the test, so `> 100` stops after 101 groups, not 100
    counter += 1
    if counter > 100: break

print("--- %s seconds ellapsed in for loop---" % (time.time() - start_time))

--- 6.549419403076172 seconds ellapsed in for loop---


### Unique transaction, missing payment plan days replacement strategy
Using the unique transactions whose payment plan days are not missing, find the most common actual amount paid for each payment plan.
When the same amount is linked to several payment plan days, keep the plan days with the highest transaction count.

In [None]:
# discard active cancellation, unique transaction with missing payment days
# Flag rows that satisfy all three conditions: msno is listed in users_unq_trans,
# payment_plan_days is 0 (missing) and the row is an active cancellation.
# presumably users_unq_trans holds the users with a single transaction - verify where it is built
crit_unq_cancel = (df_transac.msno.isin(users_unq_trans)) & (df_transac.payment_plan_days == 0) & (df_transac.is_cancel == True)
# keep every row that is not flagged (rebinds df_transac to the filtered frame)
df_transac = df_transac.loc[ ~crit_unq_cancel ,:]

In [329]:
# Build a lookup from "most common amount paid" to "payment plan days".
# Reference rows: msno listed in users_unq_trans, payment plan days present
# (non-zero) and no active cancellation.
crit_fill_plan = (df_transac.msno.isin(users_unq_trans)) & (df_transac.payment_plan_days != 0) & (df_transac.is_cancel == False)

df_plan_grouped = df_transac.loc[ crit_fill_plan, ['payment_plan_days', 'actual_amount_paid']].groupby('payment_plan_days')

# most common actual amount paid and its associated count.
# Series.mode() always returns a Series and holds several values when amounts
# tie; agg() needs a scalar per group, so keep the first (smallest) mode
# instead of failing with "Must produce aggregated value".
df_mcommon = df_plan_grouped.agg([lambda x: x.mode().iloc[0], 'size'] )

# remove top level column (the 'actual_amount_paid' level added by agg)
df_mcommon.columns = df_mcommon.columns.droplevel(0)

# rename columns
df_mcommon.columns = ['top_actual_amount', 'freq_count']

# group by top actual amount paid and keep the payment plan days with the largest count
s_mcommon = df_mcommon.groupby('top_actual_amount')['freq_count'].nlargest(n=1)

# move second level index (payment plan days) to a column
s_mcommon = s_mcommon.reset_index(level = 1)

# drop frequency count column (not needed anymore)
s_mcommon.drop('freq_count', axis = 1, inplace = True)

In [317]:
# df_plan_grouped.nth(0)
# df_plan_grouped.ngroup()
# df_plan_grouped.describe()
# len(df_plan_grouped)

In [308]:
# _ , axunq = plt.subplots(len(df_plan_grouped),1, figsize=(8, 80))

# for ind, (plan_name, plan_df) in enumerate(df_plan_grouped):
#     pl_x = plan_df['actual_amount_paid'].sort_values().values
#     pl_y = np.arange(1, len(pl_x)+1) / len(pl_x)
#     axunq[ind].plot(pl_x, pl_y, '.')
#     axunq[ind].set_ylabel('{:.2f}'.format(plan_name))