In [60]:
# default_exp preprocess.label_data

# Labelling customers as churners or non-churners

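> Label every customer as a churner (1) or non-churner (0) based on whether any of their accounts is still active at a reference date.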

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export
import os
import pandas as pd
from sample_project import config
from sample_project.helper import write_to_csv, read_from_csv
from fastcore.utils import store_attr
import numpy as np

In [4]:
#hide
import warnings
warnings.filterwarnings("ignore")

In [14]:
#export
def create_labelled_data(trnx_dataset=config.CSV_CUSTOMIZED_TRNX,
                         account_info_dataset=config.CSV_ACCOUNT_INFO,
                         reference_date=981231,
                         apply_all_steps=True,
                         to_csv=True):
    '''
    Attach a customer-level churn label to the transaction dataset.

    With apply_all_steps=True the pipeline is:
    1. Correct end_date in the account information dataset using the last transaction date
    2. Flag every account as active or not with respect to reference_date
    3. Mark a customer as churner when none of their accounts is active
    4. Merge the customer label back onto the transaction rows

     Args:
            trnx_dataset: Identifier handed to read_from_csv for the transaction data
                (defaults to config.CSV_CUSTOMIZED_TRNX). The loaded data needs at least
                "account_id", "client_id" and "date".
            account_info_dataset: Identifier handed to read_from_csv for the account info data
                (defaults to config.CSV_ACCOUNT_INFO). The loaded data needs at least
                "account_id", "start_date" and "end_date".
            reference_date (integer): Date that account start/end dates are compared against
                (appears to be encoded as YYMMDD - TODO confirm).
            apply_all_steps (boolean): If False, no labelling is done and the two loaded
                datasets are returned untouched.
            to_csv (boolean): If True the labelled result is passed to write_to_csv with
                config.CSV_LABELLED_TRNX. Only honoured when apply_all_steps is True.

        Return:
            labelled transaction data when apply_all_steps is True,
            otherwise the tuple (transaction data, account info data)

    '''

    transactions = read_from_csv(trnx_dataset)
    accounts = read_from_csv(account_info_dataset)

    # Step-by-step mode: give the caller the loaded inputs and stop here.
    if not apply_all_steps:
        return transactions, accounts

    accounts = fix_error_in_account_info_dataset(accounts, transactions)
    accounts = label_accounts(accounts, reference_date)
    customer_labels = label_customers(accounts, transactions)
    labelled = merge_label_with_main_data(transactions, customer_labels)

    if to_csv:
        write_to_csv(labelled, config.CSV_LABELLED_TRNX)

    return labelled
                
        
def fix_error_in_account_info_dataset(df_acc, df_trnx):
    '''
    Push an account's end_date forward to its last transaction date when the
    recorded end_date is earlier than that transaction.

     Args:
            df_acc (pandas DataFrame): Account info with at least "account_id" and "end_date"
            df_trnx (pandas DataFrame): Transactions with at least "account_id" and "date"

        Return:
            A new DataFrame: df_acc plus a "Last_Trnx_Date" column, with "end_date" corrected.
            The input df_acc is not modified.

    '''

    last_trnx_per_account = (
        df_trnx.groupby("account_id", as_index=False)["date"]
        .max()
        .rename(columns={"date": "Last_Trnx_Date"})
    )

    # Drop a Last_Trnx_Date left over from a previous call; otherwise the merge
    # would emit Last_Trnx_Date_x / Last_Trnx_Date_y and the lookup below would
    # raise KeyError. This keeps the function safe to re-run on its own output.
    df_acc = (
        df_acc.drop(columns="Last_Trnx_Date", errors="ignore")
        .merge(last_trnx_per_account, on="account_id", how="left")
    )

    # Accounts without any transaction have a missing Last_Trnx_Date; the
    # comparison is then False and the original end_date is kept.
    df_acc["end_date"] = np.where(
        df_acc["end_date"] < df_acc["Last_Trnx_Date"],
        df_acc["Last_Trnx_Date"],
        df_acc["end_date"],
    )

    return df_acc

def label_accounts(df_acc, reference_date):
    '''
    Flag each account as active (1) or not active (0) at reference_date.

    An account is active when it started strictly before reference_date and its
    end_date is not on or before reference_date. A missing end_date does not
    count as ended, so such an account is active if it started before the
    reference date.

     Args:
            df_acc (pandas DataFrame): Account info with at least "start_date" and "end_date"
            reference_date (integer): Date the start/end dates are compared against

        Return:
            A new DataFrame equal to df_acc with an added "active_or_not" column.
            The input df_acc is left untouched.

    '''

    ended = df_acc["end_date"] <= reference_date
    started = df_acc["start_date"] < reference_date

    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a column.
    return df_acc.assign(active_or_not=np.where(~ended & started, 1, 0))

def label_customers(df_acc, df_trnx):
    '''
    Derive a churn label per customer from the activity flags of their accounts.

     Args:
            df_acc (pandas DataFrame): Account info with "account_id" and "active_or_not"
            df_trnx (pandas DataFrame): Transactions with "account_id" and "client_id"

        Return:
            DataFrame with one row per client_id and the columns
            "client_id", "active_or_not" (sum of the account flags) and
            "churn_or_not" (1 when that sum is 0, else 0)

    '''

    # Unique (account, client) pairs taken from the transactions.
    account_client_pairs = df_trnx[["account_id", "client_id"]].drop_duplicates()
    account_flags = df_acc[["account_id", "active_or_not"]]

    pairs_with_flags = account_client_pairs.merge(account_flags, on="account_id", how="left")

    # Number of active accounts per customer.
    per_client = pairs_with_flags.groupby("client_id", as_index=False)["active_or_not"].sum()

    no_active_account = per_client["active_or_not"] == 0
    per_client["churn_or_not"] = np.where(no_active_account, 1, 0)

    return per_client

def merge_label_with_main_data(df_trnx, labelled_cust_data ):
    '''
    Add the customer churn label to every transaction row.

     Args:
            df_trnx (pandas DataFrame): Transactions with a "client_id" column
            labelled_cust_data (pandas DataFrame): Customer labels with "client_id" and "churn_or_not"

        Return:
            df_trnx left-joined with "churn_or_not" on "client_id"; transactions whose
            client has no label get a missing value

    '''

    client_labels = labelled_cust_data[["client_id", "churn_or_not"]]
    labelled_trnx = df_trnx.merge(client_labels, how="left", on="client_id")
    return labelled_trnx

## Create Labelled Transactional Data Automatically

In [13]:
#hide
# Run the whole labelling pipeline in one call. With to_csv = True the result
# is also handed to write_to_csv (target: config.CSV_LABELLED_TRNX).
labelled_trnx_data = create_labelled_data(reference_date = 981231, apply_all_steps = True, to_csv = True)

labelled_trnx_data

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,client_id,district_id,Total_Loan_Amount,churn_or_not
0,1548749,5270,930113,"""PRIJEM""","""VKLAD""",800.0,800.0,6367,44,79608.0,0
1,1548750,5270,930114,"""PRIJEM""","""PREVOD Z UCTU""",44749.0,45549.0,6367,44,79608.0,0
2,3393738,11265,930114,"""PRIJEM""","""VKLAD""",1000.0,1000.0,13845,15,52788.0,0
3,3122924,10364,930117,"""PRIJEM""","""VKLAD""",1100.0,1100.0,12754,55,21924.0,0
4,3122924,10364,930117,"""PRIJEM""","""VKLAD""",1100.0,1100.0,12755,55,21924.0,0
...,...,...,...,...,...,...,...,...,...,...,...
94778,2548927,8411,961231,"""VYDAJ""","""VYBER""",14.6,20259.5,10389,20,220620.0,0
94779,2592488,8564,961231,"""VYDAJ""","""VYBER""",14.6,44131.0,10563,68,76680.0,0
94780,2592488,8564,961231,"""VYDAJ""","""VYBER""",14.6,44131.0,10564,68,76680.0,0
94781,516795,1766,961231,"""VYDAJ""","""VYBER""",14.6,27107.3,2141,28,30060.0,0


## Check step by step

In [15]:
#hide
# apply_all_steps = False skips the labelling and returns the two loaded
# datasets (transactions, account info) so each step can be run by hand below.
df_trnx, df_acc = create_labelled_data(reference_date = 981231, apply_all_steps = False, to_csv = False)

In [17]:
#hide
# Step 1: add Last_Trnx_Date per account and raise end_date to it where the
# recorded end_date is earlier than the last transaction.
df_acc_fixed = fix_error_in_account_info_dataset(df_acc, df_trnx)
df_acc_fixed

Unnamed: 0,account_id,district_id,frequency,start_date,end_date,sample,Last_Trnx_Date
0,576,55,"""POPLATEK MESICNE""",930101,998106.0,0.0,
1,3818,74,"""POPLATEK MESICNE""",930101,998106.0,0.0,
2,704,55,"""POPLATEK MESICNE""",930101,998106.0,0.0,
3,2378,16,"""POPLATEK MESICNE""",930101,998106.0,0.0,
4,2632,24,"""POPLATEK MESICNE""",930102,998106.0,0.0,
...,...,...,...,...,...,...,...
4495,124,55,"""POPLATEK MESICNE""",971228,998106.0,0.0,
4496,3958,59,"""POPLATEK MESICNE""",971228,998106.0,0.0,
4497,777,30,"""POPLATEK MESICNE""",971228,1007289.0,0.0,
4498,1573,63,"""POPLATEK MESICNE""",971229,1007289.0,0.0,


In [19]:
#hide
# Step 2: flag each account as active (1) or not (0) relative to the reference date.
df_acc_labelled = label_accounts(df_acc_fixed,reference_date=981231)
df_acc_labelled

Unnamed: 0,account_id,district_id,frequency,start_date,end_date,sample,Last_Trnx_Date,active_or_not
0,576,55,"""POPLATEK MESICNE""",930101,998106.0,0.0,,1
1,3818,74,"""POPLATEK MESICNE""",930101,998106.0,0.0,,1
2,704,55,"""POPLATEK MESICNE""",930101,998106.0,0.0,,1
3,2378,16,"""POPLATEK MESICNE""",930101,998106.0,0.0,,1
4,2632,24,"""POPLATEK MESICNE""",930102,998106.0,0.0,,1
...,...,...,...,...,...,...,...,...
4495,124,55,"""POPLATEK MESICNE""",971228,998106.0,0.0,,1
4496,3958,59,"""POPLATEK MESICNE""",971228,998106.0,0.0,,1
4497,777,30,"""POPLATEK MESICNE""",971228,1007289.0,0.0,,1
4498,1573,63,"""POPLATEK MESICNE""",971229,1007289.0,0.0,,1


In [20]:
#hide
# Step 3: sum the account flags per client; a client with a sum of 0 gets churn_or_not = 1.
df_cust_labelled = label_customers(df_acc_labelled,df_trnx)
df_cust_labelled

Unnamed: 0,client_id,active_or_not,churn_or_not
0,2,1,0
1,3,1,0
2,25,1,0
3,31,1,0
4,78,1,0
...,...,...,...
679,13924,1,0
680,13955,1,0
681,13956,1,0
682,13968,1,0


In [21]:
#hide
# Step 4: left-join the customer churn label onto every transaction row.
labelled_trnx_data = merge_label_with_main_data(df_trnx, df_cust_labelled)
labelled_trnx_data

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,client_id,district_id,Total_Loan_Amount,churn_or_not
0,1548749,5270,930113,"""PRIJEM""","""VKLAD""",800.0,800.0,6367,44,79608.0,0
1,1548750,5270,930114,"""PRIJEM""","""PREVOD Z UCTU""",44749.0,45549.0,6367,44,79608.0,0
2,3393738,11265,930114,"""PRIJEM""","""VKLAD""",1000.0,1000.0,13845,15,52788.0,0
3,3122924,10364,930117,"""PRIJEM""","""VKLAD""",1100.0,1100.0,12754,55,21924.0,0
4,3122924,10364,930117,"""PRIJEM""","""VKLAD""",1100.0,1100.0,12755,55,21924.0,0
...,...,...,...,...,...,...,...,...,...,...,...
94778,2548927,8411,961231,"""VYDAJ""","""VYBER""",14.6,20259.5,10389,20,220620.0,0
94779,2592488,8564,961231,"""VYDAJ""","""VYBER""",14.6,44131.0,10563,68,76680.0,0
94780,2592488,8564,961231,"""VYDAJ""","""VYBER""",14.6,44131.0,10564,68,76680.0,0
94781,516795,1766,961231,"""VYDAJ""","""VYBER""",14.6,27107.3,2141,28,30060.0,0
