In [60]:
# default_exp preprocess.label_data

# Labelling customers as churners or non-churners

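> Label every customer as a churner (1) or non-churner (0), depending on whether any of the customer's accounts is still active at a given reference date.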

In [61]:
#hide
from nbdev.showdoc import *

In [1]:
#export
import os
import pandas as pd
from sample_project import config
from sample_project.helper import write_to_csv, read_from_csv
from fastcore.utils import store_attr
import numpy as np

In [2]:
#hide
# Suppress every warning raised from here on (presumably to keep the
# notebook output clean - NOTE(review): this also hides pandas deprecation
# and dtype warnings coming from the labelling code below).
import warnings
warnings.filterwarnings("ignore")

In [4]:
#export
class Label_Data:

    '''
    Label every customer of the transaction dataset as churner (1) or non-churner (0).

    The labelling is done in three steps:
    1. Repair `end_date` in the account information dataset: when the recorded end date
       is earlier than the account's last transaction date, the last transaction date
       is used instead.
    2. Label each account as active (1) or not active (0) with respect to `reference_date`.
    3. Sum the activeness of all accounts of a customer and label the customer as churner
       when no account is active anymore.

    Args:
        trnx_dataset (str or pandas DataFrame): Csv file name of the transaction dataset, or the
            dataset itself. Needs at least these fields: "account_id", "client_id", "date".
            Defaults to config.CSV_CUSTOMIZED_TRNX.
        account_info_dataset (str or pandas DataFrame): Csv file name of the account info dataset,
            or the dataset itself. Needs at least these fields: "account_id", "start_date", "end_date".
            Defaults to config.CSV_ACCOUNT_INFO.
        reference_date (integer): The date to be considered in the activeness check of accounts.
            It is compared directly with "date", "start_date" and "end_date", so it has to use
            the same encoding as those columns.
        to_csv (boolean): If True, the labelled dataframe is also written to config.CSV_LABELLED_TRNX.

    Return (when the instance is called):
        labelled_data (pandas DataFrame): the transaction dataset with an extra "churn_or_not" column.

    '''

    def __init__(self, trnx_dataset=None, account_info_dataset=None, reference_date=981231, to_csv=True):

        # Keeps reference_date and to_csv (and the raw dataset arguments, which are
        # replaced by the loaded dataframes right below) as instance attributes.
        store_attr()

        # `is None` instead of `== None`: comparing a DataFrame with `==` is element-wise
        # and cannot be used as an `if` condition.
        if trnx_dataset is None:
            trnx_dataset = config.CSV_CUSTOMIZED_TRNX

        self.trnx_dataset = self._as_frame(trnx_dataset)

        if account_info_dataset is None:
            account_info_dataset = config.CSV_ACCOUNT_INFO

        self.account_info_dataset = self._as_frame(account_info_dataset)

    @staticmethod
    def _as_frame(source):
        '''Return `source` unchanged if it already is a DataFrame, otherwise read it with read_from_csv.'''
        if isinstance(source, pd.DataFrame):
            return source
        return read_from_csv(source)

    def __call__(self):
        '''Run all labelling steps and return the transaction dataset with a "churn_or_not" column.'''

        self._fix_error_in_account_info_dataset()
        self._label_accounts()
        self._label_customers()
        return_data = self._merge_label_with_main_data()

        if self.to_csv:

            write_to_csv(return_data, config.CSV_LABELLED_TRNX)

        return return_data

    def _fix_error_in_account_info_dataset(self):
        '''Add "Last_Trnx_Date" per account and push "end_date" forward to it when it is earlier.'''

        last_trnx = (self.trnx_dataset
                     .groupby("account_id", as_index=False)["date"].max()
                     .rename(columns={"date": "Last_Trnx_Date"})
                    )

        # Drop a "Last_Trnx_Date" left over from a previous run (or already present in the
        # input). Otherwise the merge would produce "Last_Trnx_Date_x"/"Last_Trnx_Date_y"
        # and the lookup below would raise a KeyError on the second call.
        accounts = (self.account_info_dataset
                    .drop(columns=["Last_Trnx_Date"], errors="ignore")
                    .merge(last_trnx, on="account_id", how="left")
                   )

        # Comparisons with a missing end date or a missing last transaction date are False,
        # so those rows keep their original end_date.
        accounts["end_date"] = np.where(accounts["end_date"] < accounts["Last_Trnx_Date"],
                                        accounts["Last_Trnx_Date"],
                                        accounts["end_date"])
        self.account_info_dataset = accounts
        return

    def _label_accounts(self):
        '''Add "active_or_not": 1 when the account started before and has not ended by the reference date.'''

        accounts = self.account_info_dataset
        ended = accounts["end_date"] <= self.reference_date
        started = accounts["start_date"] < self.reference_date

        # A missing end_date compares as False, i.e. the account counts as "not ended".
        # assign() builds a new dataframe instead of writing into the current one.
        self.account_info_dataset = accounts.assign(
            active_or_not=np.where(ended, 0, np.where(started, 1, 0))
        )
        return

    def _label_customers(self):
        '''Build `labelled_cust_data` with one row per client and its "churn_or_not" label.'''

        account_owner = (self.trnx_dataset[["account_id", "client_id"]]
                         .drop_duplicates()
                         .merge(self.account_info_dataset[["account_id", "active_or_not"]], on="account_id", how="left")
                        )

        # Number of active accounts per client. Accounts without a row in the account info
        # dataset have no activeness value and add nothing to the sum.
        labelled = account_owner.groupby("client_id", as_index=False)["active_or_not"].sum()
        labelled["churn_or_not"] = np.where(labelled["active_or_not"] == 0, 1, 0)
        self.labelled_cust_data = labelled
        return

    def _merge_label_with_main_data(self):
        '''Return the transaction dataset with the customer level "churn_or_not" label attached.'''

        return self.trnx_dataset.merge(self.labelled_cust_data[["client_id", "churn_or_not"]], on="client_id", how="left")

In [5]:
#hide
# Run the whole labelling pipeline on the default input files from `config`.
# With to_csv=True the labelled result is also written to config.CSV_LABELLED_TRNX.
labeller = Label_Data( reference_date = 981231, to_csv=True)
labelled_trnx_data = labeller()
# Show the transactions together with their customer level `churn_or_not` label.
labelled_trnx_data

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,client_id,district_id,Total_Loan_Amount,churn_or_not
0,1121963,3834,930119,"""PRIJEM""","""VKLAD""",700.0,700.0,4620,54,23052.0,0
1,1121963,3834,930119,"""PRIJEM""","""VKLAD""",700.0,700.0,4621,54,23052.0,0
2,2809952,9307,930124,"""PRIJEM""","""VKLAD""",900.0,900.0,11461,70,41904.0,0
3,2809959,9307,930131,"""PRIJEM""","""VKLAD""",5282.0,6182.0,11461,70,41904.0,0
4,3479816,9307,930131,"""PRIJEM""","""""",0.8,6182.8,11461,70,41904.0,0
...,...,...,...,...,...,...,...,...,...,...,...
62108,3526939,11021,981231,"""PRIJEM""","""""",227.6,108432.0,13548,1,168984.0,0
62109,3521857,10478,981231,"""PRIJEM""","""""",161.2,31106.7,12890,70,267600.0,0
62110,3523087,10652,981231,"""PRIJEM""","""""",171.4,44224.8,13098,1,45720.0,0
62111,3625495,2872,981231,"""PRIJEM""","""""",67.9,17036.9,3471,1,268320.0,0
