In [1]:
# default_exp preprocess.extract_features

# Extaction features

> API details.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export
import os
import pandas as pd
from sample_project import config
from sample_project.helper import write_to_csv, read_from_csv
from fastcore.utils import store_attr
import numpy as np

In [4]:
#hide
import warnings
warnings.filterwarnings("ignore")

In [5]:
#export
def extract_features(df, to_csv=True, with_label=True):
    '''
    This function is built to extract two types of features which are:
    1. Total transaction amount within each date bin 
    2. Total transaction count within each date bin
    
    Args:
        df (Pandas DataFrame): The transaction dataset which has at least these fields: "account_id","date","client_id"
        to_csv (boolean): If the returned dataframe is desired to be written into csv file
        with_label (boolean): If label is also asked to be in the data
    
    Return:
        feats (pandas DataFrame): the customer list together with extracted features 
    '''
    
    df["date_bin"] = pd.cut(df.date, 10, labels=np.arange(0,10))
    
    feats = df.groupby(["client_id","date_bin"]).agg({'amount':['sum','count']})
    feats.columns = ["sum","count"]
    feats.reset_index(inplace=True)
    
    feats = feats.pivot( index="client_id",columns='date_bin', values=["sum","count"])
    feats = feats.fillna(0).reset_index()
    feats.columns=["client_id"]+["{}_bin_{}".format(j,i) for i in np.arange(0,10) for j in ["sum","count"]] 
    
    if with_label: feats = feats.merge(df[["client_id","churn_or_not"]].drop_duplicates(),on="client_id",how="left")
    if to_csv: write_to_csv(feats, config.CSV_CUST_FEATS)
    
    return feats

In [6]:
#hide
df = read_from_csv(config.CSV_LABELLED_TRNX)
extract_features(df)

Unnamed: 0,client_id,sum_bin_0,count_bin_0,sum_bin_1,count_bin_1,sum_bin_2,count_bin_2,sum_bin_3,count_bin_3,sum_bin_4,...,count_bin_5,sum_bin_6,count_bin_6,sum_bin_7,count_bin_7,sum_bin_8,count_bin_8,sum_bin_9,count_bin_9,churn_or_not
0,2,429529.9,0.0,0.0,524233.8,0.0,0.0,524215.4,0.0,0.0,...,0.0,0.0,85.0,0.0,0.0,85.0,0.0,0.0,85.0,0
1,3,429529.9,0.0,0.0,524233.8,0.0,0.0,524215.4,0.0,0.0,...,0.0,0.0,85.0,0.0,0.0,85.0,0.0,0.0,85.0,0
2,25,0.0,0.0,0.0,0.0,0.0,0.0,320047.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0,90.0,0
3,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0
4,78,0.0,0.0,0.0,149168.8,0.0,0.0,1094839.6,0.0,0.0,...,0.0,0.0,7.0,0.0,0.0,85.0,0.0,0.0,97.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,13924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0
680,13955,0.0,0.0,0.0,0.0,0.0,0.0,639264.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,35.0,0.0,0.0,88.0,0
681,13956,0.0,0.0,0.0,0.0,0.0,0.0,639264.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,35.0,0.0,0.0,88.0,0
682,13968,0.0,0.0,0.0,137091.9,0.0,0.0,726317.1,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,81.0,0.0,0.0,95.0,0
