In [11]:
# default_exp preprocess.extract_features

# Extaction features

> API details.

In [1]:
#hide
from nbdev.showdoc import *

In [1]:
#export
import os
import pandas as pd
from sample_project import config
from sample_project.helper import write_to_csv, read_from_csv
from fastcore.utils import store_attr
import numpy as np

In [2]:
#hide
import warnings
warnings.filterwarnings("ignore")

In [3]:
#export
def extract_features(df, to_csv=True, with_label=True):
    '''
    This function is built to extract two types of features which are:
    1. Total transaction amount within each date bin 
    2. Total transaction count within each date bin
    
    Args:
        df (Pandas DataFrame): The transaction dataset which has at least these fields: "account_id","date","client_id"
        to_csv (boolean): If the returned dataframe is desired to be written into csv file
        with_label (boolean): If label is also asked to be in the data
    
    Return:
        feats (pandas DataFrame): the customer list together with extracted features 
    '''
    
    df["date_bin"] = pd.cut(df.date, 10, labels=np.arange(0,10))
    
    feats = df.groupby(["client_id","date_bin"]).agg({'amount':['sum','count']})
    feats.columns = ["sum","count"]
    feats.reset_index(inplace=True)
    
    feats = feats.pivot( index="client_id",columns='date_bin', values=["sum","count"])
    feats = feats.fillna(0).reset_index()
    feats.columns=["client_id"]+["{}_bin_{}".format(j,i) for i in np.arange(0,10) for j in ["sum","count"]] 
    
    if with_label: feats = feats.merge(df[["client_id","churn_or_not"]].drop_duplicates(),on="client_id",how="left")
    if to_csv: write_to_csv(feats, "customer_list_w_feats.csv")
    
    return feats

In [4]:
#hide
df = read_from_csv(config.CSV_LABELLED_TRNX)
extract_features(df)

Unnamed: 0,client_id,sum_bin_0,count_bin_0,sum_bin_1,count_bin_1,sum_bin_2,count_bin_2,sum_bin_3,count_bin_3,sum_bin_4,...,count_bin_5,sum_bin_6,count_bin_6,sum_bin_7,count_bin_7,sum_bin_8,count_bin_8,sum_bin_9,count_bin_9,churn_or_not
0,2,429529.9,118920.7,405313.1,214904.4,309311.0,350379.9,212737.2,438670.3,124000.9,...,22.0,63.0,39.0,46.0,51.0,34.0,67.0,17.0,85.0,0
1,3,429529.9,118920.7,405313.1,214904.4,309311.0,350379.9,212737.2,438670.3,124000.9,...,22.0,63.0,39.0,46.0,51.0,34.0,67.0,17.0,85.0,0
2,25,0.0,0.0,0.0,36022.6,284025.1,278950.5,169142.5,310514.3,101337.3,...,0.0,0.0,8.0,46.0,55.0,35.0,59.0,19.0,81.0,0
3,31,0.0,0.0,0.0,0.0,0.0,900.0,878147.2,868048.3,252012.4,...,0.0,0.0,0.0,0.0,1.0,45.0,98.0,26.0,104.0,0
4,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,163457.9,128077.3,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,20.0,101.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,13924,0.0,0.0,0.0,0.0,0.0,0.0,74937.7,311447.8,84070.1,...,0.0,0.0,0.0,0.0,0.0,6.0,40.0,13.0,87.0,0
823,13955,0.0,0.0,0.0,200.0,639064.2,635662.3,458298.0,823055.2,297860.6,...,0.0,0.0,1.0,34.0,53.0,35.0,71.0,22.0,88.0,0
824,13956,0.0,0.0,0.0,200.0,639064.2,635662.3,458298.0,823055.2,297860.6,...,0.0,0.0,1.0,34.0,53.0,35.0,71.0,22.0,88.0,0
825,13968,0.0,0.0,137091.9,288381.6,437935.5,416773.2,287820.5,536427.9,172790.8,...,0.0,10.0,29.0,52.0,56.0,39.0,74.0,23.0,95.0,0
