# Features Generation

In [1]:
# Imports
import pandas as pd

In [2]:
# Load the transactions table; `post_ts` is parsed into datetimes at read time.

# The two profile tables below are left disabled (not used by the cells shown here).
# customer_profiles_table = pd.read_csv("./data/customer_profiles_table.csv")
# terminal_profiles_table = pd.read_csv("./data/terminal_profiles_table.csv")
transactions_df = pd.read_csv(
    "./data/transactions_df.csv",
    parse_dates=['post_ts'],
)


In [3]:
def is_weekend(tx_datetime):
    """Flag transactions that happen on a Saturday or a Sunday.

    Parameters
    ----------
    tx_datetime : object exposing ``weekday()``
        Timestamp of the transaction; ``weekday()`` is expected to follow the
        ``datetime`` convention (0 is Monday, 6 is Sunday).

    Returns
    -------
    int
        1 if the day index is 5 or 6, 0 otherwise.
    """
    # Day indices 5 and 6 are Saturday and Sunday
    return 1 if tx_datetime.weekday() >= 5 else 0

def is_night(tx_datetime):
    """Flag transactions that happen during the night.

    Parameters
    ----------
    tx_datetime : object exposing an ``hour`` attribute
        Timestamp of the transaction.

    Returns
    -------
    int
        1 if the hour is 6 or lower, 0 otherwise.
    """
    # The bound is inclusive: any time from 00:00 up to 06:59 counts as night
    return 1 if tx_datetime.hour <= 6 else 0

def get_customer_spending_behaviour_features(customer_transactions, windows_size_in_days=[1,7,30]):
    """Add rolling count / average-amount features to one customer's transactions.

    For every window size ``w`` two columns are appended:

    * ``customer_id_nb_tx_<w>day_window``: number of transactions in the
      trailing ``w``-day window (rolling count of ``amt``),
    * ``customer_id_avg_amount_<w>day_window``: rolling sum of ``amt`` divided
      by that count.

    Parameters
    ----------
    customer_transactions : pandas.DataFrame
        Must provide the ``post_ts``, ``amt`` and ``transaction_id`` columns.
        Presumably holds the rows of a single customer; the function itself
        does no grouping.
    windows_size_in_days : iterable of int
        Window lengths, in days.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by ``post_ts``, indexed by ``transaction_id``, with the
        new feature columns.
    """
    # sort_values hands back a new frame, so the caller's frame is not modified
    txs = customer_transactions.sort_values('post_ts')

    # Time-based rolling windows need the timestamps as the index
    txs.index = txs.post_ts

    for days in windows_size_in_days:
        amount_window = txs['amt'].rolling(f'{days}d')

        # The window contains the current transaction, so the count is
        # positive whenever the current amount is not missing
        tx_count = amount_window.count()
        mean_amount = amount_window.sum() / tx_count

        # Positional assignment (raw values) avoids any index alignment
        txs[f'customer_id_nb_tx_{days}day_window'] = tx_count.values
        txs[f'customer_id_avg_amount_{days}day_window'] = mean_amount.values

    # Switch the index over to the transaction identifiers
    txs.index = txs.transaction_id

    return txs

def get_count_risk_rolling_window(terminal_transactions, delay_period=7, windows_size_in_days=[1,7,30], feature="terminal_id"):
    """Add delayed rolling transaction-count and fraud-risk features.

    For every window size ``w`` the function takes the rolling aggregates of
    the ``fraud`` column over the trailing ``delay_period + w`` days and
    subtracts the aggregates over the trailing ``delay_period`` days. What is
    left describes the ``w``-day span that precedes the delay period. Two
    columns are appended per window:

    * ``<feature>_nb_tx_<w>day_window``: number of transactions in that span,
    * ``<feature>_risk_<w>day_window``: number of frauds in that span divided
      by the number of transactions in it (0 when the span is empty).

    Parameters
    ----------
    terminal_transactions : pandas.DataFrame
        Must provide the ``post_ts``, ``fraud`` and ``transaction_id`` columns.
        Presumably holds the rows of a single terminal; the function itself
        does no grouping.
    delay_period : int
        Length, in days, of the most recent period excluded from the features.
    windows_size_in_days : iterable of int
        Window lengths, in days.
    feature : str
        Prefix used when naming the generated columns.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by ``post_ts``, indexed by ``transaction_id``, with the
        new feature columns. Missing values are replaced by 0 in the generated
        columns only; every other column is returned untouched.
    """
    # sort_values hands back a new frame, so the caller's frame is not modified
    terminal_transactions=terminal_transactions.sort_values('post_ts')

    # Time-based rolling windows need the timestamps as the index
    terminal_transactions.index=terminal_transactions.post_ts

    fraud_flags=terminal_transactions['fraud']

    # Aggregates over the delay period alone; loop-invariant
    delay_rolling=fraud_flags.rolling(str(delay_period)+'d')
    NB_FRAUD_DELAY=delay_rolling.sum()
    NB_TX_DELAY=delay_rolling.count()

    # Names of the columns created here, so that only those get their NaN filled
    generated_columns=[]

    for window_size in windows_size_in_days:

        delay_window_rolling=fraud_flags.rolling(str(delay_period+window_size)+'d')
        NB_FRAUD_DELAY_WINDOW=delay_window_rolling.sum()
        NB_TX_DELAY_WINDOW=delay_window_rolling.count()

        # Removing the delay-period aggregates leaves the window before the delay
        NB_FRAUD_WINDOW=NB_FRAUD_DELAY_WINDOW-NB_FRAUD_DELAY
        NB_TX_WINDOW=NB_TX_DELAY_WINDOW-NB_TX_DELAY

        # 0/0 yields NaN when the window holds no transaction; handled below
        RISK_WINDOW=NB_FRAUD_WINDOW/NB_TX_WINDOW

        nb_tx_column=feature+'_nb_tx_'+str(window_size)+'day_window'
        risk_column=feature+'_risk_'+str(window_size)+'day_window'

        terminal_transactions[nb_tx_column]=list(NB_TX_WINDOW)
        terminal_transactions[risk_column]=list(RISK_WINDOW)

        generated_columns.extend([nb_tx_column, risk_column])

    terminal_transactions.index=terminal_transactions.transaction_id

    # Replace NA values with 0 (undefined risk scores where NB_TX_WINDOW is 0).
    # Restricted to the generated columns: a frame-wide fillna would also
    # overwrite genuinely missing values in unrelated columns.
    if generated_columns:
        terminal_transactions[generated_columns]=terminal_transactions[generated_columns].fillna(0)

    return terminal_transactions


In [4]:
%time
transactions_df['during_weekend']=transactions_df.post_ts.apply(is_weekend)
transactions_df['during_night']=transactions_df.post_ts.apply(is_night)
transactions_df=transactions_df.groupby('customer_id').apply(lambda x: get_customer_spending_behaviour_features(x, windows_size_in_days=[1,7,30]))
transactions_df=transactions_df.sort_values('post_ts').reset_index(drop=True)
transactions_df=transactions_df.groupby('terminal_id').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature="terminal_id"))
transactions_df=transactions_df.sort_values('post_ts').reset_index(drop=True)


CPU times: user 1 µs, sys: 2 µs, total: 3 µs
Wall time: 5.25 µs


In [5]:
# Save data with new features
%time
transactions_df.to_csv("./data/transactions_enrich_df.csv", index=False)

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 4.29 µs
