In [1]:
import pandas as pd
import numpy as np
import datetime as dt 
from collections import Counter

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [42]:
# Load the raw transaction log and reduce it to approved 'L2D' transactions
# (raw code LOYALTYCARDDEBIT) with a request time up to the 2020-02-29 cutoff,
# keeping one row per transaction ordered by customer and then by time.
casino = (
    pd.read_csv('Online_casino_DIB.csv')
    # Parse the request timestamp so it can be compared and sorted as a datetime.
    .assign(ReqTimeUTC=lambda df: pd.to_datetime(df['ReqTimeUTC']))
    .loc[lambda df: df['ReqTimeUTC'] <= '2020-02-29']
    # Replace the raw transaction-type codes with short labels; any code that
    # is not in the mapping becomes NaN and is removed by the next filter.
    .assign(
        TransactionType=lambda df: df['TransactionType'].map(
            {'LOYALTYCARDDEBIT':'L2D', 'LOYALTYCARDCREDITCL':'L1D', 'LOYALTYCARDCREDIT':'L2W'}
        )
    )
    .loc[lambda df: (df['TransactionType'] == 'L2D') & (df['Status'] == 'APPROVED')]
    [['AccountIdentifier', 'ReqTimeUTC', 'TransactionAmount']]
    .sort_values(['AccountIdentifier', 'ReqTimeUTC'])
    .reset_index(drop=True)
    .rename(columns={'AccountIdentifier': 'customer', 'ReqTimeUTC':'timest', 'TransactionAmount': 'amount'})
)
casino.head()

Unnamed: 0,customer,timest,amount
0,customer1,2019-03-01 17:24:43+00:00,30.0
1,customer1,2019-03-01 19:54:04+00:00,50.0
2,customer1,2019-03-01 22:44:15+00:00,40.0
3,customer1,2019-03-02 02:44:20+00:00,100.0
4,customer1,2019-03-02 18:28:39+00:00,25.0


### Problem framing
1. Will a customer make at least one purchase in the next three days?
2. Is this a time-series, supervised, or unsupervised learning problem?
3. Which algorithms would be suitable for solving the problem?
4. What data do we need to solve this problem?

### Feature engineering

In [43]:
# Add a 'date' column holding each transaction's timestamp truncated to
# midnight, so transactions can be grouped by calendar day.
casino = casino.assign(date=casino['timest'].dt.normalize())

In [44]:
# Preview the first five rows to confirm the new 'date' column is present.
casino.head(n=5)

Unnamed: 0,customer,timest,amount,date
0,customer1,2019-03-01 17:24:43+00:00,30.0,2019-03-01 00:00:00+00:00
1,customer1,2019-03-01 19:54:04+00:00,50.0,2019-03-01 00:00:00+00:00
2,customer1,2019-03-01 22:44:15+00:00,40.0,2019-03-01 00:00:00+00:00
3,customer1,2019-03-02 02:44:20+00:00,100.0,2019-03-02 00:00:00+00:00
4,customer1,2019-03-02 18:28:39+00:00,25.0,2019-03-02 00:00:00+00:00


In [45]:
# Collapse individual transactions into one row per customer per day:
#   daily_sum - sum of `amount` over that customer's transactions on that date
#   tr_avg    - mean `amount` per transaction on that date
#   tr_count  - number of transactions (non-null `timest` values) on that date
# The built-in 'mean' aggregation is used instead of a hand-written
# sum()/count() lambda: both compute the mean of the non-null amounts, but the
# built-in avoids calling a Python function once per group.
daily_activity = (
    casino.groupby(['customer', 'date'])
    .agg(
        daily_sum=('amount', 'sum'),
        tr_avg=('amount', 'mean'),
        tr_count=('timest', 'count'),
    )
    .reset_index()
)
daily_activity.head()

Unnamed: 0,customer,date,daily_sum,tr_avg,tr_count
0,customer1,2019-03-01 00:00:00+00:00,120.0,40.0,3
1,customer1,2019-03-02 00:00:00+00:00,125.0,62.5,2
2,customer1,2019-03-03 00:00:00+00:00,200.0,100.0,2
3,customer1,2019-03-04 00:00:00+00:00,160.0,53.333333,3
4,customer1,2019-03-05 00:00:00+00:00,240.0,80.0,3


In [56]:
# Expand each customer's activity to a gap-free daily calendar running from
# their first to their last active date. Days without any transaction are
# inserted with 0 for every metric column.
#
# The per-customer frames are collected in a list and concatenated once at the
# end; calling pd.concat inside the loop would re-copy the accumulated frame
# on every iteration.
customer_frames = []
# sort=False keeps customers in order of first appearance, which is the order
# daily_activity['customer'].unique() would give.
for customer_id, customer_df in daily_activity.groupby('customer', sort=False):
    # set_index returns a new frame, so the group slice is never mutated.
    customer_df = customer_df.set_index('date')
    # name='date' keeps the index labelled 'date' after reindexing; reindexing
    # against an unnamed Index would drop the name.
    full_range = pd.date_range(
        customer_df.index.min(), customer_df.index.max(), name='date'
    )
    customer_frames.append(
        customer_df
        .reindex(full_range, fill_value=0)
        # fill_value=0 also lands in 'customer' on the inserted days, so the
        # id is written back over the whole column.
        .assign(customer=customer_id)
    )

# pd.concat raises on an empty list; fall back to an empty frame when there
# are no customers at all.
full_customer_df = pd.concat(customer_frames) if customer_frames else pd.DataFrame()
full_customer_df


Unnamed: 0_level_0,customer,daily_sum,tr_avg,tr_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-01 00:00:00+00:00,customer1,120.0,40.000000,3
2019-03-02 00:00:00+00:00,customer1,125.0,62.500000,2
2019-03-03 00:00:00+00:00,customer1,200.0,100.000000,2
2019-03-04 00:00:00+00:00,customer1,160.0,53.333333,3
2019-03-05 00:00:00+00:00,customer1,240.0,80.000000,3
...,...,...,...,...
2019-09-12 00:00:00+00:00,customer996,0.0,0.000000,0
2019-09-13 00:00:00+00:00,customer996,25.0,12.500000,2
2019-09-22 00:00:00+00:00,customer997,50.0,50.000000,1
2019-11-09 00:00:00+00:00,customer998,50.0,25.000000,2
