In [1]:
import pandas as pd
import numpy as np
import datetime as dt 
from collections import Counter

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Build the working transaction table: approved 'L2D' rows
# (raw TransactionType 'LOYALTYCARDDEBIT') up to the cutoff, reduced to
# customer / timestamp / amount and ordered per customer by time.
type_codes = {
    'LOYALTYCARDDEBIT': 'L2D',
    'LOYALTYCARDCREDITCL': 'L1D',
    'LOYALTYCARDCREDIT': 'L2W',
}
kept_columns = ['AccountIdentifier', 'ReqTimeUTC', 'TransactionAmount']
short_names = {
    'AccountIdentifier': 'customer',
    'ReqTimeUTC': 'timest',
    'TransactionAmount': 'amount',
}

casino = (
    pd.read_csv('Online_casino_DIB.csv')
    .assign(ReqTimeUTC=lambda frame: pd.to_datetime(frame['ReqTimeUTC']))
    # NOTE(review): the string cutoff is parsed as 2020-02-29 00:00:00, so
    # transactions later on Feb 29 are dropped -- confirm this is intended.
    .loc[lambda frame: frame['ReqTimeUTC'] <= '2020-02-29']
    # Types missing from the mapping become NaN and fail the filter below.
    .assign(TransactionType=lambda frame: frame['TransactionType'].map(type_codes))
    .loc[lambda frame: (frame['TransactionType'] == 'L2D') & (frame['Status'] == 'APPROVED')]
    [kept_columns]
    .sort_values(['AccountIdentifier', 'ReqTimeUTC'])
    .reset_index(drop=True)
    .rename(columns=short_names)
)
casino.head()

Unnamed: 0,customer,timest,amount
0,customer1,2019-03-01 17:24:43+00:00,30.0
1,customer1,2019-03-01 19:54:04+00:00,50.0
2,customer1,2019-03-01 22:44:15+00:00,40.0
3,customer1,2019-03-02 02:44:20+00:00,100.0
4,customer1,2019-03-02 18:28:39+00:00,25.0


In [3]:
# (rows, columns) of the filtered transaction table.
casino.shape

(69734, 3)

### Problem framing
1. Will a customer make at least one purchase in the next three days?
2. Is this a time-series, supervised, or unsupervised learning problem?
3. Which algorithms would be suitable for solving the problem?
4. What data do we need to solve this problem?

### Feature engineering

In [4]:
# Show the transaction table; pandas truncates the display to the first and
# last rows, so this is a quick look at both ends of the sorted data.
casino

Unnamed: 0,customer,timest,amount
0,customer1,2019-03-01 17:24:43+00:00,30.0
1,customer1,2019-03-01 19:54:04+00:00,50.0
2,customer1,2019-03-01 22:44:15+00:00,40.0
3,customer1,2019-03-02 02:44:20+00:00,100.0
4,customer1,2019-03-02 18:28:39+00:00,25.0
...,...,...,...
69729,customer997,2019-09-22 00:56:47+00:00,50.0
69730,customer998,2019-11-09 07:03:51+00:00,25.0
69731,customer998,2019-11-09 07:05:05+00:00,25.0
69732,customer999,2019-03-20 10:58:04+00:00,250.0


In [5]:
# Calendar day of each transaction as an ISO 'YYYY-MM-DD' string, derived
# from the transaction timestamp; used as the per-day grouping key.
day_format = '%Y-%m-%d'
casino['date'] = casino['timest'].dt.strftime(day_format)

In [6]:
# One row per (customer, date) summarising that day's transactions:
#   daily_sum - total amount for the day
#   tr_avg    - mean amount per transaction for the day
#   tr_count  - number of transactions (non-null timestamps) for the day
daily_activity = (
    casino
    .groupby(['customer', 'date'])
    .agg(
        daily_sum=('amount', 'sum'),
        # Built-in 'mean' replaces the hand-rolled `x.sum() / x.count()`
        # lambda: same NaN-skipping semantics, but it runs on pandas'
        # vectorised groupby path instead of one Python call per group.
        tr_avg=('amount', 'mean'),
        tr_count=('timest', 'count'),
    )
    .reset_index()
)
daily_activity

Unnamed: 0,customer,date,daily_sum,tr_avg,tr_count
0,customer1,2019-03-01,120.0,40.000000,3
1,customer1,2019-03-02,125.0,62.500000,2
2,customer1,2019-03-03,200.0,100.000000,2
3,customer1,2019-03-04,160.0,53.333333,3
4,customer1,2019-03-05,240.0,80.000000,3
...,...,...,...,...,...
28600,customer996,2019-03-14,25.0,25.000000,1
28601,customer996,2019-09-13,25.0,12.500000,2
28602,customer997,2019-09-22,50.0,50.000000,1
28603,customer998,2019-11-09,50.0,25.000000,2


In [7]:
# Continuous daily calendar covering the whole observation window.
# 'date' holds ISO 'YYYY-MM-DD' strings, so the lexical min/max are also the
# earliest/latest days; pd.date_range parses them and defaults to daily steps.
first_day = daily_activity['date'].min()
last_day = daily_activity['date'].max()
full_range = pd.date_range(start=first_day, end=last_day)
