# Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

In [3]:
import pathlib

In [4]:
from tqdm import tqdm

In [5]:
import pickle

In [6]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [9]:
from sklearn import metrics

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
from sklearn.model_selection import RandomizedSearchCV

In [12]:
from sklearn.model_selection import StratifiedKFold

In [168]:
import lightgbm

In [220]:
from sklearn.preprocessing import StandardScaler

# Data pre-processing

## Loading data

In [13]:
DATA_DIR = './Data'

In [14]:
campaign_data_df = pd.read_csv(os.path.join(DATA_DIR, 'campaign_data.csv'), parse_dates=['start_date','end_date'], dayfirst=True)


In [15]:
coupon_item_map_df = pd.read_csv(os.path.join(DATA_DIR, 'coupon_item_mapping.csv'))


In [16]:
customer_demographics_df = pd.read_csv(os.path.join(DATA_DIR, 'customer_demographics.csv'))

In [17]:
customer_transaction_data_df = pd.read_csv(os.path.join(DATA_DIR, 'customer_transaction_data.csv'))

In [18]:
item_data_df = pd.read_csv(os.path.join(DATA_DIR, 'item_data.csv'))

In [19]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [20]:
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_QyjYwdj.csv'))

In [21]:
sample_submission_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission_Byiv0dS.csv'))

## Exploration

### Campaign Data

In [22]:
campaign_data_df.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,2013-10-21,2013-12-20
1,25,Y,2013-10-21,2013-11-22
2,20,Y,2013-09-07,2013-11-16
3,23,Y,2013-10-08,2013-11-15
4,21,Y,2013-09-16,2013-10-18


In [23]:
campaign_data_df['campaign_type'].unique()

array(['Y', 'X'], dtype=object)

In [24]:
campaign_data_df.shape

(28, 4)

In [25]:
campaign_data_df['campaign_id'].unique()

array([24, 25, 20, 23, 21, 22, 18, 19, 17, 16, 13, 11, 12, 10,  9,  8,  7,
        6,  3,  5,  4,  1,  2, 30, 29, 28, 27, 26])

In [26]:
campaign_data_df['campaign_id'].nunique()

28

In [27]:
campaign_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null object
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 976.0+ bytes


In [28]:
campaign_data_df['start_date'].min(), campaign_data_df['start_date'].max()

(Timestamp('2012-08-12 00:00:00'), Timestamp('2013-10-21 00:00:00'))

In [29]:
campaign_data_df['end_date'].min(), campaign_data_df['end_date'].max()

(Timestamp('2012-09-21 00:00:00'), Timestamp('2013-12-20 00:00:00'))

In [30]:
campaign_data_df['num_days'] = (campaign_data_df['end_date'] - campaign_data_df['start_date']).dt.days

In [31]:
campaign_data_df.describe()

Unnamed: 0,campaign_id,num_days
count,28.0,28.0
mean,15.571429,41.857143
std,9.118271,11.958924
min,1.0,32.0
25%,7.75,32.0
50%,16.5,35.5
75%,23.25,49.25
max,30.0,70.0


#### Observation 

- 28 different campaigns, each of either type X or type Y. 
- These campaigns are between 10th August 2012 to 1st December of 2013
- The end dates vary from 21st September 2012 to 20th December 2013
- Average number of days a campaign lasts is aobut 41 days ; Minimum is 32 days ; Maximum is about 70 days!

### Coupon-Item Data 

In [32]:
coupon_item_map_df.head()

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [33]:
# verification for below
coupon_item_map_df[coupon_item_map_df['coupon_id'] == 1].shape

(39, 2)

In [34]:
items_gby_coupon_df = coupon_item_map_df.groupby('coupon_id')[['item_id']].count()

In [35]:
items_gby_coupon_df.head()

Unnamed: 0_level_0,item_id
coupon_id,Unnamed: 1_level_1
1,39
2,2
3,17
4,24
5,7


In [36]:
items_gby_coupon_df.sort_values(by='item_id', ascending=False).head()

Unnamed: 0_level_0,item_id
coupon_id,Unnamed: 1_level_1
32,11814
23,11813
22,4947
31,4873
24,4394


In [37]:
# verify that all are unique
coupon_item_map_df[coupon_item_map_df['coupon_id'] == 32][['item_id']].nunique()

item_id    11814
dtype: int64

In [38]:
items_gby_coupon_df.describe()

Unnamed: 0,item_id
count,1116.0
mean,83.031362
std,633.143339
min,1.0
25%,5.0
50%,10.0
75%,24.0
max,11814.0


In [39]:
items_gby_coupon_df.shape

(1116, 1)

--------

In [40]:
coupon_groupby_items_df = coupon_item_map_df.groupby('item_id')[['coupon_id']].count()

In [41]:
coupon_groupby_items_df.head()

Unnamed: 0_level_0,coupon_id
item_id,Unnamed: 1_level_1
1,2
4,3
7,2
10,4
12,2


In [42]:
# verification for below
coupon_item_map_df[coupon_item_map_df['item_id'] == 4].shape

(3, 2)

In [43]:
coupon_groupby_items_df.head()

Unnamed: 0_level_0,coupon_id
item_id,Unnamed: 1_level_1
1,2
4,3
7,2
10,4
12,2


In [44]:
# verify that all are unique
coupon_item_map_df[coupon_item_map_df['item_id'] == 10][['coupon_id']].nunique()

coupon_id    4
dtype: int64

In [45]:
coupon_groupby_items_df.shape

(36289, 1)

In [46]:
coupon_groupby_items_df.describe()

Unnamed: 0,coupon_id
count,36289.0
mean,2.553474
std,1.166767
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,11.0


#### Observations

- Some coupons have as many as 11800 items under them
- Average items a coupon has are 83
- There are a total of 36289 items which get featured in coupons

- Some items appear in as 11 coupons
- Average item gets featured in 2-3 coupons
- Total number of coupons are 1116

### Customer Demographics Data

In [47]:
customer_demographics_df = pd.read_csv(os.path.join(DATA_DIR, 'customer_demographics.csv'))

In [48]:
customer_demographics_df.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [49]:
customer_demographics_df['age_range'].unique()

array(['70+', '46-55', '26-35', '36-45', '18-25', '56-70'], dtype=object)

In [50]:
customer_demographics_df['marital_status'].unique()

array(['Married', nan, 'Single'], dtype=object)

In [51]:
customer_demographics_df.isnull().sum()

customer_id         0
age_range           0
marital_status    329
rented              0
family_size         0
no_of_children    538
income_bracket      0
dtype: int64

In [52]:
customer_demographics_df.shape

(760, 7)

In [53]:
customer_demographics_df['rented'].unique()

array([0, 1])

In [54]:
customer_demographics_df['family_size'].unique()

array(['2', '3', '4', '1', '5+'], dtype=object)

In [55]:
customer_demographics_df[customer_demographics_df['rented'] == 0].shape

(719, 7)

In [56]:
customer_demographics_df[customer_demographics_df['rented'] == 1].shape

(41, 7)

In [57]:
customer_demographics_df['income_bracket'].unique()

array([ 4,  5,  3,  6,  1,  7,  2,  8,  9, 12, 10, 11])

#### Observations

- 760 customers' data is there
- Age ranges from 18 to 70+ 
- Age range is categorical but there's an inherent order so we need to change it to an ordered variable
- Marital status has NaN for many
- Very few customers have rented recommendations. Most have it bought! No particular correlation of such folks with their family size or no_of_children as such
- Income bracket goes from 1 to 11. 11 being highest!
- Family size is 1,2,3,4,5+ --- hence categorical


#### Actions
- Family size mein 5+ can be changed to just 5. So, it can be treated as an ordial number
- Try removing marital status from data? Does it affect negatively?
- Try removing no_of_children from data? Does it affect negatively?
- Somehow incorporate both and see

### Customer Transaction Data

In [58]:
customer_transaction_df = pd.read_csv(os.path.join(DATA_DIR, 'customer_transaction_data.csv'), parse_dates=['date'], dayfirst=True)

In [59]:
customer_transaction_df.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [60]:
customer_transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
date               1324566 non-null datetime64[ns]
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 70.7 MB


In [61]:
customer_transaction_df['date'].min(), customer_transaction_df['date'].max()

(Timestamp('2012-01-02 00:00:00'), Timestamp('2013-07-03 00:00:00'))

In [62]:
customer_transaction_df['customer_id'].nunique()

1582

In [63]:
customer_transaction_df['item_id'].nunique()

74063

In [64]:
customer_transaction_df.shape

(1324566, 7)

In [65]:
customer_transaction_gby_customer_df = customer_transaction_df.groupby('customer_id')[['item_id']].count()

In [66]:
# most shopaholic users
customer_transaction_gby_customer_df.sort_values(by='item_id', ascending=False)

Unnamed: 0_level_0,item_id
customer_id,Unnamed: 1_level_1
1555,4522
711,4173
464,4134
1475,3945
1011,3862
911,3600
1466,3443
627,3429
60,3328
898,3248


In [67]:
customer_transaction_gby_customer_df.shape

(1582, 1)

In [68]:
# popular items
customer_transaction_gby_item_df = customer_transaction_df.groupby('item_id')[['customer_id']].count()

In [69]:
customer_transaction_gby_item_df.sort_values(by='customer_id', ascending=False)

Unnamed: 0_level_0,customer_id
item_id,Unnamed: 1_level_1
49009,13540
34047,6308
13174,3879
45502,3292
29847,2276
18156,2226
28633,2152
32991,2101
9281,2007
8525,1922


In [70]:
customer_transaction_gby_item_df.shape

(74063, 1)

---------------
---------------

In [71]:
customer_transactions_with_some_kind_of_discounts_df = customer_transaction_df[(customer_transaction_df['coupon_discount'] < 0) | (customer_transaction_df['other_discount'] < 0)]

In [72]:
customer_transactions_with_some_kind_of_discounts_df.shape

(679316, 7)

In [73]:
customer_transaction_with_some_kind_of_discounts_gby_customer_df = customer_transactions_with_some_kind_of_discounts_df.groupby('customer_id')[['item_id']].count()

In [74]:
# most shopaholic users
customer_transaction_with_some_kind_of_discounts_gby_customer_df.sort_values(by='item_id', ascending=False)

Unnamed: 0_level_0,item_id
customer_id,Unnamed: 1_level_1
464,3397
1555,2647
934,1901
1115,1898
60,1889
1558,1888
1463,1818
150,1794
626,1720
1485,1654


In [75]:
customer_transaction_with_some_kind_of_discounts_gby_customer_df.shape

(1582, 1)

In [76]:
# popular items
customer_transaction_with_some_kind_of_discounts_gby_item_df = customer_transactions_with_some_kind_of_discounts_df.groupby('item_id')[['customer_id']].count()

In [77]:
customer_transaction_with_some_kind_of_discounts_gby_item_df.sort_values(by='customer_id', ascending=False)

Unnamed: 0_level_0,customer_id
item_id,Unnamed: 1_level_1
49009,13534
45502,2845
32991,1944
28633,1925
6958,1865
8525,1759
18156,1552
32573,1456
53517,1450
30518,1402


In [78]:
customer_transaction_with_some_kind_of_discounts_gby_item_df.shape

(38292, 1)

---------------
---------------

In [79]:
customer_transactions_with_discounts_df = customer_transaction_df[customer_transaction_df['coupon_discount'] < 0]

In [80]:
customer_transactions_with_discounts_df.shape

(21286, 7)

In [81]:
customer_transactions_with_discounts_gby_customer_df = customer_transactions_with_discounts_df.groupby('customer_id')[['item_id']].count()

In [82]:
# most shopaholic users
customer_transactions_with_discounts_gby_customer_df.sort_values(by='item_id', ascending=False)

Unnamed: 0_level_0,item_id
customer_id,Unnamed: 1_level_1
1574,430
748,345
367,293
1547,227
464,220
566,219
235,210
1208,203
1549,197
8,192


In [83]:
customer_transactions_with_discounts_gby_customer_df.shape

(1249, 1)

In [84]:
# popular items
customer_transactions_with_discounts_gby_item_df = customer_transactions_with_discounts_df.groupby('item_id')[['customer_id']].count()

In [85]:
customer_transactions_with_discounts_gby_item_df.sort_values(by='customer_id', ascending=False)

Unnamed: 0_level_0,customer_id
item_id,Unnamed: 1_level_1
45502,187
45539,117
53521,74
53517,69
45578,61
5984,58
22631,57
14764,56
45527,50
28289,49


In [86]:
customer_transactions_with_discounts_gby_item_df.shape

(7349, 1)

#### Observations

- ~1600 customers involved in transactions ; ~74000 items being purchased
- Some customers have done as much as 1500 transactions vs minimum number of transactions are around 42 by a customer (can become a feature)
- Some items have been bought as much as 13000 times, versus some which are bought only once (can become a feature)
- Distribution of above two things change a bit if we just consider transactions which had `other_discount` or `coupon_discount` (so we'll need features based on these as well)
- About 13 lac transactions, out of which 6.8 lacs had some `other_discount` and only ~`22000` had `coupon_discount`. Of these that had coupon discount also not all were coupon that are there in our training data. Some are retailer coupons which we lack date for (got to know this from slack channel)
                                                                                                                                

### Item Data

In [87]:
item_data_df.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [88]:
item_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74066 entries, 0 to 74065
Data columns (total 4 columns):
item_id       74066 non-null int64
brand         74066 non-null int64
brand_type    74066 non-null object
category      74066 non-null object
dtypes: int64(2), object(2)
memory usage: 2.3+ MB


In [89]:
item_data_df['item_id'].nunique()

74066

In [90]:
item_data_df['brand'].nunique()

5528

In [91]:
item_data_df['brand_type'].unique()

array(['Established', 'Local'], dtype=object)

In [92]:
item_data_df['category'].unique()

array(['Grocery', 'Miscellaneous', 'Bakery', 'Pharmaceutical',
       'Packaged Meat', 'Seafood', 'Natural Products',
       'Dairy, Juices & Snacks', 'Prepared Food', 'Skin & Hair Care',
       'Meat', 'Travel', 'Flowers & Plants', 'Fuel', 'Salads', 'Alcohol',
       'Garden', 'Restauarant', 'Vegetables (cut)'], dtype=object)

In [93]:
item_data_df[item_data_df['brand']==56]['category'].unique()

array(['Bakery', 'Grocery', 'Pharmaceutical', 'Packaged Meat', 'Seafood',
       'Natural Products', 'Prepared Food', 'Skin & Hair Care',
       'Dairy, Juices & Snacks', 'Meat', 'Flowers & Plants',
       'Miscellaneous', 'Fuel'], dtype=object)

In [94]:
item_data_df[item_data_df['brand']==56]['brand_type'].unique()

array(['Local'], dtype=object)

In [95]:
item_data_df_gby_brand = item_data_df.groupby('brand')[['item_id']].count()

In [96]:
item_data_df_gby_brand.sort_values(by='item_id', ascending=False)

Unnamed: 0_level_0,item_id
brand,Unnamed: 1_level_1
56,10480
686,1142
1,1091
4700,923
1262,906
487,735
619,717
946,626
4467,607
1124,594


#### Observations

- If you look at items it looks like some big basket kinda shop which has these items
- The brand 56 is a local brand which has enormously large set of items and of different types. It seems like some big shot.
- Other than 56, highest number of items a brand has is about 1100, while lowest is 1.

### Train data

In [97]:
train_df.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [98]:
train_df.shape

(78369, 5)

In [99]:
train_when_redeemed_df = train_df[train_df['redemption_status'] == 1]

In [100]:
train_when_redeemed_df.shape

(729, 5)

In [101]:
train_df['customer_id'].nunique()

1428

In [102]:
train_when_redeemed_df['customer_id'].nunique()

247

In [103]:
train_df['coupon_id'].nunique()

866

In [104]:
train_when_redeemed_df['coupon_id'].nunique()

239

In [105]:
train_df['campaign_id'].nunique()

18

In [106]:
train_when_redeemed_df['campaign_id'].nunique()

18

#### Observations

- Highly imbalanced. 99% of training data redemption doesn't happen
- Only 247 customers of 1800 have redeemed couoon
- Only 239 coupons of 866 have been redeemed
- All campaigns exist in both redeemed and non-redeemed data

## Pre-processing

### Considering it to be a binary classification problem

We need a set of features which can help. 

Set of features to extract from above exploration:
    
**campaign data**
- get number of days of campaign
- get month of campaign
- keep start date and end date as is
- convert X and Y to one-hot 

**coupon-item mapping**
- coupon id's number of items offered (how global kinda is the coupon)
- for item id -- number of coupons available on it (will be based on date though)
- for item id -- whether it gets featured in any coupon at all (again based on date0
- coupon id to one-hot of itmems ?? (so depending upon which exact items are in discount might get redeemed)

**customer demographics**
- Data for 760 customers is there. Need to check how many of these occur in customer transaction and how many in train data
- age_range needs to change to ordinal feature - 1,2,3,4,5 or so. Via a dict.
- marital_status has NaN for many. Keep three possible values - married, single, and unknown. So, three features.
- Income bracket is already ordinal
- Family size mein 5+ can be replaced with 5 and then int operation can be applied on all
- no_of_children NaN can be changed to 0?


**customer transactions**
- 1600 customers involved in purchasing 74,000 items
- Extract shopaholic customers -- basically number of transactions done by user
- Extract number of times item has been bought as a feature
- Then extract the two above features when other_discount is > 0
- Extract the two above when other_discount and coupon_discount > 0
- Extract the two above when coupon_discount > 0
- About 13 lac transactions, out of which 6.8 lacs had some `other_discount` and only ~`22000` had `coupon_discount`. Of these that had coupon discount also not all were coupon that are there in our training data. Some are retailer coupons which we lack date for (got to know this from slack channel)
                                                   

**item data**
- can't do much here
- about 5500 brands, 2 brand_type and 19 categories
- Finally a coupon has multiple items in it, we can utilise that data to assign percentage of a coupon to each brand, brand_type and category

**train data**  
- Highly skewed
- 18 campaigns exist in both redeemed and non-redeemed time
- Only 239 coupons have been redeemed out of 866
- Only 247 customers of 1800 have redeemed

**test data**
- many customers are common between test and train
- the campaigns that exist in test are completely separate from campaigns in train
- So, we need to make sure to not use campaign id as a feature.


------------------------

### Final decided features 

**from campaign_id**
- campaign_type
- no. of days it ran
- start_date
- end_date
- no_of_coupons it has
- % of brand
- % of local brand
- % of category

**from coupon_id**
- no of items offered
- % of categories
- % of brand
- % of brand_type

**from customer_id**
- age_range (1,2,3,4,5)
- family size
- no of children (0 if NaN)
- rented 
- marital status - three values
- no. of transactions in total
- no. of transactions in campaign_id days
- no of transactions in total when other_discount > 0
- no of transactions in total when other_discount > 0 in campaign id dates
- no of transactions in total when discount_coupon > 0
- no of transactions in total when discount_coupon > 0 in campaign id dates
- no of transactions in total when discount_coupon > 0 and other_discount > 0 
- no of transactions in total when discount_coupon > 0 and other_discount > 0 in campaign id dates
- total quantities bought
- ***predicted* income bracket** 
- how many times he has redeemed coupon before
- no. of times he has redeemed same coupon before
- Total quantities bought % based on categories
- Total quantities bought % based on brand
- Total quantities bought % based on brand_type
- Average % discount for customer_id

In [130]:
coupon_item_map_df.head()

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [131]:
item_data_df.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [132]:
item_brand_details_df = item_data_df[['item_id','brand']]

In [133]:
item_brand_details_df.head(2)

Unnamed: 0,item_id,brand
0,1,1
1,2,1


In [134]:
item_brand_details_df = pd.concat([item_brand_details_df, pd.get_dummies(item_brand_details_df['brand'], prefix = 'brand')], axis=1)


In [135]:
del item_brand_details_df['brand']

In [136]:
item_brand_details_df.shape

(74066, 5529)

In [137]:
item_brand_details_df.head(2)

Unnamed: 0,item_id,brand_1,brand_2,brand_3,brand_4,brand_5,brand_6,brand_7,brand_8,brand_9,...,brand_5519,brand_5520,brand_5521,brand_5522,brand_5523,brand_5524,brand_5525,brand_5526,brand_5527,brand_5528
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
item_brand_details_df = item_brand_details_df.set_index('item_id')

In [139]:
item_brand_details_df.shape

(74066, 5528)

In [140]:
item_brand_details_df.head(2)

Unnamed: 0_level_0,brand_1,brand_2,brand_3,brand_4,brand_5,brand_6,brand_7,brand_8,brand_9,brand_10,...,brand_5519,brand_5520,brand_5521,brand_5522,brand_5523,brand_5524,brand_5525,brand_5526,brand_5527,brand_5528
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [141]:
item_brand_np = item_brand_details_df.to_numpy()

In [142]:
item_ids_idx_list = item_brand_details_df.index.tolist()

In [143]:
item_brand_np.shape

(74066, 5528)

In [109]:
# do same for brand type
item_brand_type_df = item_data_df[['item_id','brand_type']]
item_brand_type_df = pd.concat([item_brand_type_df, pd.get_dummies(item_brand_type_df['brand_type'], prefix = 'brand_type')], axis=1)
del item_brand_type_df['brand_type']
item_brand_type_df = item_brand_type_df.set_index('item_id')
item_brand_type_np = item_brand_type_df.to_numpy()

In [110]:
item_brand_type_df.head(3)

Unnamed: 0_level_0,brand_type_Established,brand_type_Local
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,1


In [101]:
item_brand_type_np[0:3]

array([[1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)

In [111]:
# do same for category
item_category_df = item_data_df[['item_id','category']]
item_category_df = pd.concat([item_category_df, pd.get_dummies(item_category_df['category'], prefix = 'category')], axis=1)
del item_category_df['category']
item_category_df = item_category_df.set_index('item_id')
item_category_np = item_category_df.to_numpy()

In [112]:
item_category_df.head(2)

Unnamed: 0_level_0,category_Alcohol,category_Bakery,"category_Dairy, Juices & Snacks",category_Flowers & Plants,category_Fuel,category_Garden,category_Grocery,category_Meat,category_Miscellaneous,category_Natural Products,category_Packaged Meat,category_Pharmaceutical,category_Prepared Food,category_Restauarant,category_Salads,category_Seafood,category_Skin & Hair Care,category_Travel,category_Vegetables (cut)
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [113]:
item_category_np[0:2]

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=uint8)

In [114]:
item_category_np.shape

(74066, 19)

#### From campaign_id  
- campaign_type
- no. of days it ran
- start_date
- end_date
- **no_of_coupons it has** (not taking right now)
- **% of brand**
- **% of local brand**
- **% of category**

In [115]:
campaign_data_df.head(3)

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,num_days
0,24,Y,2013-10-21,2013-12-20,60
1,25,Y,2013-10-21,2013-11-22,32
2,20,Y,2013-09-07,2013-11-16,70


In [116]:
campaign_data_preprocessed_df = campaign_data_df.copy()

In [117]:
campaign_data_preprocessed_df = pd.concat([campaign_data_preprocessed_df, pd.get_dummies(campaign_data_preprocessed_df['campaign_type'], prefix = 'campaign_type')], axis=1)
del campaign_data_preprocessed_df['campaign_type']

In [118]:
campaign_data_preprocessed_df.head(1)

Unnamed: 0,campaign_id,start_date,end_date,num_days,campaign_type_X,campaign_type_Y
0,24,2013-10-21,2013-12-20,60,0,1


In [119]:
campaign_data_preprocessed_df['num_days_campaign'] =  (campaign_data_preprocessed_df['end_date'] - campaign_data_preprocessed_df['start_date']).dt.days

In [120]:
campaign_data_preprocessed_df.head(1)

Unnamed: 0,campaign_id,start_date,end_date,num_days,campaign_type_X,campaign_type_Y,num_days_campaign
0,24,2013-10-21,2013-12-20,60,0,1,60


#### From coupon_id  

- no of items offered
- % of categories
- % of brand
- % of brand_type

In [121]:
coupon_item_map_df.head(2)

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75


In [122]:
coupon_item_gby_coupon_df = coupon_item_map_df.groupby('coupon_id').count()

In [123]:
coupon_item_gby_coupon_df.head()

Unnamed: 0_level_0,item_id
coupon_id,Unnamed: 1_level_1
1,39
2,2
3,17
4,24
5,7


In [124]:
coupon_item_gby_coupon_df = coupon_item_gby_coupon_df.rename(columns={'item_id': 'num_of_items_in_coupon'})

In [125]:
coupon_item_gby_coupon_df.head()

Unnamed: 0_level_0,num_of_items_in_coupon
coupon_id,Unnamed: 1_level_1
1,39
2,2
3,17
4,24
5,7


In [126]:
# unit test
coupon_item_map_df[coupon_item_map_df['coupon_id'] == 4][['item_id']].nunique()

item_id    24
dtype: int64

In [127]:
coupon_item_gby_coupon_df = coupon_item_gby_coupon_df.reset_index()

In [128]:
coupon_item_gby_coupon_df.head()

Unnamed: 0,coupon_id,num_of_items_in_coupon
0,1,39
1,2,2
2,3,17
3,4,24
4,5,7


In [181]:
# add X
coupon_id_list_in_order = coupon_item_gby_coupon_df['coupon_id'].tolist()
brand_mat = []
brand_type_mat = []
category_mat = []
for coupon_id in tqdm(coupon_id_list_in_order):
    list_of_items_in_coupon = coupon_item_map_df[coupon_item_map_df['coupon_id'] == coupon_id][['item_id']].values
    list_of_items_in_coupon = [a[0] for a in list_of_items_in_coupon]
    list_of_items_indices = [(a-1) for a in list_of_items_in_coupon]

    # coupon_brands_arr = item_brand_np[list_of_items_indices]
    # brand_arr = (np.sum(coupon_brands_arr, axis=0)[np.newaxis, :] / np.sum(coupon_brands_arr))
    
    coupon_brand_type_arr = item_brand_type_np[list_of_items_indices]
    brand_type_arr = (np.sum(coupon_brand_type_arr, axis=0)[np.newaxis, :] / np.sum(coupon_brand_type_arr))
    
    coupon_category_arr = item_category_np[list_of_items_indices]
    category_arr = (np.sum(coupon_category_arr, axis=0)[np.newaxis, :] / np.sum(coupon_category_arr))
    
    # brand_mat.append(brand_arr)
    brand_type_mat.append(brand_type_arr)
    category_mat.append(category_arr)

100%|██████████| 1116/1116 [00:02<00:00, 539.47it/s]


In [182]:
brand_mat_np = np.array(brand_mat)
brand_type_mat_np = np.array(brand_type_mat)
category_mat_np = np.array(category_mat)

In [183]:
brand_mat_np = np.squeeze(brand_mat_np)
brand_type_mat_np = np.squeeze(brand_type_mat_np)
category_mat_np = np.squeeze(category_mat_np)


In [186]:
coupon_item_preprocessed_df = pd.concat([coupon_item_gby_coupon_df, pd.DataFrame(brand_type_mat_np), pd.DataFrame(category_mat_np)], axis=1)


In [187]:
coupon_item_preprocessed_df.head()

Unnamed: 0,coupon_id,num_of_items_in_coupon,0,1,0.1,1.1,2,3,4,5,...,9,10,11,12,13,14,15,16,17,18
0,1,39,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.769231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,24,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### From customer_id  

- age_range (1,2,3,4,5)
- family size
- no of children (0 if NaN)
- rented 
- marital status - three values
- no. of transactions in total
- *no. of transactions in campaign_id days*
- no of transactions in total when other_discount > 0
- *no of transactions in total when other_discount > 0 in campaign id dates*
- no of transactions in total when discount_coupon > 0
- *no of transactions in total when discount_coupon > 0 in campaign id dates*
- no of transactions in total when discount_coupon > 0 and other_discount > 0 
- *no of transactions in total when discount_coupon > 0 and other_discount > 0 in campaign id dates*
- **total quantities bought**
- ***predicted* income bracket**
- how many times he has redeemed coupon before
- *no. of times he has redeemed same coupon before*
- **Total quantities bought % based on categories**
- **Total quantities bought % based on brand**
- **Total quantities bought % based on brand_type**
- Average % discount for customer_id

In [149]:
def get_transaction_dicts_for_customer(train_df, customer_transaction_data_df):
    customer_ids_from_train = train_df['customer_id'].tolist()
    customer_ids_from_transactions = customer_transaction_data_df['customer_id'].tolist()
    list_customer_ids = list(set(customer_ids_from_train).union(set(customer_ids_from_transactions)))
    data_for_columns = {
        'customer_id':[],
        'no_of_transactions':[],
        'no_of_transactions_with_odr_discount_gt0':[],
        'no_of_transactions_with_coupon_discount_gt0':[],
        'no_of_transactions_with_both_discount_gt0':[],
        'times_coupon_redeemed_before': [],
        'ever_redeemed_before': [],
        'average_other_discount': [],
        'average_coupon_discount': [],
        'average_combined_discount': [],
    }
    for customer_id in tqdm(list_customer_ids):
        data_for_columns['customer_id'].append(customer_id)
        cus_txn_df = customer_transaction_data_df[customer_transaction_data_df['customer_id']==customer_id]
        # transaction features
        data_for_columns['no_of_transactions'].append(cus_txn_df.shape[0])
        data_for_columns['no_of_transactions_with_odr_discount_gt0'].append(cus_txn_df[cus_txn_df['other_discount'] < 0].shape[0])
        data_for_columns['no_of_transactions_with_coupon_discount_gt0'].append(cus_txn_df[cus_txn_df['coupon_discount'] < 0].shape[0])
        data_for_columns['no_of_transactions_with_both_discount_gt0'].append(cus_txn_df[(cus_txn_df['coupon_discount'] < 0) & (cus_txn_df['other_discount'] < 0)].shape[0])
        data_for_columns['average_other_discount'].append(cus_txn_df['other_discount'].mean())
        data_for_columns['average_coupon_discount'].append(cus_txn_df['coupon_discount'].mean())
        data_for_columns['average_combined_discount'].append((cus_txn_df['coupon_discount'] + cus_txn_df['other_discount']).mean())
        # coupon redemption features
        cus_train_df = train_df[train_df['customer_id'] == customer_id]
        times_redeemed_before = cus_train_df[cus_train_df['redemption_status'] > 0].shape[0]
        data_for_columns['times_coupon_redeemed_before'].append(times_redeemed_before)
        if times_redeemed_before > 0:
            data_for_columns['ever_redeemed_before'].append(1)
        else:
            data_for_columns['ever_redeemed_before'].append(0)

    return pd.DataFrame.from_dict(data_for_columns)

In [150]:
customers_preprocessed_df = get_transaction_dicts_for_customer(train_df, customer_transaction_data_df)

100%|██████████| 1582/1582 [00:12<00:00, 130.56it/s]


In [151]:
customers_preprocessed_df.head()

Unnamed: 0,customer_id,no_of_transactions,no_of_transactions_with_odr_discount_gt0,no_of_transactions_with_coupon_discount_gt0,no_of_transactions_with_both_discount_gt0,times_coupon_redeemed_before,ever_redeemed_before,average_other_discount,average_coupon_discount,average_combined_discount
0,1,1048,497,78,45,2,1,-16.250382,-2.019876,-18.270258
1,2,419,179,4,2,0,0,-16.83043,-0.595084,-17.425513
2,3,705,393,53,28,0,0,-22.714227,-3.091546,-25.805773
3,4,220,85,1,1,0,0,-13.305409,-0.404773,-13.710182
4,5,792,290,2,2,0,0,-13.657917,-0.114684,-13.772601


In [152]:
def get_features_df_related_to_customer_and_coupon_and_campaign(df_to_fill, 
                                                                train_df, 
                                                                customer_transaction_data_df, 
                                                                campaign_data_df):
    
    feature_data_df = df_to_fill.copy()
    feature_data_df['no_of_transactions_in_campaign'] = 0
    feature_data_df['no_of_transactions_in_campaign_with_odr_discount_gt0'] = 0
    feature_data_df['no_of_transactions_in_campaign_with_coupon_discount_gt0'] = 0
    feature_data_df['no_of_transactions_in_campaign_with_both_discount_gt0'] = 0
    feature_data_df['no_of_times_this_coupon_redeemed_before'] = 0    
    
    for iter, row in tqdm(df_to_fill.iterrows(), total=df_to_fill.shape[0]):
        customer_id = row['customer_id']
        coupon_id = row['coupon_id']
        campaign_id = row['campaign_id']
        
        campaign_start_date, campaign_end_date = pd.to_datetime(campaign_data_df[campaign_data_df['campaign_id']==campaign_id][['start_date','end_date']].values[0])
        cus_txn_df = customer_transaction_data_df[(customer_transaction_data_df['customer_id']==customer_id) 
                                                  & (pd.to_datetime(customer_transaction_data_df['date']) >= campaign_start_date)
                                                  & (pd.to_datetime(customer_transaction_data_df['date']) <= campaign_end_date)]
        
        # transactions
        feature_data_df.at[iter,'no_of_transactions_in_campaign'] = cus_txn_df.shape[0]
        feature_data_df.at[iter,'no_of_transactions_in_campaign_with_odr_discount_gt0'] = cus_txn_df[cus_txn_df['other_discount'] < 0].shape[0]
        feature_data_df.at[iter,'no_of_transactions_in_campaign_with_coupon_discount_gt0'] = cus_txn_df[cus_txn_df['coupon_discount'] < 0].shape[0]
        feature_data_df.at[iter,'no_of_transactions_in_campaign_with_both_discount_gt0'] = cus_txn_df[(cus_txn_df['coupon_discount'] < 0) & (cus_txn_df['other_discount'] < 0)].shape[0]
        
        # coupon
        feature_data_df.at[iter,'no_of_times_this_coupon_redeemed_before'] = train_df[(train_df['customer_id'] == customer_id) & (train_df['coupon_id'] == coupon_id) & (train_df['redemption_status'] == 1)].shape[0]
    
    return feature_data_df

In [180]:
customer_local_features_df = get_features_df_related_to_customer_and_coupon_and_campaign(train_df, train_df, customer_transaction_df, campaign_data_df)

100%|██████████| 78369/78369 [23:20<00:00, 55.97it/s]


In [182]:
with open('./Data/customer_train_df_local_features_df.pkl', 'wb') as f:
    pickle.dump(customer_local_features_df, f)

In [153]:
with open('./Data/customer_train_df_local_features_df.pkl', 'rb') as f:
    customer_local_features_df = pickle.load(f)

### Merging Pre-processed Data

- `customers_preprocessed_df` : containing customer_id related features. 1 row for each customer_id
- `customer_local_features_df` : built by adding new columns to train_df / test_df
- `coupon_item_preprocessed_df` : one row for each coupon id. Contains info related to brand %, item % etc.
- `campaign_data_preprocessed_df` : one row for each campaign. 4-5 columns only

Steps :   
1. Build `customer_local_features_df`
2. Merge `customers_preprocessed_df` into it based on `customer_id`. Check that columns should only increase, no rows increase.
3. Merge `coupon_item_preprocessed_df` into it. Only cols should increase.
4. Merge `campaign_data_preprocessed_df` ; only columns should increase.
5. Remove unnecessary fields and use data to train algorithms.


***Later try to also merge customer demographic data. But that's gonna introduce more categorical variables, which most ML algos don't like. However, lightGBM can work with categorical columns!***

In [188]:
# lets build upon train_i_df
train_i_df = customer_local_features_df.copy()

In [189]:
train_i_df.shape

(78369, 10)

In [190]:
train_i_df.head(2)

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,no_of_transactions_in_campaign,no_of_transactions_in_campaign_with_odr_discount_gt0,no_of_transactions_in_campaign_with_coupon_discount_gt0,no_of_transactions_in_campaign_with_both_discount_gt0,no_of_times_this_coupon_redeemed_before
0,1,13,27,1053,0,40,23,1,1,0
1,2,13,116,48,0,31,16,0,0,0


In [191]:
train_i_df = train_i_df.merge(customers_preprocessed_df, on='customer_id')

In [192]:
train_i_df.shape

(78369, 19)

In [193]:
train_i_df = train_i_df.merge(coupon_item_preprocessed_df, on='coupon_id')

In [194]:
train_i_df.shape

(78369, 41)

In [195]:
train_i_df = train_i_df.merge(campaign_data_preprocessed_df, on='campaign_id')

In [196]:
train_i_df.shape

(78369, 47)

In [197]:
train_i_df.head(1)

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,no_of_transactions_in_campaign,no_of_transactions_in_campaign_with_odr_discount_gt0,no_of_transactions_in_campaign_with_coupon_discount_gt0,no_of_transactions_in_campaign_with_both_discount_gt0,no_of_times_this_coupon_redeemed_before,...,15,16,17,18,start_date,end_date,num_days,campaign_type_X,campaign_type_Y,num_days_campaign
0,1,13,27,1053,0,40,23,1,1,0,...,0.0,0.0,0.0,0.0,2013-05-19,2013-07-05,47,1,0,47


# Models

## Xgboost 

In [214]:
# get the labels
y_i = train_i_df['redemption_status'].values

train_i_df_fin = train_i_df.drop(['id', 'campaign_id', 'coupon_id', 'customer_id' ,'redemption_status', 'start_date', 'end_date'], axis=1)
x_i = train_i_df_fin.values


#
# Create training and validation sets
#
x_tr_i, x_val_i, y_tr_i, y_val_i = train_test_split(x_i, y_i, test_size=0.4, random_state=42, stratify=y_i)


In [214]:
model_i = XGBClassifier(n_jobs=6)

In [215]:
model_i.fit(x_tr_i, y_tr_i)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=6, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

This job started at around Thu Oct  3 20:01:42 UTC 2019  ( or around 1:30am)

In [216]:
from datetime import datetime
print(datetime.now())

2019-10-03 20:02:03.256704


In [217]:
y_pred_val = model_i.predict(x_val_i)

In [221]:
accuracy = accuracy_score(y_val_i, y_pred_val)

In [222]:
accuracy

0.9996810003827995

In [224]:
y_pred_val_prob = model_i.predict_proba(x_val_i)

### Predicting for test

In [201]:
test_data_df = pd.read_csv('./Data/test_QyjYwdj.csv')

In [202]:
test_data_df.head(2)

Unnamed: 0,id,campaign_id,coupon_id,customer_id
0,3,22,869,967
1,4,20,389,1566


In [204]:
test_data_i_df = get_features_df_related_to_customer_and_coupon_and_campaign(test_data_df, train_df, customer_transaction_df, campaign_data_df)


  2%|▏         | 1030/50226 [00:21<17:11, 47.69it/s]

KeyboardInterrupt: 

In [232]:
with open('./Data/test_data_i_df.pkl', 'wb') as f:
    pickle.dump(test_data_i_df, f)

In [205]:
with open('./Data/test_data_i_df.pkl', 'rb') as f:
    test_data_i_df = pickle.load(f)

In [206]:
test_data_i_df.shape

(50226, 9)

In [207]:
test_data_i_df = test_data_i_df.merge(customers_preprocessed_df, on='customer_id')
test_data_i_df = test_data_i_df.merge(coupon_item_preprocessed_df, on='coupon_id')
test_data_i_df = test_data_i_df.merge(campaign_data_preprocessed_df, on='campaign_id')

In [208]:
test_data_i_df.shape

(50226, 46)

In [209]:
test_data_i_df_fin = test_data_i_df.drop(['id', 'campaign_id', 'coupon_id', 'customer_id' , 'start_date', 'end_date'], axis=1)
x_test_i = test_data_i_df_fin.values


  2%|▏         | 1030/50226 [00:40<17:11, 47.69it/s]

In [210]:
x_test_i.shape

(50226, 40)

In [237]:
y_pred_test_prob_i = model_i.predict_proba(x_test_i)
y_pred_test_i = model_i.predict(x_test_i)

### Submitting

In [238]:
submit_i = pd.read_csv('./Data/sample_submission_Byiv0dS.csv')

In [239]:
submit_i.head(2)

Unnamed: 0,id,redemption_status
0,3,0
1,4,0


In [240]:
submit_i.shape

(50226, 2)

In [241]:
submit_i[submit_i['redemption_status']==1].shape

(0, 2)

In [242]:
submit_i['redemption_status'] = y_pred_test_i

In [243]:
submit_i[submit_i['redemption_status']==1].shape

(36, 2)

In [244]:
submit_i.shape

(50226, 2)

In [245]:
submit_i.to_csv('./Data/submit_i.csv', index=False)

In [246]:
submit_i_probs = pd.read_csv('./Data/sample_submission_Byiv0dS.csv')

In [247]:
submit_i_probs['redemption_status'] = y_pred_test_prob_i

In [248]:
submit_i_probs.to_csv('./Data/submit_i_probs.csv', index=False)

## Using lightgbm

In [None]:
x_tr_i, x_val_i, y_tr_i, y_val_i

In [215]:
x_tr_i.shape

(47021, 40)

In [221]:
sc = StandardScaler()

In [222]:
train_data = lightgbm.Dataset(sc.fit_transform(x_tr_i), label=y_tr_i)
test_data = lightgbm.Dataset(sc.fit_transform(x_val_i), label=y_val_i)


#
# Train the model
#

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)

[1]	valid_0's auc: 0.900343
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.979878
[3]	valid_0's auc: 0.977421
[4]	valid_0's auc: 0.975477
[5]	valid_0's auc: 0.973596
[6]	valid_0's auc: 0.980396
[7]	valid_0's auc: 0.98376
[8]	valid_0's auc: 0.983645
[9]	valid_0's auc: 0.982151
[10]	valid_0's auc: 0.980325
[11]	valid_0's auc: 0.980326
[12]	valid_0's auc: 0.982945
[13]	valid_0's auc: 0.984285
[14]	valid_0's auc: 0.983612
[15]	valid_0's auc: 0.984496
[16]	valid_0's auc: 0.984991
[17]	valid_0's auc: 0.984702
[18]	valid_0's auc: 0.985119
[19]	valid_0's auc: 0.9853
[20]	valid_0's auc: 0.985225
[21]	valid_0's auc: 0.985419
[22]	valid_0's auc: 0.985439
[23]	valid_0's auc: 0.985504
[24]	valid_0's auc: 0.985504
[25]	valid_0's auc: 0.985676
[26]	valid_0's auc: 0.985922
[27]	valid_0's auc: 0.986252
[28]	valid_0's auc: 0.986445
[29]	valid_0's auc: 0.986888
[30]	valid_0's auc: 0.987065
[31]	valid_0's auc: 0.987154
[32]	valid_0's auc: 0.987311
[33]	valid_0's auc: 0.

In [226]:
y_pred_test_prob_ii = model.predict(sc.fit_transform(x_test_i))

In [227]:
y_pred_test_prob_ii[0:10]

array([2.85128350e-05, 9.98392048e-06, 3.18304317e-06, 7.28918091e-06,
       2.55383498e-06, 6.70811169e-06, 3.27240643e-06, 4.15694713e-06,
       1.59437299e-06, 5.54072798e-06])

In [228]:
submit_ii = pd.read_csv('./Data/sample_submission_Byiv0dS.csv')
submit_ii['redemption_status'] = y_pred_test_prob_ii
submit_ii.to_csv('./Data/submit_ii_iv.csv', index=False)

## Using xgboost cross validation

In [235]:
kf = StratifiedKFold(n_splits=10, shuffle = True, random_state = 228)
fold_splits = kf.split(x_i, y_i)
cv_scores = []
pred_full_test = 0
pred_train = np.zeros((x_i.shape[0]))
i = 1
for dev_index, val_index in fold_splits:
    print('Started fold ' + str(i))
    dev_X, val_X = x_i[dev_index], x_i[val_index]
    dev_y, val_y = y_i[dev_index], y_i[val_index]
    
    # model train
    model = XGBClassifier(n_jobs=6)
    model.fit(dev_X, dev_y)
    pred_val_y = model.predict(val_X)
    # pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y
    cv_score = metrics.roc_auc_score(val_y, pred_val_y)
    cv_scores.append(cv_score)
    print(' cv score {}: {}'.format(i, cv_score))
    i += 1
label = 'xgboost-model'
print('{} cv scores : {}'.format(label, cv_scores))
print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
print('{} cv std score : {}'.format(label, np.std(cv_scores)))
# pred_full_test = pred_full_test / 5.0
results = {'label': label,
          'train': pred_train, 'test': pred_full_test,
          'cv': cv_scores}

Started fold 1
 cv score 1: 0.999806800618238
Started fold 2
 cv score 2: 0.986043770687331
Started fold 3
 cv score 3: 0.9998712004121586
Started fold 4
 cv score 4: 0.9995492014425553
Started fold 5
 cv score 5: 0.9999356002060794
Started fold 6
 cv score 6: 0.9999356002060794
Started fold 7
 cv score 7: 0.9997424008243174
Started fold 8
 cv score 8: 0.9998712004121586
Started fold 9
 cv score 9: 0.9927642861679827
Started fold 10
 cv score 10: 0.992991155761635
xgboost-model cv scores : [0.999806800618238, 0.986043770687331, 0.9998712004121586, 0.9995492014425553, 0.9999356002060794, 0.9999356002060794, 0.9997424008243174, 0.9998712004121586, 0.9927642861679827, 0.992991155761635]
xgboost-model cv mean score : 0.9970511216738535
xgboost-model cv std score : 0.004578704303954754


So, actually problem is with my features only; not with algorithm. Cuz here it gives 0.99 and there it becomes 0.5.

## Remove unused variables

In [299]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                    train_i_df:  3.3 GiB
                        x_tr_i:  2.6 GiB
                test_data_i_df:  2.1 GiB
                       x_val_i: 665.7 MiB
         item_brand_details_df: 391.0 MiB
  customer_transaction_data_df: 145.3 MiB
       customer_transaction_df: 70.7 MiB
   coupon_item_preprocessed_df: 47.3 MiB
                          _188: 12.0 MiB
                  item_data_df: 10.7 MiB


In [277]:
# remove unused variables
del _34
del train_i_df_fin
del test_data_i_df_fin
del X_train
del X_test

In [278]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                    train_i_df:  3.3 GiB
                        x_tr_i:  2.6 GiB
                test_data_i_df:  2.1 GiB
                       x_val_i: 665.7 MiB
         item_brand_details_df: 391.0 MiB
                          _159: 155.4 MiB
  customer_transaction_data_df: 145.3 MiB
       customer_transaction_df: 70.7 MiB
                          _119: 47.3 MiB
   coupon_item_preprocessed_df: 47.3 MiB


In [279]:
del _159
del _119

In [None]:
aa