In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold,StratifiedKFold, cross_val_score, train_test_split


import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier,Pool, cv

from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score, f1_score

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the labelled training rows and the unlabelled test rows.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

In [3]:
print(train.shape)
train.head()

(78369, 5)


Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [4]:
print(test.shape)
test.head()

(50226, 4)


Unnamed: 0,id,campaign_id,coupon_id,customer_id
0,3,22,869,967
1,4,20,389,1566
2,5,22,981,510
3,8,25,1069,361
4,10,17,498,811


In [5]:
# Campaign table: both date columns are parsed so the .dt accessor can be used on them.
camp = pd.read_csv(
    "campaign_data.csv",
    parse_dates=["start_date", "end_date"],
)
# Remaining lookup tables are read with pandas' default settings.
coup_item, cust_demo, cust_trans, item = (
    pd.read_csv(csv_path)
    for csv_path in (
        "coupon_item_mapping.csv",
        "customer_demographics.csv",
        "customer_transaction_data.csv",
        "item_data.csv",
    )
)

In [6]:
# For each id column, print how many distinct values occur in both train and test.
for id_col in ('customer_id', 'coupon_id', 'campaign_id'):
    shared_ids = np.intersect1d(train[id_col], test[id_col])
    print(len(shared_ids))

1096
81
0


In [7]:
# Number of distinct customers, coupons and campaigns in the training rows.
for id_col in ('customer_id', 'coupon_id', 'campaign_id'):
    print(train[id_col].nunique())

1428
866
18


In [8]:
print(camp.shape)
camp.head()

(28, 4)


Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,2013-10-21,2013-12-20
1,25,Y,2013-10-21,2013-11-22
2,20,Y,2013-07-09,2013-11-16
3,23,Y,2013-08-10,2013-11-15
4,21,Y,2013-09-16,2013-10-18


In [9]:
camp.campaign_id.nunique()

28

In [10]:
print(coup_item.shape)
coup_item.head()

(92663, 2)


Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [11]:
print(item.shape)
item.head()

(74066, 4)


Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [12]:
# Left-join the item attributes onto every coupon/item pair.
coup_item = pd.merge(coup_item, item, how='left', on='item_id')
coup_item.shape

(92663, 5)

In [13]:
coup_item.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,Local,Grocery
1,107,75,56,Local,Grocery
2,494,76,209,Established,Grocery
3,522,77,278,Established,Grocery
4,518,77,278,Established,Grocery


In [14]:
print(cust_demo.shape)
cust_demo.head()

(760, 7)


Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [15]:
print(cust_demo.customer_id.nunique())

760


In [16]:
print(cust_trans.shape)
cust_trans.head()

(1324566, 7)


Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [17]:
# item_id has served its purpose as the join key; keep only the coupon-level attributes.
coup_item = coup_item.drop(columns=['item_id'])
coup_item.head()

Unnamed: 0,coupon_id,brand,brand_type,category
0,105,56,Local,Grocery
1,107,56,Local,Grocery
2,494,209,Established,Grocery
3,522,278,Established,Grocery
4,518,278,Established,Grocery


In [18]:
camp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null object
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 976.0+ bytes


In [19]:
#preprocessing dates
def _iso_week(dates):
    """Return the ISO-8601 week number of every timestamp in ``dates``.

    Works on both old and new pandas: ``Series.dt.weekofyear`` was deprecated
    in pandas 1.1 and removed in 2.0 in favour of ``Series.dt.isocalendar()``.
    """
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert it to a plain
        # NumPy dtype (float when missing dates are present, int otherwise).
        return week.astype("float64") if week.isna().any() else week.astype("int64")
    return accessor.weekofyear


def process_date(df):
    """Add calendar features derived from ``start_date`` and ``end_date``.

    For each of the two datetime columns the following columns are added,
    prefixed with ``start_`` / ``end_``: ``weekday``, ``month``,
    ``weekofyear``, ``monthstart`` and ``monthend``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime64 columns ``start_date`` and ``end_date``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are added in place.
    """
    for prefix in ("start", "end"):
        dates = df[f"{prefix}_date"]
        df[f"{prefix}_weekday"] = dates.dt.weekday
        df[f"{prefix}_month"] = dates.dt.month
        df[f"{prefix}_weekofyear"] = _iso_week(dates)
        df[f"{prefix}_monthstart"] = dates.dt.is_month_start
        df[f"{prefix}_monthend"] = dates.dt.is_month_end
    return df

In [20]:
camp = process_date(camp)
camp.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,start_weekday,start_month,start_weekofyear,start_monthstart,start_monthend,end_weekday,end_month,end_weekofyear,end_monthstart,end_monthend
0,24,Y,2013-10-21,2013-12-20,0,10,43,False,False,4,12,51,False,False
1,25,Y,2013-10-21,2013-11-22,0,10,43,False,False,4,11,47,False,False
2,20,Y,2013-07-09,2013-11-16,1,7,28,False,False,5,11,46,False,False
3,23,Y,2013-08-10,2013-11-15,5,8,32,False,False,4,11,46,False,False
4,21,Y,2013-09-16,2013-10-18,0,9,38,False,False,4,10,42,False,False


In [21]:
camp.campaign_type.value_counts()

Y    22
X     6
Name: campaign_type, dtype: int64

In [22]:
# Binary-encode the campaign type: 'Y' -> 0, 'X' -> 1.
campaign_type_codes = {'Y': 0, 'X': 1}
camp['campaign_type'] = camp['campaign_type'].map(campaign_type_codes)

In [23]:
# Turn the four boolean month-boundary flags into 0/1 values.
for flag_col in ('start_monthstart', 'start_monthend', 'end_monthstart', 'end_monthend'):
    camp[flag_col] = camp[flag_col].map({False: 0, True: 1})

In [24]:
# The raw timestamps are dropped once the calendar features have been derived from them.
camp = camp.drop(columns=['start_date', 'end_date'])
camp.head()

Unnamed: 0,campaign_id,campaign_type,start_weekday,start_month,start_weekofyear,start_monthstart,start_monthend,end_weekday,end_month,end_weekofyear,end_monthstart,end_monthend
0,24,0,0,10,43,0,0,4,12,51,0,0
1,25,0,0,10,43,0,0,4,11,47,0,0
2,20,0,1,7,28,0,0,5,11,46,0,0
3,23,0,5,8,32,0,0,4,11,46,0,0
4,21,0,0,9,38,0,0,4,10,42,0,0


In [25]:
# Left-join the campaign features onto the training rows; the row count must stay the same.
train1 = pd.merge(train, camp, how='left', on='campaign_id')
print("train1 shape", train1.shape, "train shape", train.shape)
train1.head()

train1 shape (78369, 16) train shape (78369, 5)


Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_weekday,start_month,start_weekofyear,start_monthstart,start_monthend,end_weekday,end_month,end_weekofyear,end_monthstart,end_monthend
0,1,13,27,1053,0,1,6,5,20,0,0,1,5,19,0,0
1,2,13,116,48,0,1,6,5,20,0,0,1,5,19,0,0
2,6,9,635,205,0,0,6,11,44,0,0,2,12,49,0,0
3,7,13,644,1050,0,1,6,5,20,0,0,1,5,19,0,0
4,9,8,1017,1489,0,1,5,2,7,0,0,5,5,18,0,0


In [26]:
# Same campaign join for the test rows.
test1 = pd.merge(test, camp, how='left', on='campaign_id')
print("test1 shape", test1.shape, "test shape", test.shape)
test1.head()

test1 shape (50226, 15) test shape (50226, 4)


Unnamed: 0,id,campaign_id,coupon_id,customer_id,campaign_type,start_weekday,start_month,start_weekofyear,start_monthstart,start_monthend,end_weekday,end_month,end_weekofyear,end_monthstart,end_monthend
0,3,22,869,967,1,0,9,38,0,0,4,10,42,0,0
1,4,20,389,1566,0,1,7,28,0,0,5,11,46,0,0
2,5,22,981,510,1,0,9,38,0,0,4,10,42,0,0
3,8,25,1069,361,0,0,10,43,0,0,4,11,47,0,0
4,10,17,498,811,0,0,7,31,0,0,4,8,35,0,0


In [27]:
coup_item.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92663 entries, 0 to 92662
Data columns (total 4 columns):
coupon_id     92663 non-null int64
brand         92663 non-null int64
brand_type    92663 non-null object
category      92663 non-null object
dtypes: int64(2), object(2)
memory usage: 3.5+ MB


In [28]:
coup_item.head()

Unnamed: 0,coupon_id,brand,brand_type,category
0,105,56,Local,Grocery
1,107,56,Local,Grocery
2,494,209,Established,Grocery
3,522,278,Established,Grocery
4,518,278,Established,Grocery


In [29]:
def _count_per_coupon(column):
    """Count, per coupon, how many mapped items fall into each value of ``column``.

    Returns one row per coupon_id with one ``<column>_<value>`` column per
    distinct value; combinations that never occur are filled with 0.
    """
    counts = coup_item.groupby('coupon_id')[column].value_counts().unstack()
    counts = counts.add_prefix(f'{column}_').rename_axis(None, axis=1)
    return counts.reset_index().fillna(0)


x = _count_per_coupon('brand_type')
y = _count_per_coupon('category')
coup_itemF = x.merge(y, how='left', on='coupon_id')
coup_itemF.head()

Unnamed: 0,coupon_id,brand_type_Established,brand_type_Local,category_Bakery,"category_Dairy, Juices & Snacks",category_Flowers & Plants,category_Garden,category_Grocery,category_Meat,category_Miscellaneous,category_Natural Products,category_Packaged Meat,category_Pharmaceutical,category_Prepared Food,category_Restauarant,category_Salads,category_Seafood,category_Skin & Hair Care,category_Travel,category_Vegetables (cut)
0,1,39.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,17.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,24.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Attach the per-coupon item counts to the training rows.
train1 = pd.merge(train1, coup_itemF, how='left', on='coupon_id')
print(train1.shape)

(78369, 35)


In [31]:
# Attach the per-coupon item counts to the test rows.
test1 = pd.merge(test1, coup_itemF, how='left', on='coupon_id')
print(test1.shape)

(50226, 34)


In [32]:
train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78369 entries, 0 to 78368
Data columns (total 35 columns):
id                                 78369 non-null int64
campaign_id                        78369 non-null int64
coupon_id                          78369 non-null int64
customer_id                        78369 non-null int64
redemption_status                  78369 non-null int64
campaign_type                      78369 non-null int64
start_weekday                      78369 non-null int64
start_month                        78369 non-null int64
start_weekofyear                   78369 non-null int64
start_monthstart                   78369 non-null int64
start_monthend                     78369 non-null int64
end_weekday                        78369 non-null int64
end_month                          78369 non-null int64
end_weekofyear                     78369 non-null int64
end_monthstart                     78369 non-null int64
end_monthend                       78369 non-null i

In [33]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50226 entries, 0 to 50225
Data columns (total 34 columns):
id                                 50226 non-null int64
campaign_id                        50226 non-null int64
coupon_id                          50226 non-null int64
customer_id                        50226 non-null int64
campaign_type                      50226 non-null int64
start_weekday                      50226 non-null int64
start_month                        50226 non-null int64
start_weekofyear                   50226 non-null int64
start_monthstart                   50226 non-null int64
start_monthend                     50226 non-null int64
end_weekday                        50226 non-null int64
end_month                          50226 non-null int64
end_weekofyear                     50226 non-null int64
end_monthstart                     50226 non-null int64
end_monthend                       50226 non-null int64
brand_type_Established             50226 non-null f

In [34]:
cust_trans.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [35]:
# Aggregation recipes for the per-customer transaction summary.
cat_agg = ['count', 'nunique']
num_agg = ['min', 'mean', 'max', 'sum']
# Quantity is only summed; every monetary column gets the full numeric recipe.
agg_col = {
    'quantity': ['sum'],
    **{money_col: num_agg for money_col in ('selling_price', 'other_discount', 'coupon_discount')},
}

In [36]:
# Collapse the transaction log to one row per customer using the recipes in agg_col.
per_customer = cust_trans.groupby('customer_id')
cust_trans1 = per_customer.agg(agg_col)
print(cust_trans1.shape)
cust_trans1.head()

(1582, 13)


Unnamed: 0_level_0,quantity,selling_price,selling_price,selling_price,selling_price,other_discount,other_discount,other_discount,other_discount,coupon_discount,coupon_discount,coupon_discount,coupon_discount
Unnamed: 0_level_1,sum,min,mean,max,sum,min,mean,max,sum,min,mean,max,sum
customer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,1227,12.11,94.001842,445.25,98513.93,-372.23,-16.250382,0.0,-17030.4,-106.86,-2.019876,0.0,-2116.83
2,474,14.25,102.864033,640.8,43100.03,-312.03,-16.83043,0.0,-7051.95,-89.05,-0.595084,0.0,-249.34
3,8163,8.9,103.617404,862.0,73050.27,-463.06,-22.714227,0.0,-16013.53,-142.48,-3.091546,0.0,-2179.54
4,280,14.25,154.423727,1330.05,33973.22,-390.04,-13.305409,0.0,-2927.19,-89.05,-0.404773,0.0,-89.05
5,93353,12.11,130.827146,1485.35,103615.1,-216.21,-13.657917,0.0,-10817.07,-71.24,-0.114684,0.0,-90.83


In [37]:
# Flatten the (column, statistic) MultiIndex into names such as 'selling_price_mean',
# then turn customer_id back into an ordinary column.
flat_names = cust_trans1.columns.map(lambda parts: '_'.join(parts).strip())
cust_trans1.columns = flat_names
cust_trans1 = cust_trans1.reset_index()
cust_trans1.head()

Unnamed: 0,customer_id,quantity_sum,selling_price_min,selling_price_mean,selling_price_max,selling_price_sum,other_discount_min,other_discount_mean,other_discount_max,other_discount_sum,coupon_discount_min,coupon_discount_mean,coupon_discount_max,coupon_discount_sum
0,1,1227,12.11,94.001842,445.25,98513.93,-372.23,-16.250382,0.0,-17030.4,-106.86,-2.019876,0.0,-2116.83
1,2,474,14.25,102.864033,640.8,43100.03,-312.03,-16.83043,0.0,-7051.95,-89.05,-0.595084,0.0,-249.34
2,3,8163,8.9,103.617404,862.0,73050.27,-463.06,-22.714227,0.0,-16013.53,-142.48,-3.091546,0.0,-2179.54
3,4,280,14.25,154.423727,1330.05,33973.22,-390.04,-13.305409,0.0,-2927.19,-89.05,-0.404773,0.0,-89.05
4,5,93353,12.11,130.827146,1485.35,103615.1,-216.21,-13.657917,0.0,-10817.07,-71.24,-0.114684,0.0,-90.83


In [38]:
cust_trans1.shape

(1582, 14)

In [39]:
cust_demo.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [41]:
cust_demo.income_bracket.value_counts(dropna = False)

5     187
4     165
6      88
3      70
2      68
1      59
8      37
7      32
9      29
12     10
10     10
11      5
Name: income_bracket, dtype: int64

In [42]:
# Left join: customers without a demographics record keep NaN in those columns.
cust_trans1 = pd.merge(cust_trans1, cust_demo, how='left', on='customer_id')
cust_trans1.shape

(1582, 20)

In [43]:
cust_trans1.head()

Unnamed: 0,customer_id,quantity_sum,selling_price_min,selling_price_mean,selling_price_max,selling_price_sum,other_discount_min,other_discount_mean,other_discount_max,other_discount_sum,coupon_discount_min,coupon_discount_mean,coupon_discount_max,coupon_discount_sum,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,1227,12.11,94.001842,445.25,98513.93,-372.23,-16.250382,0.0,-17030.4,-106.86,-2.019876,0.0,-2116.83,70+,Married,0.0,2.0,,4.0
1,2,474,14.25,102.864033,640.8,43100.03,-312.03,-16.83043,0.0,-7051.95,-89.05,-0.595084,0.0,-249.34,,,,,,
2,3,8163,8.9,103.617404,862.0,73050.27,-463.06,-22.714227,0.0,-16013.53,-142.48,-3.091546,0.0,-2179.54,,,,,,
3,4,280,14.25,154.423727,1330.05,33973.22,-390.04,-13.305409,0.0,-2927.19,-89.05,-0.404773,0.0,-89.05,,,,,,
4,5,93353,12.11,130.827146,1485.35,103615.1,-216.21,-13.657917,0.0,-10817.07,-71.24,-0.114684,0.0,-90.83,,,,,,


In [46]:
cust_trans1.age_range.value_counts(dropna = False)

46-55    405
36-45    350
26-35    264
56-70    205
70+      197
18-25    161
Name: age_range, dtype: int64

In [45]:
# Fill missing age ranges by drawing uniformly from the known brackets.
# The draw uses its own generator with a fixed seed so the imputed values are the
# same after every kernel restart (the module-level random state is never seeded).
import random
age_rng = random.Random(1993)
age_brackets = ['46-55', '36-45', '26-35', '70+', '56-70', '18-25']
cust_trans1["age_range"] = cust_trans1["age_range"].apply(
    lambda x: age_rng.choice(age_brackets) if pd.isna(x) else x
)

In [47]:
cust_trans1.marital_status.value_counts(dropna=False)

NaN        1151
Married     317
Single      114
Name: marital_status, dtype: int64

In [48]:
# Fill missing marital status with a coin flip between the two known values.
# A separately seeded generator keeps the result reproducible across runs.
import random
marital_rng = random.Random(1994)
marital_options = ['Married', 'Single']
cust_trans1["marital_status"] = cust_trans1["marital_status"].apply(
    lambda x: marital_rng.choice(marital_options) if pd.isna(x) else x
)

In [51]:
cust_trans1.rented.value_counts(dropna = False)

0.0    1137
1.0     445
Name: rented, dtype: int64

In [50]:
# Fill a missing rented flag with a random 0/1.
# A separately seeded generator keeps the result reproducible across runs.
import random
rented_rng = random.Random(1995)
cust_trans1["rented"] = cust_trans1["rented"].apply(
    lambda x: rented_rng.choice([0, 1]) if pd.isna(x) else x
)

In [101]:
#cust_trans1.rented.fillna(0, inplace = True)

In [52]:
cust_trans1.family_size.value_counts(dropna = False)

NaN    822
2      303
1      248
3      104
5+      57
4       48
Name: family_size, dtype: int64

In [53]:
cust_trans1.family_size.value_counts().index.values

array(['2', '1', '3', '5+', '4'], dtype=object)

In [54]:
# Fill missing family sizes with a uniformly random observed category.
# The candidate list is built once (it used to be recomputed via value_counts() for
# every row) and sorted so that the seeded draw is reproducible.
family_rng = random.Random(1996)
family_sizes = sorted(cust_trans1["family_size"].dropna().unique())
cust_trans1["family_size"] = cust_trans1["family_size"].apply(
    lambda x: family_rng.choice(family_sizes) if pd.isna(x) else x
)

In [55]:
cust_trans1.no_of_children.value_counts(dropna = False)

NaN    1360
1       107
3+       60
2        55
Name: no_of_children, dtype: int64

In [56]:
# Fill missing child counts with a uniformly random observed category.
# The candidate list is built once (it used to be recomputed via value_counts() for
# every row) and sorted so that the seeded draw is reproducible.
# NOTE(review): the value_counts output for this column shows most rows are NaN, so
# the feature is largely random after this step; confirm it is worth keeping.
children_rng = random.Random(1997)
children_options = sorted(cust_trans1["no_of_children"].dropna().unique())
cust_trans1["no_of_children"] = cust_trans1["no_of_children"].apply(
    lambda x: children_rng.choice(children_options) if pd.isna(x) else x
)

In [57]:
cust_trans1.income_bracket.value_counts(dropna = False)

NaN      822
 5.0     187
 4.0     165
 6.0      88
 3.0      70
 2.0      68
 1.0      59
 8.0      37
 7.0      32
 9.0      29
 10.0     10
 12.0     10
 11.0      5
Name: income_bracket, dtype: int64

In [58]:
# Fill missing income brackets with a uniformly random observed bracket.
# The candidate list is built once (it used to be recomputed via value_counts() for
# every row) and sorted so that the seeded draw is reproducible.
income_rng = random.Random(1998)
income_brackets = sorted(cust_trans1["income_bracket"].dropna().unique())
cust_trans1["income_bracket"] = cust_trans1["income_bracket"].apply(
    lambda x: income_rng.choice(income_brackets) if pd.isna(x) else x
)

In [59]:
# One-hot encode the multi-level demographic columns; the first level of each is
# dropped and acts as the reference category.
onehot_cols = ['income_bracket', 'no_of_children', 'family_size', 'age_range']
cust_trans1 = pd.get_dummies(cust_trans1, columns=onehot_cols, drop_first=True)
cust_trans1.head()

Unnamed: 0,customer_id,quantity_sum,selling_price_min,selling_price_mean,selling_price_max,selling_price_sum,other_discount_min,other_discount_mean,other_discount_max,other_discount_sum,...,no_of_children_3+,family_size_2,family_size_3,family_size_4,family_size_5+,age_range_26-35,age_range_36-45,age_range_46-55,age_range_56-70,age_range_70+
0,1,1227,12.11,94.001842,445.25,98513.93,-372.23,-16.250382,0.0,-17030.4,...,0,1,0,0,0,0,0,0,0,1
1,2,474,14.25,102.864033,640.8,43100.03,-312.03,-16.83043,0.0,-7051.95,...,0,1,0,0,0,0,0,0,0,0
2,3,8163,8.9,103.617404,862.0,73050.27,-463.06,-22.714227,0.0,-16013.53,...,1,0,0,1,0,1,0,0,0,0
3,4,280,14.25,154.423727,1330.05,33973.22,-390.04,-13.305409,0.0,-2927.19,...,1,0,1,0,0,0,1,0,0,0
4,5,93353,12.11,130.827146,1485.35,103615.1,-216.21,-13.657917,0.0,-10817.07,...,1,0,0,0,0,0,1,0,0,0


In [60]:
# marital_status has two levels, so this leaves a single indicator column.
cust_trans1 = pd.get_dummies(
    cust_trans1,
    columns=['marital_status'],
    drop_first=True,
)

In [61]:
# Final training matrix: campaign + coupon features joined with the per-customer features.
trainF = pd.merge(train1, cust_trans1, how='left', on='customer_id')
print(trainF.shape)
trainF.head()

(78369, 72)


Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_weekday,start_month,start_weekofyear,start_monthstart,...,family_size_2,family_size_3,family_size_4,family_size_5+,age_range_26-35,age_range_36-45,age_range_46-55,age_range_56-70,age_range_70+,marital_status_Single
0,1,13,27,1053,0,1,6,5,20,0,...,0,0,0,0,0,0,1,0,0,0
1,2,13,116,48,0,1,6,5,20,0,...,1,0,0,0,0,1,0,0,0,0
2,6,9,635,205,0,0,6,11,44,0,...,1,0,0,0,0,0,1,0,0,0
3,7,13,644,1050,0,1,6,5,20,0,...,0,0,0,0,0,0,1,0,0,1
4,9,8,1017,1489,0,1,5,2,7,0,...,1,0,0,0,0,0,1,0,0,0


In [63]:
# Final test matrix, built with the same join as the training matrix.
testF = pd.merge(test1, cust_trans1, how='left', on='customer_id')
print(testF.shape)
testF.head()

(50226, 71)


Unnamed: 0,id,campaign_id,coupon_id,customer_id,campaign_type,start_weekday,start_month,start_weekofyear,start_monthstart,start_monthend,...,family_size_2,family_size_3,family_size_4,family_size_5+,age_range_26-35,age_range_36-45,age_range_46-55,age_range_56-70,age_range_70+,marital_status_Single
0,3,22,869,967,1,0,9,38,0,0,...,0,0,0,0,0,1,0,0,0,1
1,4,20,389,1566,0,1,7,28,0,0,...,1,0,0,0,1,0,0,0,0,1
2,5,22,981,510,1,0,9,38,0,0,...,1,0,0,0,1,0,0,0,0,0
3,8,25,1069,361,0,0,10,43,0,0,...,0,0,0,0,0,0,0,0,0,1
4,10,17,498,811,0,0,7,31,0,0,...,0,0,0,0,1,0,0,0,0,0


In [65]:
trainF.isnull().sum().values.sum(), testF.isnull().sum().values.sum()

(0, 0)

In [66]:
trainF.redemption_status.value_counts()
#clearly a Class Imbalance problem

0    77640
1      729
Name: redemption_status, dtype: int64

In [67]:
 trainF.drop(columns=['customer_id', 'campaign_id', 'coupon_id', 'id'], inplace=True)

In [68]:
# Remove the identifier columns from the test matrix in place.
testF.drop(columns=['customer_id', 'campaign_id', 'coupon_id', 'id'], inplace=True)

In [69]:
testF.shape, trainF.shape

((50226, 67), (78369, 68))

In [71]:
# Discounts are stored as negative amounts; multiply by -1 so they become positive magnitudes.
train_discount_cols = [
    'coupon_discount_max', 'coupon_discount_min', 'coupon_discount_mean',
    'other_discount_max', 'other_discount_min', 'other_discount_mean',
]
trainF[train_discount_cols] = trainF[train_discount_cols] * -1

In [72]:
# Same sign flip for the test matrix.
test_discount_cols = [
    'coupon_discount_max', 'coupon_discount_min', 'coupon_discount_mean',
    'other_discount_max', 'other_discount_min', 'other_discount_mean',
]
testF[test_discount_cols] = testF[test_discount_cols] * -1

In [73]:
# Drop the two *_max discount columns from both matrices, in place.
for feature_frame in (trainF, testF):
    feature_frame.drop(['coupon_discount_max', 'other_discount_max'], axis=1, inplace=True)

In [79]:
# Make sure the label column is present before trainF is written to disk.
# `target` is only created in a later cell, so an unconditional assignment raises
# NameError when the notebook is run top to bottom. Restore the column only when
# it has already been split off (i.e. this cell is re-run after that later cell).
if 'redemption_status' not in trainF.columns:
    trainF['redemption_status'] = target

In [80]:
trainF.to_csv("trainF1.csv", index = False)
testF.to_csv('testF1.csv', index = False)

In [81]:
trainF.shape, testF.shape

((78369, 66), (50226, 65))

In [84]:
# Separate the label from the features: pop() returns the column and removes it from trainF.
target = trainF.pop('redemption_status')

In [161]:
# Stratified 75/25 hold-out split.
# NOTE(review): x_train / y_train are reassigned inside the CV loops below and
# x_val / y_val are not referenced in the visible cells; confirm this split is still needed.
holdout_parts = train_test_split(
    trainF,
    target,
    test_size=0.25,
    random_state=1993,
    stratify=target,
)
x_train, x_val, y_train, y_val = holdout_parts

In [85]:
# 10-fold stratified CV for the first LightGBM model.
# err        -> validation AUC of every fold
# y_pred_tot -> test-set probabilities of every fold (averaged for the submission)
# NOTE(review): the validation split of each fold also drives early stopping, so
# the AUC reported here is likely optimistic.
err = []
y_pred_tot = []
fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1993)
for train_index, test_index in fold.split(trainF, target):
    x_train, x_test = trainF.iloc[train_index], trainF.iloc[test_index]
    # split() yields positions, so the labels are indexed positionally as well
    # instead of relying on the index labels happening to equal 0..n-1.
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    m = LGBMClassifier(n_estimators=3000, random_state=1993, learning_rate=0.03,
                       colsample_bytree=0.2, objective='binary', scale_pos_weight=1)
    m.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='auc',
          early_stopping_rounds=200, verbose=200)
    preds = m.predict_proba(x_test)[:, -1]
    fold_auc = roc_auc_score(y_test, preds)  # computed once, then printed and stored
    print("err: ", fold_auc)
    err.append(fold_auc)
    y_pred_tot.append(m.predict_proba(testF)[:, -1])

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's binary_logloss: 0.0348675	valid_0's auc: 0.941437
[400]	valid_0's binary_logloss: 0.0345886	valid_0's auc: 0.94291
Early stopping, best iteration is:
[344]	valid_0's binary_logloss: 0.0344735	valid_0's auc: 0.943739
err:  0.9437392814041625
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's binary_logloss: 0.0333703	valid_0's auc: 0.936601
[400]	valid_0's binary_logloss: 0.0332009	valid_0's auc: 0.93338
Early stopping, best iteration is:
[246]	valid_0's binary_logloss: 0.0330744	valid_0's auc: 0.937274
err:  0.9372737185323199
Training until validation scores don't improve for 200 rounds.
[200]	valid_0's binary_logloss: 0.0327429	valid_0's auc: 0.948047
[400]	valid_0's binary_logloss: 0.0327214	valid_0's auc: 0.943171
Early stopping, best iteration is:
[233]	valid_0's binary_logloss: 0.0325242	valid_0's auc: 0.94832
err:  0.9483204886621075
Training until validation scores don't i

In [86]:
np.mean(err,0)

0.9398261178608369

In [164]:
submission = pd.read_csv('sample_submission.csv')
print(submission.shape)
submission.head()

(50226, 2)


Unnamed: 0,id,redemption_status
0,3,0
1,4,0
2,5,0
3,8,0
4,10,0


In [87]:
# Submission for the first LightGBM model: mean of the per-fold test probabilities.
submit = pd.DataFrame({
    'id': test['id'],
    'redemption_status': np.mean(y_pred_tot, axis=0),
})
submit.head()

Unnamed: 0,id,redemption_status
0,3,0.045776
1,4,0.001238
2,5,0.02446
3,8,9.7e-05
4,10,0.000193


In [88]:
submit.to_csv('LGBC.csv',index=False)

In [89]:
# 10-fold stratified CV for CatBoost.
# err           -> validation AUC of every fold
# y_pred_tot_cb -> test-set probabilities of every fold (averaged for the submission)
# NOTE(review): the validation split of each fold also drives early stopping, so
# the AUC reported here is likely optimistic.
err = []
y_pred_tot_cb = []
fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1993)
for train_index, test_index in fold.split(trainF, target):
    x_train, x_test = trainF.iloc[train_index], trainF.iloc[test_index]
    # split() yields positions, so the labels are indexed positionally as well
    # instead of relying on the index labels happening to equal 0..n-1.
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    m = CatBoostClassifier(n_estimators=2500, random_state=1993, eval_metric='AUC',
                           learning_rate=0.03)
    m.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=200,
          verbose=200)
    preds = m.predict_proba(x_test)[:, -1]
    fold_auc = roc_auc_score(y_test, preds)  # computed once, then printed and stored
    print("err: ", fold_auc)
    err.append(fold_auc)
    y_pred_tot_cb.append(m.predict_proba(testF)[:, -1])

0:	test: 0.8229306	best: 0.8229306 (0)	total: 148ms	remaining: 6m 9s
200:	test: 0.9333983	best: 0.9333983 (200)	total: 22.3s	remaining: 4m 14s
400:	test: 0.9348803	best: 0.9348803 (400)	total: 41.6s	remaining: 3m 37s
600:	test: 0.9353603	best: 0.9355261 (594)	total: 59.9s	remaining: 3m 9s
800:	test: 0.9370576	best: 0.9373399 (793)	total: 1m 19s	remaining: 2m 48s
1000:	test: 0.9372993	best: 0.9376398 (909)	total: 1m 37s	remaining: 2m 25s
1200:	test: 0.9376434	best: 0.9378957 (1094)	total: 1m 53s	remaining: 2m 3s
1400:	test: 0.9370664	best: 0.9381709 (1273)	total: 2m 10s	remaining: 1m 42s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9381709047
bestIteration = 1273

Shrink model to first 1274 iterations.
err:  0.93817090470242
0:	test: 0.8260262	best: 0.8260262 (0)	total: 87.5ms	remaining: 3m 38s
200:	test: 0.9274117	best: 0.9274117 (200)	total: 18.1s	remaining: 3m 27s
400:	test: 0.9345698	best: 0.9345698 (400)	total: 34.9s	remaining: 3m 2s
600:	test: 0.9358049	bes

In [90]:
np.mean(err,0)

0.9352061592276572

In [175]:
# Submission for CatBoost: mean of the per-fold test probabilities.
submit = pd.DataFrame({
    'id': test['id'],
    'redemption_status': np.mean(y_pred_tot_cb, axis=0),
})
submit.head()

Unnamed: 0,id,redemption_status
0,3,0.081369
1,4,0.001885
2,5,0.043714
3,8,0.000371
4,10,0.00023


In [176]:
submit.to_csv('Catboost.csv',index=False)

In [177]:
# Equal-weight blend of the CatBoost and first LightGBM fold-averaged predictions.
catboost_lgbm_blend = (np.mean(y_pred_tot_cb, axis=0) + np.mean(y_pred_tot, axis=0)) / 2
submit = pd.DataFrame({
    'id': test['id'],
    'redemption_status': catboost_lgbm_blend,
})
submit.head()

Unnamed: 0,id,redemption_status
0,3,0.067255
1,4,0.001618
2,5,0.034239
3,8,0.000232
4,10,0.000318


In [178]:
submit.to_csv('Catboost_lgbc.csv',index=False)

In [91]:
# 10-fold stratified CV for the second LightGBM configuration.
# err         -> validation AUC of every fold
# y_pred_tot1 -> test-set probabilities of every fold (averaged for the submission)
# NOTE(review): the validation split of each fold also drives early stopping, so
# the AUC reported here is likely optimistic.
err = []
y_pred_tot1 = []
fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1993)

for i, (train_index, test_index) in enumerate(fold.split(trainF, target), start=1):
    x_train, x_test = trainF.iloc[train_index], trainF.iloc[test_index]
    # split() yields positions, so the labels are indexed positionally as well
    # instead of relying on the index labels happening to equal 0..n-1.
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    lgbm = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.07,
                          n_estimators=1000, min_child_weight=0.01, colsample_bytree=0.5,
                          random_state=1993)

    lgbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='auc',
             early_stopping_rounds=100, verbose=100)
    preds = lgbm.predict_proba(x_test)[:, -1]

    fold_auc = roc_auc_score(y_test, preds)  # computed once, then printed and stored
    print("ROC_AUC Score: ", fold_auc)
    err.append(fold_auc)
    y_pred_tot1.append(lgbm.predict_proba(testF)[:, -1])
    print(f'--------------------Fold {i} completed !!!------------------')

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0336783	valid_0's auc: 0.950842
Early stopping, best iteration is:
[73]	valid_0's binary_logloss: 0.0335372	valid_0's auc: 0.952268
ROC_AUC Score:  0.9522682842483396
--------------------Fold 1 completed !!!------------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0343009	valid_0's auc: 0.937591
Early stopping, best iteration is:
[76]	valid_0's binary_logloss: 0.0340637	valid_0's auc: 0.938451
ROC_AUC Score:  0.9384505586020482
--------------------Fold 2 completed !!!------------------
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.0345297	valid_0's auc: 0.945408
Early stopping, best iteration is:
[89]	valid_0's binary_logloss: 0.0343548	valid_0's auc: 0.94516
ROC_AUC Score:  0.9451596056262483
--------------------Fold 3 completed !!!------------------
Training until validation scores

In [92]:
np.mean(err,0)

0.9417077999520872

In [180]:
# Submission for the second LightGBM configuration: mean of the per-fold test probabilities.
submit = pd.DataFrame({
    'id': test['id'],
    'redemption_status': np.mean(y_pred_tot1, axis=0),
})
submit.head()

Unnamed: 0,id,redemption_status
0,3,0.05264
1,4,0.001404
2,5,0.0437
3,8,0.000345
4,10,0.000424


In [181]:
submit.to_csv('lgbc2.csv',index=False)

In [96]:
# Blend of CatBoost and the second LightGBM configuration.
# NOTE(review): y_pred_tot1 is added twice, so the second LightGBM model carries 2/3
# of the weight and y_pred_tot (first LightGBM model) is not part of this blend.
# Confirm whether a 1:2 weighting or a three-model average was intended.
catboost_avg = np.mean(y_pred_tot_cb, axis=0)
lgbm2_avg = np.mean(y_pred_tot1, axis=0)
submit = pd.DataFrame({
    'id': test['id'],
    'redemption_status': (catboost_avg + lgbm2_avg + lgbm2_avg) / 3,
})
submit.head()

Unnamed: 0,id,redemption_status
0,3,0.068933
1,4,0.001711
2,5,0.049626
3,8,0.000251
4,10,0.000198


In [97]:
submit.to_csv('lgbc2_catboost.csv',index=False)