In [1]:
import pandas as pd
import numpy as np

In [2]:
campaign_data = pd.DataFrame(pd.read_csv('data/campaign_data.csv'))
item_data = pd.DataFrame(pd.read_csv('data/item_data.csv'))
coupon_item_mapping = pd.DataFrame(pd.read_csv('data/coupon_item_mapping.csv'))
ct = pd.DataFrame(pd.read_csv('data/customer_transaction_data.csv'))
cd = pd.DataFrame(pd.read_csv('data/customer_demographics.csv'))

train = pd.DataFrame(pd.read_csv('data/train.csv'))
test = pd.DataFrame(pd.read_csv('data/test.csv'))

In [3]:
train.shape, test.shape

((78369, 5), (50226, 4))

### merging test and train data

In [4]:
train1 = train.append(test, ignore_index = True, sort=False)
print(train1.shape)

(128595, 5)


In [5]:
print(len(train.customer_id.unique()))
print(len(test.customer_id.unique()))
print(len(train1.customer_id.unique()))

1428
1250
1582


### Feature engineering campaign_data

In [6]:
print(campaign_data.shape)
print(campaign_data.dtypes)
campaign_data.head()

(28, 4)
campaign_id       int64
campaign_type    object
start_date       object
end_date         object
dtype: object


Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,21/10/13,20/12/13
1,25,Y,21/10/13,22/11/13
2,20,Y,07/09/13,16/11/13
3,23,Y,08/10/13,15/11/13
4,21,Y,16/09/13,18/10/13


In [7]:
campaign_data['start-date'] = pd.to_datetime(campaign_data['start_date'], format='%d/%m/%y', dayfirst=True)
campaign_data['end-date'] = pd.to_datetime(campaign_data['end_date'], format='%d/%m/%y', dayfirst=True)
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,start-date,end-date
0,24,Y,21/10/13,20/12/13,2013-10-21,2013-12-20
1,25,Y,21/10/13,22/11/13,2013-10-21,2013-11-22
2,20,Y,07/09/13,16/11/13,2013-09-07,2013-11-16
3,23,Y,08/10/13,15/11/13,2013-10-08,2013-11-15
4,21,Y,16/09/13,18/10/13,2013-09-16,2013-10-18


In [8]:
campaign_data['date_diff'] = (campaign_data['end-date'] - campaign_data['start-date']).dt.days
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,start-date,end-date,date_diff
0,24,Y,21/10/13,20/12/13,2013-10-21,2013-12-20,60
1,25,Y,21/10/13,22/11/13,2013-10-21,2013-11-22,32
2,20,Y,07/09/13,16/11/13,2013-09-07,2013-11-16,70
3,23,Y,08/10/13,15/11/13,2013-10-08,2013-11-15,38
4,21,Y,16/09/13,18/10/13,2013-09-16,2013-10-18,32


### merging campaign_data in train1

In [9]:
train2 = train1.merge(campaign_data, on='campaign_id', how='left')
print(train2.shape)
train2.head()

(128595, 11)


Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_date,end_date,start-date,end-date,date_diff
0,1,13,27,1053,0.0,X,19/05/13,05/07/13,2013-05-19,2013-07-05,47
1,2,13,116,48,0.0,X,19/05/13,05/07/13,2013-05-19,2013-07-05,47
2,6,9,635,205,0.0,Y,11/03/13,12/04/13,2013-03-11,2013-04-12,32
3,7,13,644,1050,0.0,X,19/05/13,05/07/13,2013-05-19,2013-07-05,47
4,9,8,1017,1489,0.0,X,16/02/13,05/04/13,2013-02-16,2013-04-05,48


In [10]:
train2 = train2.drop(columns=['start_date','end_date','start-date','end-date'])
train2.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,date_diff
0,1,13,27,1053,0.0,X,47
1,2,13,116,48,0.0,X,47
2,6,9,635,205,0.0,Y,32
3,7,13,644,1050,0.0,X,47
4,9,8,1017,1489,0.0,X,48


### merging Item Data in Coupon Item Mapping

In [11]:
print(item_data.shape)
item_data.head()

(74066, 4)


Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [12]:
print(coupon_item_mapping.shape)
coupon_item_mapping.head()

(92663, 2)


Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [13]:
coupon_item_mapping1 = coupon_item_mapping.merge(item_data, on='item_id', how='left')

In [14]:
print(coupon_item_mapping1.shape)
coupon_item_mapping1.head()

(92663, 5)


Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,Local,Grocery
1,107,75,56,Local,Grocery
2,494,76,209,Established,Grocery
3,522,77,278,Established,Grocery
4,518,77,278,Established,Grocery


### Feature engineering coupon_item_mapping1

In [15]:
print(len(coupon_item_mapping1.coupon_id.unique()))
print(len(coupon_item_mapping1.item_id.unique()))
print(len(coupon_item_mapping1.brand.unique()))
print(len(coupon_item_mapping1.brand_type.unique()))
print(len(coupon_item_mapping1.category.unique()))

1116
36289
2555
2
17


In [16]:
coupon_item_mapping1.dtypes

coupon_id      int64
item_id        int64
brand          int64
brand_type    object
category      object
dtype: object

In [17]:
for i in ['brand', 'brand_type', 'category']:
    coupon_item_mapping1[i]=coupon_item_mapping1[i].astype('category')

In [18]:
coupon_item_mapping1.dtypes

coupon_id        int64
item_id          int64
brand         category
brand_type    category
category      category
dtype: object

In [19]:
coupon_item_mapping1.isna().sum()

coupon_id     0
item_id       0
brand         0
brand_type    0
category      0
dtype: int64

In [20]:
coupon_item_mapping1.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,Local,Grocery
1,107,75,56,Local,Grocery
2,494,76,209,Established,Grocery
3,522,77,278,Established,Grocery
4,518,77,278,Established,Grocery


In [21]:
counts = coupon_item_mapping1['brand'].value_counts()
counts

56      13540
1        1919
686      1696
602      1621
278      1331
4700     1312
2088     1251
158      1146
1470     1004
1101      978
133       919
328       896
1262      881
673       835
1337      827
866       814
209       805
4412      768
967       724
989       720
1124      703
57        658
946       596
544       563
1075      561
681       543
1041      524
487       516
1587      501
982       485
        ...  
4293        2
4292        2
1269        2
4288        2
4287        2
4285        2
1287        2
4282        2
4283        2
4280        2
4272        2
4270        2
1341        2
4271        2
5472        2
4273        2
207         1
5383        1
427         1
4384        1
4554        1
1047        1
1736        1
2045        1
218         1
4983        1
2001        1
1820        1
5390        1
4395        1
Name: brand, Length: 2555, dtype: int64

In [22]:
coupon_item_mapping1['brand_value_counts'] = coupon_item_mapping1['brand'].map(counts)

In [23]:
coupon_item_mapping1.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category,brand_value_counts
0,105,37,56,Local,Grocery,13540
1,107,75,56,Local,Grocery,13540
2,494,76,209,Established,Grocery,805
3,522,77,278,Established,Grocery,1331
4,518,77,278,Established,Grocery,1331


In [24]:
cut_bins = [1, 1000, 10000, 20000]
pd.cut(coupon_item_mapping1['brand_value_counts'], bins=cut_bins).value_counts()

(1, 1000]         67829
(10000, 20000]    13540
(1000, 10000]     11280
Name: brand_value_counts, dtype: int64

In [25]:
cut_labels = ['small','medium','large']
coupon_item_mapping1['brand_bin'] = pd.cut(coupon_item_mapping1.brand_value_counts, 
                                                             bins=cut_bins, 
                                                             labels=cut_labels)

In [26]:
coupon_item_mapping1.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category,brand_value_counts,brand_bin
0,105,37,56,Local,Grocery,13540,large
1,107,75,56,Local,Grocery,13540,large
2,494,76,209,Established,Grocery,805,small
3,522,77,278,Established,Grocery,1331,medium
4,518,77,278,Established,Grocery,1331,medium


In [27]:
cat_value_counts = coupon_item_mapping1['category'].value_counts()
cat_value_counts

Grocery                   36466
Pharmaceutical            25061
Natural Products           6819
Meat                       6218
Packaged Meat              6144
Skin & Hair Care           4924
Seafood                    2227
Flowers & Plants           1963
Dairy, Juices & Snacks     1867
Garden                      286
Prepared Food               240
Miscellaneous               184
Bakery                      100
Salads                      100
Travel                       44
Vegetables (cut)             19
Restauarant                   1
Name: category, dtype: int64

In [28]:
coupon_item_mapping1['category_value_counts'] = coupon_item_mapping1['category'].map(cat_value_counts)

In [29]:
coupon_item_mapping1.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category,brand_value_counts,brand_bin,category_value_counts
0,105,37,56,Local,Grocery,13540,large,36466
1,107,75,56,Local,Grocery,13540,large,36466
2,494,76,209,Established,Grocery,805,small,36466
3,522,77,278,Established,Grocery,1331,medium,36466
4,518,77,278,Established,Grocery,1331,medium,36466


In [30]:
cat_cut_labels = ['others','pharmaceutical','grocery']
cat_cut_bins = [1, 6819, 25061, 36466]
coupon_item_mapping1['category_bin'] = pd.cut(coupon_item_mapping1.category_value_counts, 
                                              bins=cat_cut_bins, 
                                              labels=cat_cut_labels)

In [31]:
coupon_item_mapping1.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category,brand_value_counts,brand_bin,category_value_counts,category_bin
0,105,37,56,Local,Grocery,13540,large,36466,grocery
1,107,75,56,Local,Grocery,13540,large,36466,grocery
2,494,76,209,Established,Grocery,805,small,36466,grocery
3,522,77,278,Established,Grocery,1331,medium,36466,grocery
4,518,77,278,Established,Grocery,1331,medium,36466,grocery


In [32]:
coupon_item_mapping1 = coupon_item_mapping1.drop(columns=['brand','category','brand_value_counts','category_value_counts'])

In [33]:
coupon_item_mapping1.head()

Unnamed: 0,coupon_id,item_id,brand_type,brand_bin,category_bin
0,105,37,Local,large,grocery
1,107,75,Local,large,grocery
2,494,76,Established,small,grocery
3,522,77,Established,medium,grocery
4,518,77,Established,medium,grocery


### Feature engineering customer_transaction

In [34]:
print(ct.shape)
ct.head()

(1324566, 7)


Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [58]:
ct['date'] = pd.to_datetime(ct['date'], format='%Y-%m-%d')
# fetching day of the month and whether the day is a weekend or not
ct['dayofmonth'] = ct['date'].dt.day
ct['weekend'] = ct['date'].dt.dayofweek

In [59]:
#if weekend then 1 , 0 otherwise, (Monday=0, Sunday=6)
ct['weekend'] = (ct['weekend'] > 4).astype(int)

In [60]:
ct.tail()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend
1324561,2013-06-30,1129,2777,1,284.6,-71.24,0.0,30,1
1324562,2013-06-30,1129,2953,4,42.74,-28.5,0.0,30,1
1324563,2013-06-30,1129,2971,6,64.12,-42.74,0.0,30,1
1324564,2013-06-30,1129,46984,1,95.82,0.0,0.0,30,1
1324565,2013-06-30,1129,64498,2,489.78,0.0,0.0,30,1


In [85]:
ct['total_discount'] = ct['other_discount'] + ct['coupon_discount']
ct['cost_price'] = ct['selling_price'] - ct['total_discount']
ct['percent_discount'] = (ct['total_discount'] / ct['cost_price'])
ct.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,cost_price,percent_discount,coupon_disc_bin
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,67.32,-0.206328,0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,2,0,-14.25,120.75,-0.118012,0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,2,0,0.0,67.32,0.0,0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,2,0,-28.14,99.38,-0.283156,0


In [62]:
ct['coupon_disc_bin'] = ct['coupon_discount'].apply(lambda x:0 if x>=0 else 1)

In [86]:
ct[ct['customer_id']==464].head(1)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,cost_price,percent_discount,coupon_disc_bin
88,2012-01-02,464,5525,1,106.5,-35.62,-35.62,2,0,-71.24,177.74,-0.40081,1


### merging coupon_item_mapping1 into ct

In [87]:
print(coupon_item_mapping1.shape)
coupon_item_mapping1.head()

(92663, 5)


Unnamed: 0,coupon_id,item_id,brand_type,brand_bin,category_bin
0,105,37,Local,large,grocery
1,107,75,Local,large,grocery
2,494,76,Established,small,grocery
3,522,77,Established,medium,grocery
4,518,77,Established,medium,grocery


In [88]:
print(ct.shape)
ct.head()

(1324566, 13)


Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,cost_price,percent_discount,coupon_disc_bin
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,67.32,-0.206328,0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,2,0,-14.25,120.75,-0.118012,0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,2,0,0.0,67.32,0.0,0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,2,0,-28.14,99.38,-0.283156,0


In [89]:
print(len(coupon_item_mapping1.item_id.unique()))
print(len(ct.item_id.unique()))

36289
74063


In [90]:
ct1 = ct.merge(coupon_item_mapping1, on='item_id', how='left')
ct1.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,cost_price,percent_discount,coupon_disc_bin,coupon_id,brand_type,brand_bin,category_bin
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0,7.0,Local,large,others
1,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0,20.0,Local,large,others
2,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0,29.0,Local,large,others
3,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,67.32,-0.206328,0,7.0,Local,large,others
4,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,67.32,-0.206328,0,20.0,Local,large,others


In [91]:
ct1.shape

(2657495, 17)

In [92]:
len(ct1[ct1.duplicated()])

6608

In [109]:
ct1 = ct1[ct1.duplicated()==False]
ct1.shape

(2650887, 17)

In [111]:
ct1.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,cost_price,percent_discount,coupon_disc_bin,coupon_id,brand_type,brand_bin,category_bin
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0,7.0,Local,large,others
1,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0,20.0,Local,large,others
2,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,45.95,-0.232644,0,29.0,Local,large,others
3,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,67.32,-0.206328,0,7.0,Local,large,others
4,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,67.32,-0.206328,0,20.0,Local,large,others


In [124]:
### group all columns in ct1 wrt unique customer_id
ct2=ct1.merge(ct1.groupby(['customer_id']).agg({
                                                'item_id':'count','quantity':sum,'selling_price':sum,
                                                'other_discount':sum,'coupon_discount':sum,'dayofmonth':'count',
                                                'weekend':'count','total_discount':sum,'cost_price':sum,
                                                'percent_discount':sum,'coupon_disc_bin':sum,'coupon_id':'count',
                                                'brand_type':'count','brand_bin':'count','category_bin':'count',
                                                'date':'first'
                                                }).reset_index(),on=['customer_id'],how='left')

In [125]:
ct2.head()

Unnamed: 0,date_x,customer_id,item_id_x,quantity_x,selling_price_x,other_discount_x,coupon_discount_x,dayofmonth_x,weekend_x,total_discount_x,...,weekend_y,total_discount_y,cost_price_y,percent_discount_y,coupon_disc_bin_y,coupon_id_y,brand_type_y,brand_bin_y,category_bin_y,date_y
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,...,678,-14958.42,102473.02,-87.778425,39,529,529,529,529,2012-01-02
1,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,...,678,-14958.42,102473.02,-87.778425,39,529,529,529,529,2012-01-02
2,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,...,678,-14958.42,102473.02,-87.778425,39,529,529,529,529,2012-01-02
3,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,...,678,-14958.42,102473.02,-87.778425,39,529,529,529,529,2012-01-02
4,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,...,678,-14958.42,102473.02,-87.778425,39,529,529,529,529,2012-01-02


In [126]:
ct2.shape

(2650887, 33)

In [121]:
ct2.reset_index(inplace=True)

In [123]:
ct2.head()

Unnamed: 0,index,date,customer_id,item_id_x,quantity_x,selling_price_x,other_discount_x,coupon_discount_x,dayofmonth_x,weekend_x,...,dayofmonth_y,weekend_y,total_discount_y,cost_price_y,percent_discount_y,coupon_disc_bin_y,coupon_id_y,brand_type_y,brand_bin_y,category_bin_y
0,0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,...,678,678,-14958.42,102473.02,-87.778425,39,529,529,529,529
1,1,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,...,678,678,-14958.42,102473.02,-87.778425,39,529,529,529,529
2,2,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,...,678,678,-14958.42,102473.02,-87.778425,39,529,529,529,529
3,3,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,...,678,678,-14958.42,102473.02,-87.778425,39,529,529,529,529
4,4,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,...,678,678,-14958.42,102473.02,-87.778425,39,529,529,529,529


In [None]:
#rename(                                             
#                                                columns={                                                
#                                                    'quantity':'quantity_count',
#                                                    'percent_discount':'percent_discount_sum',
#                                                    'coupon_id':'coupon_id_count',
#                                                    'item_id':'item_id_count'})

In [98]:
ct2[(ct2['customer_id']==1052)]

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,...,percent_discount,coupon_disc_bin,coupon_id,brand_type,brand_bin,category_bin,quantity_count,percent_discount_sum,coupon_id_count,item_id_count
184949,2012-04-05,1052,6018,1,106.50,0.00,0.00,5,0,0.00,...,0.000000,0,7.0,Local,large,others,32,-6.072291,26,32
184950,2012-04-05,1052,6018,1,106.50,0.00,0.00,5,0,0.00,...,0.000000,0,20.0,Local,large,others,32,-6.072291,26,32
184951,2012-04-05,1052,6018,1,106.50,0.00,0.00,5,0,0.00,...,0.000000,0,29.0,Local,large,others,32,-6.072291,26,32
184952,2012-04-05,1052,6640,1,89.05,-45.95,0.00,5,0,-45.95,...,-0.340370,0,24.0,Established,small,others,32,-6.072291,26,32
184953,2012-04-05,1052,6640,1,89.05,-45.95,0.00,5,0,-45.95,...,-0.340370,0,8.0,Established,small,others,32,-6.072291,26,32
184954,2012-04-05,1052,6640,1,89.05,-45.95,0.00,5,0,-45.95,...,-0.340370,0,33.0,Established,small,others,32,-6.072291,26,32
184955,2012-04-05,1052,10975,1,35.62,-10.33,0.00,5,0,-10.33,...,-0.224810,0,,,,,32,-6.072291,26,32
184956,2012-04-05,1052,13774,1,35.62,-31.70,0.00,5,0,-31.70,...,-0.470885,0,9.0,Local,large,grocery,32,-6.072291,26,32
184957,2012-04-05,1052,13774,1,35.62,-31.70,0.00,5,0,-31.70,...,-0.470885,0,21.0,Local,large,grocery,32,-6.072291,26,32
184958,2012-04-05,1052,13774,1,35.62,-31.70,0.00,5,0,-31.70,...,-0.470885,0,30.0,Local,large,grocery,32,-6.072291,26,32


In [100]:
ct2.isna().sum()

date                         0
customer_id                  0
item_id                      0
quantity                     0
selling_price                0
other_discount               0
coupon_discount              0
dayofmonth                   0
weekend                      0
total_discount               0
cost_price                   0
percent_discount             0
coupon_disc_bin              0
coupon_id               555178
brand_type              555178
brand_bin               555272
category_bin            555180
quantity_count               0
percent_discount_sum         0
coupon_id_count              0
item_id_count                0
dtype: int64

In [None]:
ct3 = ct2.groupby(['customer_id']).agg({'item_id':'count',
                                       })

In [None]:
#ct2 = ct1.drop(columns=['selling_price','other_discount','coupon_discount',''])

### merging ct2 in train2

In [106]:
print(ct2.shape)
ct2.head()

(2650887, 21)


Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,dayofmonth,weekend,total_discount,...,percent_discount,coupon_disc_bin,coupon_id,brand_type,brand_bin,category_bin,quantity_count,percent_discount_sum,coupon_id_count,item_id_count
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,...,-0.232644,0,7.0,Local,large,others,12,-2.567551,11,12
1,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,...,-0.232644,0,20.0,Local,large,others,12,-2.567551,11,12
2,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,0,-10.69,...,-0.232644,0,29.0,Local,large,others,12,-2.567551,11,12
3,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,...,-0.206328,0,7.0,Local,large,others,12,-2.567551,11,12
4,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,0,-13.89,...,-0.206328,0,20.0,Local,large,others,12,-2.567551,11,12


In [104]:
print(len(train2.customer_id.unique()))
print(len(ct2.customer_id.unique()))

1582
1582


In [None]:
#df=df.merge(trans,on=['customer_id'],how='left')