In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
train=pd.read_csv('train.csv')
campaign_data=pd.read_csv('campaign_data.csv')
coupon_item_mapping=pd.read_csv('coupon_item_mapping.csv')
customer_demographics=pd.read_csv('customer_demographics.csv')
customer_transaction_data=pd.read_csv('customer_transaction_data.csv')
item_data=pd.read_csv('item_data.csv')
test=pd.read_csv('test.csv')

# DATA CLEANING

## TRAIN

In [3]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [4]:
train.shape

(78369, 5)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78369 entries, 0 to 78368
Data columns (total 5 columns):
id                   78369 non-null int64
campaign_id          78369 non-null int64
coupon_id            78369 non-null int64
customer_id          78369 non-null int64
redemption_status    78369 non-null int64
dtypes: int64(5)
memory usage: 3.0 MB


## CAMPAIGN_DATA

In [6]:
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,21/10/13,20/12/13
1,25,Y,21/10/13,22/11/13
2,20,Y,07/09/13,16/11/13
3,23,Y,08/10/13,15/11/13
4,21,Y,16/09/13,18/10/13


In [7]:
campaign_data.shape

(28, 4)

In [8]:
campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null object
start_date       28 non-null object
end_date         28 non-null object
dtypes: int64(1), object(3)
memory usage: 1.0+ KB


In [9]:
le=LabelEncoder()
campaign_data['campaign_type']=le.fit_transform(campaign_data['campaign_type'])

In [10]:
campaign_data['start_date']=pd.to_datetime(campaign_data['start_date'])

In [11]:
campaign_data['end_date']=pd.to_datetime(campaign_data['end_date'])

In [12]:
campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
campaign_id      28 non-null int64
campaign_type    28 non-null int32
start_date       28 non-null datetime64[ns]
end_date         28 non-null datetime64[ns]
dtypes: datetime64[ns](2), int32(1), int64(1)
memory usage: 912.0 bytes


## COUPON ITEM MAPPING

In [13]:
coupon_item_mapping.head()

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [14]:
coupon_item_mapping.shape

(92663, 2)

## CUSTOMER DEMOGRAPHICS

In [15]:
customer_demographics.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [16]:
customer_demographics.shape

(760, 7)

In [17]:
customer_demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
customer_id       760 non-null int64
age_range         760 non-null object
marital_status    431 non-null object
rented            760 non-null int64
family_size       760 non-null object
no_of_children    222 non-null object
income_bracket    760 non-null int64
dtypes: int64(3), object(4)
memory usage: 41.7+ KB


In [18]:
customer_demographics['no_of_children'].unique()

array([nan, '1', '2', '3+'], dtype=object)

In [19]:
customer_demographics['family_size'].unique()

array(['2', '3', '4', '1', '5+'], dtype=object)

In [20]:
customer_demographics['marital_status'].isnull().sum()

329

In [21]:
customer_demographics['age_range'].unique()

array(['70+', '46-55', '26-35', '36-45', '18-25', '56-70'], dtype=object)

In [22]:
customer_demographics['no_of_children'] = customer_demographics['no_of_children'].replace('3+', 3).astype(float)
customer_demographics['family_size'] = customer_demographics['family_size'].replace('5+', 3).astype(float)
customer_demographics['marital_status'] = pd.Series(customer_demographics['marital_status'].factorize()[0]).replace(-1, np.nan)
customer_demographics['age_range'] = pd.Series(customer_demographics['age_range'].factorize()[0]).replace(-1, np.nan)

In [23]:
customer_demographics.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,0,0.0,0,2.0,,4
1,6,1,0.0,0,2.0,,5
2,7,2,,0,3.0,1.0,3
3,8,2,,0,4.0,2.0,6
4,10,1,1.0,0,1.0,,5


In [24]:
customer_demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
customer_id       760 non-null int64
age_range         760 non-null int64
marital_status    431 non-null float64
rented            760 non-null int64
family_size       760 non-null float64
no_of_children    222 non-null float64
income_bracket    760 non-null int64
dtypes: float64(3), int64(4)
memory usage: 41.7 KB


## CUSTOMER TRANSACTION DETAILS

In [25]:
customer_transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


In [26]:
customer_transaction_data.shape

(1324566, 7)

In [27]:
customer_transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
date               1324566 non-null object
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
dtypes: float64(3), int64(3), object(1)
memory usage: 70.7+ MB


In [28]:
customer_transaction_data['date'] = pd.to_datetime(customer_transaction_data['date'])

In [29]:
customer_transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
date               1324566 non-null datetime64[ns]
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 70.7 MB


## ITEM DATA

In [30]:
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [31]:
item_data.shape

(74066, 4)

In [32]:
item_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74066 entries, 0 to 74065
Data columns (total 4 columns):
item_id       74066 non-null int64
brand         74066 non-null int64
brand_type    74066 non-null object
category      74066 non-null object
dtypes: int64(2), object(2)
memory usage: 2.3+ MB


In [33]:
item_data['brand_type']=le.fit_transform(item_data['brand_type'])

In [34]:
item_data['category']=le.fit_transform(item_data['category'])

## TEST

In [35]:
test.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id
0,3,22,869,967
1,4,20,389,1566
2,5,22,981,510
3,8,25,1069,361
4,10,17,498,811


In [36]:
test.shape

(50226, 4)

# DATA PREPARATION

In [37]:
train['type']='train'
test['type']='test'

In [38]:
data=pd.concat([train,test],sort=False).reset_index(drop=True)

In [39]:
data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,type
0,1,13,27,1053,0.0,train
1,2,13,116,48,0.0,train
2,6,9,635,205,0.0,train
3,7,13,644,1050,0.0,train
4,9,8,1017,1489,0.0,train


In [40]:
data=data.merge(campaign_data,on='campaign_id')
data['start_date'] = pd.to_datetime(data['start_date'], dayfirst=True)
data['end_date'] = pd.to_datetime(data['end_date'], dayfirst=True)
data['campaign_type'] = pd.Series(data['campaign_type'].factorize()[0]).replace(-1, np.nan)

In [41]:
data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,type,campaign_type,start_date,end_date
0,1,13,27,1053,0.0,train,0,2013-05-19,2013-05-07
1,2,13,116,48,0.0,train,0,2013-05-19,2013-05-07
2,7,13,644,1050,0.0,train,0,2013-05-19,2013-05-07
3,21,13,1028,89,0.0,train,0,2013-05-19,2013-05-07
4,23,13,517,1067,0.0,train,0,2013-05-19,2013-05-07


In [42]:
data.shape

(128595, 9)

In [43]:
rented_mean_by_id = customer_demographics.groupby("customer_id")['rented'].mean().to_dict()

In [44]:
rented_mean_by_id

{1: 0,
 6: 0,
 7: 0,
 8: 0,
 10: 0,
 11: 0,
 12: 0,
 13: 0,
 14: 1,
 15: 0,
 17: 0,
 19: 0,
 22: 0,
 27: 0,
 28: 0,
 30: 0,
 31: 0,
 33: 0,
 35: 0,
 36: 0,
 38: 0,
 39: 0,
 40: 0,
 41: 0,
 42: 0,
 45: 0,
 48: 0,
 51: 0,
 52: 0,
 53: 0,
 55: 0,
 58: 0,
 59: 0,
 66: 0,
 67: 0,
 69: 0,
 71: 0,
 72: 0,
 74: 0,
 75: 0,
 78: 0,
 79: 0,
 82: 0,
 83: 0,
 84: 0,
 85: 0,
 87: 0,
 89: 0,
 90: 0,
 92: 0,
 93: 0,
 94: 0,
 97: 0,
 103: 0,
 105: 0,
 107: 0,
 108: 0,
 110: 0,
 112: 0,
 113: 0,
 114: 0,
 119: 0,
 123: 0,
 124: 0,
 128: 0,
 131: 0,
 132: 0,
 134: 0,
 135: 0,
 136: 0,
 138: 0,
 140: 0,
 141: 0,
 142: 0,
 143: 0,
 144: 0,
 149: 0,
 150: 0,
 151: 0,
 153: 1,
 154: 0,
 155: 0,
 158: 0,
 159: 1,
 161: 0,
 162: 0,
 163: 0,
 167: 0,
 168: 0,
 174: 0,
 178: 0,
 179: 0,
 180: 0,
 185: 0,
 186: 0,
 189: 0,
 191: 0,
 192: 0,
 193: 0,
 195: 0,
 197: 0,
 201: 0,
 202: 0,
 204: 0,
 205: 0,
 207: 0,
 209: 0,
 212: 0,
 214: 0,
 223: 0,
 225: 0,
 226: 0,
 227: 0,
 228: 0,
 230: 0,
 231: 0,
 232: 0,
 235

In [45]:
data['rented_mean'] = data['customer_id'].map(rented_mean_by_id)

In [46]:
data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,type,campaign_type,start_date,end_date,rented_mean
0,1,13,27,1053,0.0,train,0,2013-05-19,2013-05-07,0.0
1,2,13,116,48,0.0,train,0,2013-05-19,2013-05-07,0.0
2,7,13,644,1050,0.0,train,0,2013-05-19,2013-05-07,
3,21,13,1028,89,0.0,train,0,2013-05-19,2013-05-07,0.0
4,23,13,517,1067,0.0,train,0,2013-05-19,2013-05-07,0.0


In [47]:
data.shape

(128595, 10)

In [48]:
income_bracket_sum = customer_demographics.groupby("customer_id")['income_bracket'].sum().to_dict()

In [49]:
income_bracket_sum

{1: 4,
 6: 5,
 7: 3,
 8: 6,
 10: 5,
 11: 1,
 12: 7,
 13: 2,
 14: 6,
 15: 6,
 17: 5,
 19: 3,
 22: 4,
 27: 8,
 28: 1,
 30: 5,
 31: 2,
 33: 9,
 35: 4,
 36: 4,
 38: 5,
 39: 4,
 40: 7,
 41: 4,
 42: 9,
 45: 1,
 48: 3,
 51: 2,
 52: 7,
 53: 3,
 55: 5,
 58: 4,
 59: 4,
 66: 6,
 67: 4,
 69: 1,
 71: 4,
 72: 1,
 74: 2,
 75: 4,
 78: 8,
 79: 4,
 82: 4,
 83: 4,
 84: 8,
 85: 3,
 87: 4,
 89: 3,
 90: 5,
 92: 1,
 93: 3,
 94: 5,
 97: 6,
 103: 4,
 105: 5,
 107: 2,
 108: 5,
 110: 6,
 112: 4,
 113: 4,
 114: 5,
 119: 3,
 123: 4,
 124: 6,
 128: 1,
 131: 5,
 132: 3,
 134: 4,
 135: 1,
 136: 1,
 138: 5,
 140: 2,
 141: 9,
 142: 4,
 143: 2,
 144: 4,
 149: 3,
 150: 4,
 151: 12,
 153: 5,
 154: 6,
 155: 5,
 158: 8,
 159: 1,
 161: 1,
 162: 5,
 163: 3,
 167: 4,
 168: 6,
 174: 4,
 178: 5,
 179: 4,
 180: 5,
 185: 7,
 186: 4,
 189: 5,
 191: 1,
 192: 4,
 193: 4,
 195: 5,
 197: 4,
 201: 6,
 202: 6,
 204: 4,
 205: 7,
 207: 5,
 209: 6,
 212: 8,
 214: 1,
 223: 5,
 225: 5,
 226: 5,
 227: 9,
 228: 4,
 230: 5,
 231: 5,
 232: 5,
 23

In [50]:
data['income_bracket_sum'] = data['customer_id'].map(income_bracket_sum)

In [51]:
data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,type,campaign_type,start_date,end_date,rented_mean,income_bracket_sum
0,1,13,27,1053,0.0,train,0,2013-05-19,2013-05-07,0.0,5.0
1,2,13,116,48,0.0,train,0,2013-05-19,2013-05-07,0.0,3.0
2,7,13,644,1050,0.0,train,0,2013-05-19,2013-05-07,,
3,21,13,1028,89,0.0,train,0,2013-05-19,2013-05-07,0.0,3.0
4,23,13,517,1067,0.0,train,0,2013-05-19,2013-05-07,0.0,5.0


In [52]:
age_range_mean = customer_demographics.groupby("customer_id")['age_range'].mean().to_dict()
data['age_range_mean'] = data['customer_id'].map(age_range_mean)

In [53]:
family_size_mean = customer_demographics.groupby("customer_id")['family_size'].mean().to_dict()
data['family_size_mean'] = data['customer_id'].map(family_size_mean)

In [54]:
no_of_children_mean = customer_demographics.groupby("customer_id")['no_of_children'].mean().to_dict()
data['no_of_children_mean'] = data['customer_id'].map(no_of_children_mean)

In [55]:
no_of_children_count = customer_demographics.groupby("customer_id")['no_of_children'].count().to_dict()
data['no_of_children_count'] = data['customer_id'].map(no_of_children_count)

In [56]:
marital_status_count = customer_demographics.groupby("customer_id")['marital_status'].count().to_dict()
data['marital_status_count'] = data['customer_id'].map(marital_status_count)

In [57]:
data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,type,campaign_type,start_date,end_date,rented_mean,income_bracket_sum,age_range_mean,family_size_mean,no_of_children_mean,no_of_children_count,marital_status_count
0,1,13,27,1053,0.0,train,0,2013-05-19,2013-05-07,0.0,5.0,1.0,1.0,,0.0,0.0
1,2,13,116,48,0.0,train,0,2013-05-19,2013-05-07,0.0,3.0,3.0,2.0,,0.0,1.0
2,7,13,644,1050,0.0,train,0,2013-05-19,2013-05-07,,,,,,,
3,21,13,1028,89,0.0,train,0,2013-05-19,2013-05-07,0.0,3.0,1.0,1.0,,0.0,0.0
4,23,13,517,1067,0.0,train,0,2013-05-19,2013-05-07,0.0,5.0,1.0,1.0,,0.0,0.0


In [58]:
data.shape

(128595, 16)

In [59]:
customer_transaction_data['date'] = pd.to_datetime(customer_transaction_data['date'])

In [60]:
quantity_mean = customer_transaction_data.groupby("customer_id")['quantity'].mean().to_dict()
data['quantity_mean'] = data['customer_id'].map(quantity_mean)

In [61]:
coupon_discount_mean = customer_transaction_data.groupby("customer_id")['coupon_discount'].mean().to_dict()
data['coupon_discount_mean'] = data['customer_id'].map(coupon_discount_mean)

In [62]:
other_discount_mean = customer_transaction_data.groupby("customer_id")['other_discount'].mean().to_dict()
data['other_discount_mean'] = data['customer_id'].map(other_discount_mean)

In [63]:
customer_transaction_data['day'] = customer_transaction_data.date.dt.day
date_day_mean = customer_transaction_data.groupby("customer_id")['day'].mean().to_dict()
data['date_day_mean'] = data['customer_id'].map(date_day_mean)

In [64]:
#MERGING TABLES COUNPON ITEM MAPPING AND ITEM DATA
coupon_item_mapping = coupon_item_mapping.merge(item_data, how = 'left', on = 'item_id')
coupon_item_mapping['brand_type'] = pd.Series(coupon_item_mapping['brand_type'].factorize()[0]).replace(-1, np.nan)
coupon_item_mapping['category'] = pd.Series(coupon_item_mapping['category'].factorize()[0]).replace(-1, np.nan)

In [65]:
category = coupon_item_mapping.groupby("coupon_id")['category'].mean().to_dict()
data['category_mean'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].count().to_dict()
data['category_count'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].nunique().to_dict()
data['category_nunique'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].max().to_dict()
data['category_max'] = data['coupon_id'].map(category)
category = coupon_item_mapping.groupby("coupon_id")['category'].min().to_dict()
data['category_min'] = data['coupon_id'].map(category)

In [66]:
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].mean().to_dict()
data['brand_mean'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].count().to_dict()
data['brand_count'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].min().to_dict()
data['brand_min'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].max().to_dict()
data['brand_max'] = data['coupon_id'].map(brand_mean)
brand_mean = coupon_item_mapping.groupby("coupon_id")['brand'].nunique().to_dict()
data['brand_nunique'] = data['coupon_id'].map(brand_mean)

In [67]:
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].mean().to_dict()
data['selling_price_mean'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].sum().to_dict()
data['selling_price_sum'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].min().to_dict()
data['selling_price_min'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].max().to_dict()
data['selling_price_max'] = data['customer_id'].map(selling_price_mean)
selling_price_mean = customer_transaction_data.groupby("customer_id")['selling_price'].nunique().to_dict()
data['selling_price_nunique'] = data['customer_id'].map(selling_price_mean)

In [68]:
data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,type,campaign_type,start_date,end_date,rented_mean,...,brand_mean,brand_count,brand_min,brand_max,brand_nunique,selling_price_mean,selling_price_sum,selling_price_min,selling_price_max,selling_price_nunique
0,1,13,27,1053,0.0,train,0,2013-05-19,2013-05-07,0.0,...,1364.128,125,1105,1636,2,184.260484,57120.75,17.45,5164.54,129
1,2,13,116,48,0.0,train,0,2013-05-19,2013-05-07,0.0,...,56.0,3,56,56,1,234.247013,90185.1,7.12,1758.92,114
2,7,13,644,1050,0.0,train,0,2013-05-19,2013-05-07,,...,611.0,4,611,611,1,98.276034,23291.42,13.89,708.48,84
3,21,13,1028,89,0.0,train,0,2013-05-19,2013-05-07,0.0,...,1639.0,6,1639,1639,1,115.576332,77204.99,10.33,1246.7,186
4,23,13,517,1067,0.0,train,0,2013-05-19,2013-05-07,0.0,...,261.0,3,261,261,1,115.829742,112354.85,3.56,1905.31,186


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128595 entries, 0 to 128594
Data columns (total 35 columns):
id                       128595 non-null int64
campaign_id              128595 non-null int64
coupon_id                128595 non-null int64
customer_id              128595 non-null int64
redemption_status        78369 non-null float64
type                     128595 non-null object
campaign_type            128595 non-null int64
start_date               128595 non-null datetime64[ns]
end_date                 128595 non-null datetime64[ns]
rented_mean              74600 non-null float64
income_bracket_sum       74600 non-null float64
age_range_mean           74600 non-null float64
family_size_mean         74600 non-null float64
no_of_children_mean      24218 non-null float64
no_of_children_count     74600 non-null float64
marital_status_count     74600 non-null float64
quantity_mean            128595 non-null float64
coupon_discount_mean     128595 non-null float64
other_discou

In [70]:
data.shape

(128595, 35)

In [71]:
data.isnull().sum()

id                            0
campaign_id                   0
coupon_id                     0
customer_id                   0
redemption_status         50226
type                          0
campaign_type                 0
start_date                    0
end_date                      0
rented_mean               53995
income_bracket_sum        53995
age_range_mean            53995
family_size_mean          53995
no_of_children_mean      104377
no_of_children_count      53995
marital_status_count      53995
quantity_mean                 0
coupon_discount_mean          0
other_discount_mean           0
date_day_mean                 0
category_mean                 0
category_count                0
category_nunique              0
category_max                  0
category_min                  0
brand_mean                    0
brand_count                   0
brand_min                     0
brand_max                     0
brand_nunique                 0
selling_price_mean            0
selling_

In [72]:
data[['rented_mean','income_bracket_sum','age_range_mean','family_size_mean'
      ,'no_of_children_mean','marital_status_count']]=data[['rented_mean','income_bracket_sum','age_range_mean','family_size_mean'
      ,'no_of_children_mean','marital_status_count']].fillna(0)

In [73]:
data.isnull().sum()

id                           0
campaign_id                  0
coupon_id                    0
customer_id                  0
redemption_status        50226
type                         0
campaign_type                0
start_date                   0
end_date                     0
rented_mean                  0
income_bracket_sum           0
age_range_mean               0
family_size_mean             0
no_of_children_mean          0
no_of_children_count     53995
marital_status_count         0
quantity_mean                0
coupon_discount_mean         0
other_discount_mean          0
date_day_mean                0
category_mean                0
category_count               0
category_nunique             0
category_max                 0
category_min                 0
brand_mean                   0
brand_count                  0
brand_min                    0
brand_max                    0
brand_nunique                0
selling_price_mean           0
selling_price_sum            0
selling_

In [74]:
data['no_of_children_count']=data['no_of_children_count'].fillna(0)

In [75]:
from sklearn.preprocessing import StandardScaler
data[['campaign_id','coupon_id','campaign_type','rented_mean','income_bracket_sum','age_range_mean','family_size_mean',
 'no_of_children_mean',
 'no_of_children_count',
 'marital_status_count',
 'quantity_mean',
 'coupon_discount_mean',
 'other_discount_mean',
 'date_day_mean',
 'category_mean',
 'category_nunique',
 'category_max',
 'category_min',
 'brand_mean',
 'brand_max',
 'brand_nunique',
 'selling_price_mean',
 'selling_price_min',
 'selling_price_nunique']]=StandardScaler.fit_transform(data[['campaign_id','coupon_id','campaign_type','rented_mean','income_bracket_sum','age_range_mean','family_size_mean',
 'no_of_children_mean',
 'no_of_children_count',
 'marital_status_count',
 'quantity_mean',
 'coupon_discount_mean',
 'other_discount_mean',
 'date_day_mean',
 'category_mean',
 'category_nunique',
 'category_max',
 'category_min',
 'brand_mean',
 'brand_max',
 'brand_nunique',
 'selling_price_mean',
 'selling_price_min',
 'selling_price_nunique']])

TypeError: fit_transform() missing 1 required positional argument: 'X'

In [None]:
df=data[['campaign_id','coupon_id','campaign_type','rented_mean','income_bracket_sum','age_range_mean','family_size_mean',
 'no_of_children_mean',
 'no_of_children_count',
 'marital_status_count',
 'quantity_mean',
 'coupon_discount_mean',
 'other_discount_mean',
 'date_day_mean',
 'category_mean',
 'category_nunique',
 'category_max',
 'category_min',
 'brand_mean',
 'brand_max',
 'brand_nunique',
 'selling_price_mean',
 'selling_price_min',
 'selling_price_nunique','type','redemption_status']]

In [None]:
train = df.loc[df['type']=='train']
test = df.loc[df['type']=='test']

In [None]:
train.head()

In [None]:
train.isnull().sum()

In [None]:
X_train=train.drop(['type','redemption_status'],axis=1)
Y_train=train['redemption_status']
X_test=test.drop(['type','redemption_status'],axis=1)

## WHICH MODEL TO USE?????

X=X_train
Y=Y_train

from sklearn.model_selection import StratifiedKFold, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

cv=StratifiedKFold(n_splits=5,random_state=5)

models = []
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Naive Bayes', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('Bagging', BaggingClassifier()))
models.append(('AdaBoost', AdaBoostClassifier()))
models.append(('Gradient Boosting', GradientBoostingClassifier()))
models.append(('Logistic Regression', LogisticRegression()))
models.append(('MLP', MLPClassifier ( max_iter=1000)))

results = []
names = []
final_scores=[]
for name, model in models:
    accuracy=cross_val_score(model, X, Y,cv=cv)
    results.append(accuracy)
    names.append(name)
    score_mean = "%s: %f" % (name, accuracy.mean())
    final_scores.append(score_mean)
final_scores

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [None]:
lr=LogisticRegression( penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='warn',
    max_iter=1000,
    multi_class='warn',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None)
re=RandomForestClassifier()
xgb=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=40, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1)

In [None]:
lr.fit(X_train,Y_train)

In [None]:
Y_pred=lr.predict(X_test)

In [None]:
smp_sub=pd.read_csv('sample_submission.csv')

In [None]:
sub={'id':smp_sub['id'],'redemption_status':Y_pred}

In [None]:
sub1=pd.DataFrame(sub)

In [None]:
sub1['redemption_status']=sub1['redemption_status'].astype(int)

In [None]:
sub1.to_csv('my_submission_lr_1.csv',index=False)