In [1]:
import pandas as pd
import numpy as np
import ast
# Raise the column display limit to 200 so wide frames are shown in full.
pd.set_option("display.max_columns", 200)

In [2]:
# Load the labelled training data (path is relative to the notebook directory).
train = pd.read_csv("../data/train_go05W65.csv")

In [3]:
# Load the test data; per the preview below it has no Product_Holding_B2 column.
test = pd.read_csv("../data/test_VkM91FT.csv")

In [4]:
# Stack train and test into one frame so feature engineering is applied to both.
# Test rows get NaN in Product_Holding_B2, which is later used to split them out.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([train, test], ignore_index=True)

In [5]:
# The product-holding columns are read as string reprs of Python lists;
# parse them into real lists. B2 is missing for test rows, so those stay None.
train["Product_Holding_B1"] = train["Product_Holding_B1"].apply(ast.literal_eval)
train["Product_Holding_B2"] = train["Product_Holding_B2"].apply(
    lambda raw: None if pd.isnull(raw) else ast.literal_eval(raw)
)

In [6]:
# Drop the two-character prefix (e.g. "CC") and keep the numeric part of the ID.
train["Customer_ID"] = train["Customer_ID"].str[2:].astype(int)

In [7]:
# Dimensions of the combined train + test frame.
train.shape

(58075, 9)

In [8]:
# Preview the combined frame after parsing the product lists and IDs.
train.head(2)

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2
0,264719,Male,41,14,0,C1,S3,[P16],[P8]
1,209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3]


In [9]:
# Preview the raw test frame (still unparsed: string IDs and string product lists).
test.head(5)

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1
0,CC372708,Female,31,31,0,C2,S3,"['P12', 'P13']"
1,CC216072,Male,28,37,1,C1,S2,"['P12', 'P13']"
2,CC387629,Male,31,12,0,C2,S3,['P20']
3,CC389228,Female,55,11,0,C2,S2,"['P13', 'P21']"
4,CC394445,Male,51,49,1,C2,S1,['P13']


In [10]:
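# Bucket customers by Customer_ID (roughly 1000 consecutive IDs per bucket) and
# find the most popular products within each bucket.
# Assumption: IDs are issued chronologically, so each bucket approximates a time period.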

In [11]:
# Bucket key: the customer ID divided by 1000 and rounded to the nearest integer.
train["Customer_ID_bucket"] = train["Customer_ID"].apply(
    lambda customer_id: round(customer_id / 1000)
)

In [12]:
# For every customer-ID bucket, collect its five most frequently held B1
# products, most popular first.
#
# The exploded frame still carries Customer_ID_bucket, so we group by the column
# directly instead of by a Series taken from the un-exploded frame. The result
# column is named explicitly rather than relying on the auto-generated
# "level_1" label from reset_index(), which is not produced on pandas >= 2.0
# (value_counts() names its index there).
customer_popularity = (
    train.explode("Product_Holding_B1")
    .groupby("Customer_ID_bucket")["Product_Holding_B1"]
    .apply(lambda held: held.value_counts().nlargest(5).index.tolist())
    .rename("customer_bucket_popular_products")
    .reset_index()
)

In [13]:
# Give the per-bucket product list a descriptive column name.
popular_column_names = {"level_1": "customer_bucket_popular_products"}
customer_popularity = customer_popularity.rename(columns=popular_column_names)

In [14]:
# Preview: one row per bucket with its list of popular products.
customer_popularity.head(2)

Unnamed: 0,Customer_ID_bucket,customer_bucket_popular_products
0,199,"[P13, P12, P17, P20, P21]"
1,200,"[P13, P12, P17, P21, P20]"


In [15]:
# Attach each bucket's popular-product list to every customer in that bucket
# (default inner join on the bucket key).
train = pd.merge(train, customer_popularity, on="Customer_ID_bucket")

In [16]:
# Preview the frame with the bucket and popular-product columns attached.
train.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,Customer_ID_bucket,customer_bucket_popular_products
0,264719,Male,41,14,0,C1,S3,[P16],[P8],265,"[P13, P16, P12, P21, P17]"
1,264929,Male,35,15,0,C2,S1,[P16],[P8],265,"[P13, P16, P12, P21, P17]"
2,265293,Male,28,14,0,C2,S1,[P16],[P8],265,"[P13, P16, P12, P21, P17]"
3,264895,Male,40,18,0,C1,S2,"[P12, P13, P17, P19, P21]","[P7, P8, P9, P10]",265,"[P13, P16, P12, P21, P17]"
4,265180,Male,37,16,0,C2,S3,"[P16, P17, P19]","[P8, P12]",265,"[P13, P16, P12, P21, P17]"


In [17]:
# Encode gender numerically: Male -> 0, Female -> 1 (any other value becomes NaN).
gender_codes = {"Male": 0, "Female": 1}
train["Gender"] = train["Gender"].map(gender_codes)

In [18]:
# get dummies

In [19]:
# Multi-hot encode each customer's B1 holdings: explode to one product per row,
# dummy-encode, then sum the rows back together per original row index.
# - groupby(level=0).sum() replaces .sum(level=0), which was removed in pandas 2.0.
# - explode() replaces the much slower apply(pd.Series).stack() and keeps rows
#   whose list is empty (they become all-zero rows instead of disappearing).
df_product_holding_b1_dumies = (
    pd.get_dummies(train["Product_Holding_B1"].explode())
    .groupby(level=0)
    .sum()
)

In [20]:
# Multi-hot encode the bucket-level popular products: explode to one product per
# row, dummy-encode, then sum the rows back together per original row index.
# - groupby(level=0).sum() replaces .sum(level=0), which was removed in pandas 2.0.
# - explode() replaces the much slower apply(pd.Series).stack() and keeps rows
#   whose list is empty (they become all-zero rows instead of disappearing).
df_customer_popular_products_dumies = (
    pd.get_dummies(train["customer_bucket_popular_products"].explode())
    .groupby(level=0)
    .sum()
)

In [21]:
# Prefix the product codes so the two indicator sets do not collide once they
# are concatenated onto the main frame.
df_product_holding_b1_dumies = df_product_holding_b1_dumies.add_prefix("product_holding_")
df_customer_popular_products_dumies = df_customer_popular_products_dumies.add_prefix("customer_holding_")

In [22]:
# Sanity check: the row count should match the main frame.
df_product_holding_b1_dumies.shape

(58075, 22)

In [23]:
# Preview the per-customer B1 product indicators.
df_product_holding_b1_dumies.head(2)

Unnamed: 0,product_holding_P00,product_holding_P1,product_holding_P10,product_holding_P11,product_holding_P12,product_holding_P13,product_holding_P14,product_holding_P15,product_holding_P16,product_holding_P17,product_holding_P18,product_holding_P19,product_holding_P2,product_holding_P20,product_holding_P21,product_holding_P3,product_holding_P4,product_holding_P5,product_holding_P6,product_holding_P7,product_holding_P8,product_holding_P9
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Sanity check: the row count should match the main frame.
df_customer_popular_products_dumies.shape

(58075, 15)

In [25]:
# Preview the bucket-level popular-product indicators.
df_customer_popular_products_dumies.head(2)

Unnamed: 0,customer_holding_P00,customer_holding_P10,customer_holding_P11,customer_holding_P12,customer_holding_P13,customer_holding_P15,customer_holding_P16,customer_holding_P17,customer_holding_P18,customer_holding_P19,customer_holding_P2,customer_holding_P20,customer_holding_P21,customer_holding_P8,customer_holding_P9
0,0,0,0,1,1,0,1,1,0,0,0,0,1,0,0
1,0,0,0,1,1,0,1,1,0,0,0,0,1,0,0


In [26]:
# Append both indicator blocks to the main frame column-wise (aligned on the row index).
indicator_frames = [df_product_holding_b1_dumies, df_customer_popular_products_dumies]
train = pd.concat([train, *indicator_frames], axis="columns")

In [27]:
# Dimensions after adding the indicator columns.
train.shape

(58075, 48)

In [28]:
# Preview the widened frame.
train.head(2)

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,Customer_ID_bucket,customer_bucket_popular_products,product_holding_P00,product_holding_P1,product_holding_P10,product_holding_P11,product_holding_P12,product_holding_P13,product_holding_P14,product_holding_P15,product_holding_P16,product_holding_P17,product_holding_P18,product_holding_P19,product_holding_P2,product_holding_P20,product_holding_P21,product_holding_P3,product_holding_P4,product_holding_P5,product_holding_P6,product_holding_P7,product_holding_P8,product_holding_P9,customer_holding_P00,customer_holding_P10,customer_holding_P11,customer_holding_P12,customer_holding_P13,customer_holding_P15,customer_holding_P16,customer_holding_P17,customer_holding_P18,customer_holding_P19,customer_holding_P2,customer_holding_P20,customer_holding_P21,customer_holding_P8,customer_holding_P9
0,264719,0,41,14,0,C1,S3,[P16],[P8],265,"[P13, P16, P12, P21, P17]",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,1,0,0
1,264929,0,35,15,0,C2,S1,[P16],[P8],265,"[P13, P16, P12, P21, P17]",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,1,0,0


In [29]:
# store test data customer ids in a dataframe

In [30]:
# Empty holder for the test customers' IDs (and later their predictions).
temp_df = pd.DataFrame()

In [31]:
# Test rows are the ones with no B2 target; remember their numeric IDs in the
# current row order so predictions can be matched back to customers.
is_unlabelled = train["Product_Holding_B2"].isnull()
temp_df["Customer_ID_int"] = train.loc[is_unlabelled, "Customer_ID"].values

In [32]:
# store target column in a temp variable

In [33]:
# Keep the target aside: it is dropped before dummy-encoding and restored afterwards.
temp_target = train["Product_Holding_B2"]

In [34]:
# Categorical columns to one-hot encode.
cols_to_dummy = [
    "City_Category",
    "Customer_Category",
]

In [35]:
# Columns removed before modelling: the identifier, the list-valued columns
# (already expanded into indicator columns) and the target.
cols_to_drop = [
    "Customer_ID",
    "Product_Holding_B1",
    "Product_Holding_B2",
    "customer_bucket_popular_products",
]

In [36]:
# Remove the non-feature columns listed in cols_to_drop.
train = train.drop(columns=cols_to_drop)

In [37]:
# One-hot encode the categorical columns.
# The column list must be passed as `columns=`: the second positional parameter
# of get_dummies is `prefix`, so passing the list positionally only worked
# because exactly these two object columns happened to remain in the frame.
train = pd.get_dummies(train, columns=cols_to_dummy)

In [38]:
# Put the target back (aligned on the row index) so train and test can be split on it.
train["Product_Holding_B2"] = temp_target

In [39]:
# Split the combined frame back apart: rows without a target are the test set.
has_no_target = train["Product_Holding_B2"].isnull()
test = train.loc[has_no_target].reset_index(drop=True)
train = train.loc[~has_no_target].reset_index(drop=True)

In [40]:
# Row/column counts of the two splits (both still include the target column).
train.shape, test.shape

((37748, 48), (20327, 48))

In [41]:
# Separate the target (lists of B2 products) from the feature matrix.
train_Y = train["Product_Holding_B2"]
train_X = train.drop(columns="Product_Holding_B2")

In [42]:
# The test split carries only an all-null target column; remove it.
test = test.drop(columns=["Product_Holding_B2"])

In [43]:
# Feature matrix and target should have the same number of rows.
train_X.shape, train_Y.shape

((37748, 47), (37748,))

In [44]:
# The test features should have the same number of columns as train_X.
test.shape

(20327, 47)

In [45]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of B2 product labels from the training targets (lists of product codes).
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(train_Y)

MultiLabelBinarizer()

In [46]:
# Binary indicator matrix: one row per training customer, one column per B2 product label.
Y = multilabel_binarizer.transform(train_Y)

In [47]:
# Label order of the columns of Y (and therefore of every model's output columns).
multilabel_binarizer.classes_

array(['P00', 'P1', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16',
       'P17', 'P18', 'P2', 'P20', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8',
       'P9'], dtype=object)

In [48]:
# Map output-column index -> product label, used to decode predictions.
mydict = dict(enumerate(multilabel_binarizer.classes_))

In [49]:
# nn model

In [50]:
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
 
 
# get the model
def get_model(n_inputs, n_outputs):
    """Build and compile the feed-forward multi-label network.

    The network is 48 -> 96 -> 48 tanh units (he_uniform initialisation)
    followed by one sigmoid unit per label, compiled with binary
    cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    n_inputs : int
        Number of input features.
    n_outputs : int
        Number of labels (width of the output layer).

    Returns
    -------
    The compiled Sequential model.
    """
    hidden_kwargs = dict(kernel_initializer='he_uniform', activation='tanh')

    model = Sequential()
    # Only the first layer needs to know the input width.
    model.add(Dense(48, input_dim=n_inputs, **hidden_kwargs))
    for units in (96, 48):
        model.add(Dense(units, **hidden_kwargs))
    # Independent sigmoid per label, so several labels can be active at once.
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

Using TensorFlow backend.


In [51]:
# Network dimensions: one input per feature column, one output per label column.
n_inputs = train_X.shape[1]
n_outputs = Y.shape[1]

In [52]:
# Build the network and show its layer summary.
model = get_model(n_inputs, n_outputs)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 48)                2304      
_________________________________________________________________
dense_2 (Dense)              (None, 96)                4704      
_________________________________________________________________
dense_3 (Dense)              (None, 48)                4656      
_________________________________________________________________
dense_4 (Dense)              (None, 20)                980       
Total params: 12,644
Trainable params: 12,644
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
# Train on the full training set for 150 epochs. No validation data is passed,
# so only the training loss is reported.
model.fit(train_X, Y, verbose=2, epochs=150)


Epoch 1/150
 - 2s - loss: 0.1849
Epoch 2/150
 - 1s - loss: 0.1372
Epoch 3/150
 - 1s - loss: 0.1130
Epoch 4/150
 - 1s - loss: 0.1015
Epoch 5/150
 - 1s - loss: 0.0972
Epoch 6/150
 - 1s - loss: 0.0934
Epoch 7/150
 - 2s - loss: 0.0914
Epoch 8/150
 - 2s - loss: 0.0900
Epoch 9/150
 - 2s - loss: 0.0889
Epoch 10/150
 - 1s - loss: 0.0872
Epoch 11/150
 - 1s - loss: 0.0862
Epoch 12/150
 - 2s - loss: 0.0857
Epoch 13/150
 - 2s - loss: 0.0849
Epoch 14/150
 - 2s - loss: 0.0848
Epoch 15/150
 - 2s - loss: 0.0844
Epoch 16/150
 - 1s - loss: 0.0841
Epoch 17/150
 - 1s - loss: 0.0835
Epoch 18/150
 - 1s - loss: 0.0829
Epoch 19/150
 - 1s - loss: 0.0822
Epoch 20/150
 - 1s - loss: 0.0821
Epoch 21/150
 - 1s - loss: 0.0820
Epoch 22/150
 - 2s - loss: 0.0816
Epoch 23/150
 - 1s - loss: 0.0813
Epoch 24/150
 - 1s - loss: 0.0809
Epoch 25/150
 - 1s - loss: 0.0808
Epoch 26/150
 - 1s - loss: 0.0808
Epoch 27/150
 - 1s - loss: 0.0808
Epoch 28/150
 - 1s - loss: 0.0805
Epoch 29/150
 - 1s - loss: 0.0807
Epoch 30/150
 - 1s - l

<keras.callbacks.callbacks.History at 0x7fda75ac6b70>

In [54]:
# Neural-network scores for the test rows: one sigmoid output per label.
res_nn = model.predict(test)

In [55]:
# classifier chain with LightGBM as the per-label base classifier

In [56]:
import lightgbm as lgbm

In [57]:
from skmultilearn.problem_transform import ClassifierChain

In [58]:
# Classifier chain wrapping a LightGBM classifier with default hyper-parameters.
classifier = ClassifierChain(lgbm.LGBMClassifier())

In [59]:
# Fit the chain on the same features and label matrix as the neural network.
classifier.fit(train_X, Y)

ClassifierChain(classifier=LGBMClassifier(), require_dense=[True, True])

In [60]:
# Per-label probabilities for the test rows (converted with .toarray() below).
res_lfm = classifier.predict_proba(test)

In [61]:
# Convert to a dense array so it can be blended with the other models' outputs.
res_lfm = res_lfm.toarray()

In [62]:
# Expect (number of test rows, number of labels).
res_lfm.shape

(20327, 20)

In [63]:
# random forest regressor fitted on the multi-label indicator matrix

In [64]:
from sklearn.ensemble import RandomForestRegressor

In [65]:
# Random forest used as a multi-output regressor on the 0/1 label matrix; its
# predictions are treated as per-label scores in the blend.
# random_state is fixed so the forest (and the final blend) is reproducible
# across runs; the seed value itself is arbitrary.
# NOTE: this rebinds `model`, which previously held the Keras network.
model = RandomForestRegressor(random_state=42)

In [66]:
# Fit the forest on all labels at once (Y has one column per label).
model.fit(train_X, Y)

RandomForestRegressor()

In [67]:
# Random-forest scores for the test rows, one column per label.
res_rf = model.predict(test)

In [68]:
# Expect (number of test rows, number of labels).
res_rf.shape

(20327, 20)

In [69]:
# Weighted average of the three models' per-label scores.
w_lgbm, w_nn, w_rf = 0.4, 0.3, 0.3
res = (w_lgbm * res_lfm) + (w_nn * res_nn) + (w_rf * res_rf)

In [70]:
# For each test row keep the indices of the three highest blended scores,
# best first (argsort is ascending, so take the last three and reverse).
res_list = [list(scores.argsort()[-3:][::-1]) for scores in res]

In [71]:
# Index -> product label lookup for decoding the predicted column indices.
mydict = dict(enumerate(multilabel_binarizer.classes_))

In [72]:
# Empty frame that will hold one list of predicted labels per test row.
predictions = pd.DataFrame()

In [73]:
# Store the top-3 label indices (still numeric at this point).
predictions["labels"] = res_list

In [74]:
# Translate each list of column indices into the corresponding product codes.
predictions["labels"] = predictions["labels"].apply(
    lambda label_indices: [mydict[label_idx] for label_idx in label_indices]
)

In [75]:
# Preview the decoded top-3 product predictions.
predictions.head(2)

Unnamed: 0,labels
0,"[P8, P9, P10]"
1,"[P6, P7, P4]"


In [76]:
# Load the sample submission to reuse its Customer_ID column as the output template.
submission = pd.read_csv("../data/sample_submission_kF044ur.csv")

In [77]:
# Remove the sample file's Product_Holding_B2 column; it is re-added by the merge with our predictions.
submission = submission.drop(columns=["Product_Holding_B2"])

In [78]:
# Pair predictions with the stored test customer IDs. Both use a 0..n-1 index in
# the same row order as the test split, so index alignment matches them up.
temp_df["Product_Holding_B2"] = predictions["labels"]

In [79]:
# Rebuild the original string IDs by re-adding the "CC" prefix to the numeric part.
temp_df["Customer_ID"] = "CC" + temp_df["Customer_ID_int"].astype(str)

In [80]:
# Attach predictions to the submission template.
# A left join keeps every row of the sample submission, in its original order;
# the default inner join would silently drop any customer without a prediction
# and produce a short submission file.
submission = submission.merge(temp_df, on="Customer_ID", how="left")

In [81]:
# Inspect the merged submission (still includes the helper Customer_ID_int column).
submission

Unnamed: 0,Customer_ID,Customer_ID_int,Product_Holding_B2
0,CC372708,372708,"[P10, P4, P5]"
1,CC216072,216072,"[P8, P1, P6]"
2,CC387629,387629,"[P16, P13, P1]"
3,CC389228,389228,"[P4, P10, P5]"
4,CC394445,394445,"[P00, P1, P4]"
...,...,...,...
20322,CC303542,303542,"[P8, P6, P7]"
20323,CC266713,266713,"[P8, P6, P7]"
20324,CC393639,393639,"[P00, P1, P4]"
20325,CC285013,285013,"[P1, P7, P6]"


In [82]:
# Keep only the two columns that are written to the submission file.
submission_columns = ["Customer_ID", "Product_Holding_B2"]
submission = submission[submission_columns]

In [83]:
# Write the submission file without the row index.
submission.to_csv("../submission/17_b.csv", index=False)