## Import necessary libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns

In [2]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [3]:
# Locations of the training and test CSV files inside the Kaggle input directory.
train_dataset = '/kaggle/input/amexpert-2021-machine-learning-hackathon/train_go05W65.csv'
test_dataset = '/kaggle/input/amexpert-2021-machine-learning-hackathon/test_VkM91FT.csv'

In [4]:
# Load the training data and preview its first rows.
df = pd.read_csv(train_dataset)
df.head()

### Exploratory Data Analysis

In [5]:
sns.displot(df.Age)

In [6]:
sns.displot(df.Vintage)

In [7]:
from sklearn.preprocessing import RobustScaler

In [8]:
# Fit one RobustScaler per numeric column on the training data.
# The same fitted scalers are applied to the test set later in the notebook,
# so test values are scaled with the training statistics.
transformer_age = RobustScaler().fit(df[['Age']])
transformer_Vintage = RobustScaler().fit(df[['Vintage']])

In [9]:
# Replace the raw Age / Vintage columns with their robust-scaled values.
# transform() returns an (n_rows, 1) array. Flattening it and assigning the
# array positionally avoids wrapping it in a DataFrame, whose fresh RangeIndex
# would be aligned against df's index (any mismatch silently becomes NaN).
# NOTE: this cell overwrites its own inputs, so run it only once per data load.
df['Age'] = transformer_age.transform(df[['Age']]).ravel()
df['Vintage'] = transformer_Vintage.transform(df[['Vintage']]).ravel()
df.head()

In [10]:
# Collect the scaled numeric features in their own frame.
# Age and Vintage in df were already robust-scaled by the preceding cell, so
# they are copied as they are. Passing both columns through transformer_age
# would be wrong: that scaler was fitted on the single Age column, so
# scikit-learn either rejects the two-column input or applies Age's
# centre/scale to Vintage as well.
df_num_train = df[['Age', 'Vintage']].copy()
df_num_train.head()

### Feature Engineering

In [11]:
# Collect every distinct product code that appears in Product_Holding_B1.
# Each cell holds a string such as "['P1', 'P13']": strip the brackets, split
# on commas, then strip quotes and spaces from every piece.
products_set = list(df.Product_Holding_B1)
product_set = {
    piece.strip("' ")
    for holdings in products_set
    for piece in holdings.strip('[]').split(',')
}
print(len(product_set))
print(product_set)

In [12]:
# Target: collect every distinct product code that appears in Product_Holding_B2.
products_set = list(df.Product_Holding_B2)
target_product_set = {
    piece.strip("' ")
    for holdings in products_set
    for piece in holdings.strip('[]').split(',')
}
print(len(target_product_set))
print(target_product_set)

### Extract all products from Product_Holding_B1 column and create individual products as features

In [13]:
def fe_product(feature, match):
    """Return 1 when `match` is one of the products listed in `feature`, else 0.

    `feature` is a string such as "['P1', 'P13']". The surrounding brackets are
    removed, the remainder is split on commas, and quotes and spaces are
    stripped from each piece before it is compared with `match`.
    """
    pieces = feature.strip('[]').split(',')
    found = any(match == piece.strip("' ") for piece in pieces)
    return 1 if found else 0

In [14]:
# Add one binary indicator column per product code seen in Product_Holding_B1.
# Mapping over the single source column yields the same 0/1 values as a
# row-wise DataFrame.apply(axis=1) without building a row object per record.
for product in product_set:
    feature = 'Product_' + product
    df[feature] = df['Product_Holding_B1'].map(lambda holdings: fe_product(holdings, product))
df.head()

### Remove features 'Product_Holding_B1', 'Product_Holding_B2', 'Customer_ID'

In [15]:
# Keep the target column in its own frame before it is dropped from df.
target_df = df[['Product_Holding_B2']]

In [16]:
# Remove the raw product-list columns and the customer identifier in place.
columns_to_remove = ['Product_Holding_B1', 'Product_Holding_B2', 'Customer_ID']
df.drop(columns=columns_to_remove, inplace=True)
df.columns

### One-Hot encoding using *pd.get_dummies*

In [17]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of every encoded column.
df = pd.get_dummies(df, drop_first = True)
df.head()

In [18]:
df.columns

### Dimensionality Reduction

In [19]:
df.shape

In [20]:
from sklearn.decomposition import PCA

In [21]:
# Fit a 29-component PCA on the encoded training features so the variance
# explained by each component can be inspected.
# NOTE(review): 29 is presumably the number of columns in df — confirm against df.shape.
pca = PCA(n_components=29)
pca.fit(df.values)

In [22]:
print(pca.explained_variance_ratio_)

In [23]:
# Bar chart of the share of variance explained by each principal component.
# Bar positions are derived from the fitted PCA instead of a hard-coded
# range(1, 30), so the plot keeps working if n_components changes.
explained = pca.explained_variance_ratio_
plt.bar(list(range(1, len(explained) + 1)), explained, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()

In [24]:
# Find how many leading principal components are needed to explain more than
# 99% of the variance. The running total has its own name rather than `sum`,
# which would shadow the built-in for the rest of the kernel session.
cumulative_variance = 0
for no_of_pc, variance_ratio in enumerate(pca.explained_variance_ratio_, 1):
    cumulative_variance += variance_ratio
    print(cumulative_variance, no_of_pc)
    if cumulative_variance > 0.99:
        break

In [25]:
# Refit PCA with 18 components and project the training features onto them.
# pca18 is reused later to project the test features.
# NOTE(review): 18 is presumably where the cumulative-variance loop stopped — confirm against its output.
pca18 = PCA(n_components=18)
pca18.fit(df.values)
X = pca18.transform(df.values)

In [26]:
#X = df.values
X.shape

### Feature Engineering for *target* Column

In [27]:
target_df.head()

In [28]:
# Multi-hot target matrix: one row per training record and 22 columns, filled
# in by the next cell using the numeric part of each product code as the column.
# NOTE(review): 22 is presumably the number of distinct product codes — confirm.
y = np.zeros((target_df.shape[0], 22), dtype = 'int')
y.shape

In [29]:
# Set y[row, k] = 1 for every product code 'P<k>' listed in the row's
# Product_Holding_B2 string.
products_set = list(target_df.Product_Holding_B2)
for row, holdings in enumerate(products_set):
    for piece in holdings.strip('[]').split(','):
        code = piece.strip("' ")
        y[row, int(code[1:])] = 1

In [30]:
X.shape

In [31]:
y.shape

### Train Validation split

In [32]:
# Hold out 33% of the rows, using a fixed seed for reproducibility.
# NOTE: later cells fit the model on the full X and y and rebind X_test to the
# projected test set, so this split is not used for training or evaluation as
# the notebook stands.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
# Show the shape of each array produced by the split, one per line.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

### Neural Network Model

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import optimizers, regularizers
import tensorflow as tf

In [35]:
# Fully connected multi-label classifier: four Dense -> BatchNormalization ->
# Dropout stages followed by one sigmoid output per target column.
# The input and output widths are read from X and y rather than hard-coded, so
# the network stays in sync with the PCA output and the target matrix.
HIDDEN_UNITS = (512, 256, 128, 64)
DROPOUT_RATE = 0.5

model = Sequential()
for depth, units in enumerate(HIDDEN_UNITS):
    if depth == 0:
        # Only the first layer declares the input shape.
        model.add(layers.Dense(units, activation='relu', input_shape=(X.shape[1],)))
    else:
        model.add(layers.Dense(units, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(DROPOUT_RATE))

# Sigmoid rather than softmax: y is multi-hot, so each output is scored on its own.
model.add(layers.Dense(y.shape[1], activation='sigmoid'))
model.summary()

In [36]:
# Disabled alternative architecture, kept inside a string literal so that it
# does not run. It expects 29 input features, uses he_normal initialisation
# without dropout, and ends in a softmax layer.
'''
model = Sequential()
model.add(layers.Dense(512, kernel_initializer='he_normal', activation='relu', input_shape=(29,)))
model.add(layers.BatchNormalization())
#model.add(layers.Dropout(0.5))
model.add(layers.Dense(256, kernel_initializer='he_normal', activation='relu'))
model.add(layers.BatchNormalization())
#model.add(layers.Dropout(0.5))
model.add(layers.Dense(128, kernel_initializer='he_normal', activation='relu'))
model.add(layers.BatchNormalization())
#model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, kernel_initializer='he_normal', activation='relu'))
model.add(layers.BatchNormalization())
#model.add(layers.Dropout(0.5))


model.add(layers.Dense(32, kernel_initializer='he_normal', activation='relu'))
model.add(layers.BatchNormalization())
#model.add(layers.Dropout(0.5))


model.add(layers.Dense(22, activation='softmax'))
model.summary()
'''

In [37]:
#model.compile(optimizer=optimizers.Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics = ['binary_accuracy'])
#model.compile(optimizer=optimizers.SGD(learning_rate=0.01, momentum=0.9), loss = 'binary_crossentropy', metrics = ['binary_accuracy'])


In [38]:
# Compile with Adam (default settings) and binary cross-entropy, then train on
# the complete X and y for 25 epochs. No validation data is passed, so the
# returned history only contains training metrics.
model.compile(optimizer=optimizers.Adam(), loss = 'binary_crossentropy', metrics = ['binary_accuracy'])
#history_final = model.fit(X, y, validation_split=0.2, batch_size = 32, epochs = 100, verbose = 1)
history_final = model.fit(X, y, batch_size = 32, epochs = 25, verbose = 1)

### Evaluating using Test Data

In [39]:
# Load the test data and preview its first 20 rows.
df_test = pd.read_csv(test_dataset)
df_test.head(20)

In [40]:
df_test.shape

In [41]:
# Scale the test Age / Vintage columns with the scalers fitted on the training
# data. The flattened array is assigned positionally instead of through a
# DataFrame, whose fresh RangeIndex would be aligned against df_test's index
# (any mismatch silently becomes NaN).
# NOTE: this cell overwrites its own inputs, so run it only once per data load.
df_test['Age'] = transformer_age.transform(df_test[['Age']]).ravel()
df_test['Vintage'] = transformer_Vintage.transform(df_test[['Vintage']]).ravel()
df_test.head()

In [42]:
# Add the same per-product indicator columns to the test set, iterating over
# the product codes collected from the training data.
# Mapping over the single source column yields the same 0/1 values as a
# row-wise DataFrame.apply(axis=1) without building a row object per record.
for product in product_set:
    feature = 'Product_' + product
    df_test[feature] = df_test['Product_Holding_B1'].map(lambda holdings: fe_product(holdings, product))
df_test.head()

In [43]:
# Start the submission frame from the customer id and current holdings.
# An explicit copy is taken because later cells add and drop columns on
# df_submission; mutating a bare column selection of df_test triggers pandas'
# SettingWithCopyWarning.
df_submission = df_test[['Customer_ID', 'Product_Holding_B1']].copy()
df_submission.head()

### Existing Products

In [44]:
# Multi-hot matrix of the products each test customer already holds: one row
# per test record and 22 columns, filled in by the next cell.
# NOTE(review): 22 is presumably the number of distinct product codes — confirm.
existing_products = np.zeros((df_submission.shape[0], 22), dtype = 'int')
existing_products.shape

In [45]:
# Set existing_products[row, k] = 1 for every product code 'P<k>' listed in the
# row's Product_Holding_B1 string.
products_set = list(df_submission.Product_Holding_B1)
for row, holdings in enumerate(products_set):
    for piece in holdings.strip('[]').split(','):
        code = piece.strip("' ")
        existing_products[row, int(code[1:])] = 1
existing_products.shape

In [46]:
existing_products[0:5]

In [47]:
df_test.head()

In [48]:
# Remove the raw product-list column and the customer identifier in place.
test_columns_to_remove = ['Product_Holding_B1', 'Customer_ID']
df_test.drop(columns=test_columns_to_remove, inplace=True)
df_test.columns

In [49]:
# One-hot encode the test set and align it to the encoded training frame `df`.
# Encoding the test set on its own with drop_first=True depends on which
# category levels happen to be present in the test data, so the resulting
# columns (and their order) can differ from the ones PCA was fitted on.
# Instead, every level is encoded and the frame is reindexed to the training
# columns: levels that were dropped or never seen in training are discarded,
# and training columns missing from the test set are filled with 0.
df_test = pd.get_dummies(df_test).reindex(columns=df.columns, fill_value=0)
df_test.shape

In [50]:
df_test.head(20)

### Prediction

In [51]:
# Project the encoded test features with the PCA fitted on the training data,
# then predict one score per model output for every test row.
# NOTE: this rebinds X_test, which previously held the hold-out part of the
# train/validation split.
X_test = pca18.transform(df_test.values)

y_test_pred = model.predict(X_test)
y_test_pred.shape

In [52]:
# Store each row of predicted scores as one array-valued cell in 'Target'.
df_submission['Target'] = list(y_test_pred)
df_submission.head()

In [53]:
# The raw holdings string is no longer needed in the submission frame.
df_submission.drop(columns=['Product_Holding_B1'], inplace=True)
df_submission.head()

In [54]:
# Store each customer's multi-hot holdings row as one array-valued cell in 'Existing'.
df_submission['Existing'] = list(existing_products)
df_submission.head()

In [55]:
df_submission.head(30)

In [56]:
df_submission.Existing[0].sum()

In [57]:
match_count = 0
def generate_pred(cust_id, existing, target):
    """Pick the products to recommend for one customer.

    Parameters
    ----------
    cust_id
        Customer identifier. It plays no part in the selection and is accepted
        only so that existing callers keep working.
    existing
        1-D array of 0/1 flags, indexed by product number, marking the
        products the customer already holds.
    target
        1-D array of predicted scores, indexed by product number.

    Returns
    -------
    list of str
        Recommended product codes ordered by product number. Product 0 is
        written as 'P00'; every other product as 'P' followed by its number.

    Notes
    -----
    The number of recommendations equals the number of held products when that
    is 1 or 2, and is 3 otherwise. Products already held are never recommended.
    Fewer codes are returned when not enough unheld products exist.
    """
    existing = np.asarray(existing)
    scores = np.asarray(target, dtype=float)

    held_count = existing.sum()
    limit = int(held_count) if held_count in (1, 2) else 3

    # Rank product indices by descending score. Ranking indices directly
    # (instead of looking each score up with list.index) keeps products with
    # identical scores distinct; the stable sort breaks ties by product number.
    ranked = np.argsort(-scores, kind='stable')
    chosen = [int(idx) for idx in ranked if existing[idx] != 1][:limit]
    chosen.sort()

    # NOTE(review): only product 0 is zero-padded ('P00'), presumably matching
    # the codes used in the data — confirm against the product sets printed earlier.
    return ['P00' if idx == 0 else 'P' + str(idx) for idx in chosen]

In [58]:
# Build the recommended product list for every customer, one row at a time.
df_submission['Product_Holding_B2'] = df_submission.apply(
    lambda row: generate_pred(row.Customer_ID, row.Existing, row.Target),
    axis=1,
)

In [59]:
# NOTE: generate_pred never updates match_count, so this prints its initial value of 0.
print(match_count)

In [60]:
# The raw score arrays are not part of the submission.
df_submission.drop(columns=['Target'], inplace=True)
df_submission.head()

In [61]:
# The existing-holdings arrays are not part of the submission either.
df_submission.drop(columns=['Existing'], inplace=True)

In [62]:
df_submission.head(30)

In [63]:
# Sanity check: which distinct product codes were recommended at all?
products_set = list(df_submission.Product_Holding_B2)
predicted_product_set = {
    code
    for recommended in products_set
    for code in recommended
}
print(len(predicted_product_set))
print(predicted_product_set)

In [64]:
# Write the submission file without the index column.
df_submission.to_csv('submission.csv', index = False)