# Category Embeddings

In [82]:
import pandas as pd
from sklearn import model_selection, preprocessing, metrics
from keras.layers import Reshape, Input, add, Flatten, Dropout, Dense, Embedding, concatenate, BatchNormalization, AveragePooling1D
from keras.regularizers import l2
from keras.models import Model,model_from_json
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
import os
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from operator import itemgetter
from collections import defaultdict
import seaborn as sns

def read_model(filename):
    # this is a helper function used to restore a keras NN model architecture and weights
    model = model_from_json(open('elo/model_architecture.json').read())
    model.load_weights('elo/'+filename+'_model_weights.h5')
    return model
def save_model(model,filename):
    # this is a helper function used to save a keras NN model architecture and weights
    json_string = model.to_json()

    open(filename+'_architecture.json', 'w').write(json_string)
    model.save_weights(filename+'_model_weights.h5', overwrite=True)
    
train_path = 'elo/all/train.csv'
test_path = 'elo/all/test.csv'
hist_tran_path = 'elo/all/historical_transactions.csv'
mernch_path = 'elo/all/merchants.csv'
new_marnch_path = 'elo/all/new_merchant_transactions.csv'

## Sign-up and pre processing

We sign up to the [ *elo*](https://www.kaggle.com/c/elo-merchant-category-recommendation) competition under the team name *$'$BGU-DL Y&A$'$*. And downloaded the data.<br>
Than we had to do some basic pre processing before we can use a classic ML algorithem.

In [2]:
path = '../Assignment2/elo/all/'

In [7]:
train_df = pd.read_csv(path + 'train.csv', parse_dates=["first_active_month"])
test_df = pd.read_csv(path + 'test.csv' , parse_dates=["first_active_month"])
new_trans_df = pd.read_csv(path + 'new_merchant_transactions.csv' )

In [16]:
hist_df = pd.read_csv(path + "/historical_transactions.csv")

MemoryError: 

In [None]:
new_trans_df = pd.read_csv(path + "/new_merchant_transactions.csv")

We want to have a new *csv* file with the following information:

1. The all features from *train.csv* file.
2. Purchase amount from history (*historical_transactions.csv* file).
3. Sum, mean, std, min, and max of purchase amount.

In [None]:
gdf = hist_df.groupby("card_id")
gdf = gdf["purchase_amount"].size().reset_index()
gdf.columns = ["card_id", "num_hist_transactions"]
train_df = pd.merge(train_df, gdf, on="card_id", how="left")
test_df = pd.merge(test_df, gdf, on="card_id", how="left")

In [None]:
gdf = hist_df.groupby("card_id")
gdf = gdf["purchase_amount"].agg(['sum', 'mean', 'std', 'min', 'max']).reset_index()
gdf.columns = ["card_id", "sum_hist_trans", "mean_hist_trans", "std_hist_trans", "min_hist_trans", "max_hist_trans"]
train_df = pd.merge(train_df, gdf, on="card_id", how="left")
test_df = pd.merge(test_df, gdf, on="card_id", how="left")

We want to have a new *csv* file with the following information:
1. The all features from train.csv
2. Purchase amount from new merchant (*new_merchant_transactions.csv* file).
3. Sum, mean, std, min and max of purchase amount.

In [None]:
gdf = new_trans_df.groupby("card_id")
gdf = gdf["purchase_amount"].size().reset_index()
gdf.columns = ["card_id", "num_merch_transactions"]
train_df = pd.merge(train_df, gdf, on="card_id", how="left")
test_df = pd.merge(test_df, gdf, on="card_id", how="left")

In [None]:
gdf = new_trans_df.groupby("card_id")
gdf = gdf["purchase_amount"].agg(['sum', 'mean', 'std', 'min', 'max']).reset_index()
gdf.columns = ["card_id", "sum_merch_trans", "mean_merch_trans", "std_merch_trans", "min_merch_trans", "max_merch_trans"]
train_df = pd.merge(train_df, gdf, on="card_id", how="left")
test_df = pd.merge(test_df, gdf, on="card_id", how="left")

We will convert dates that represented as string into two new fields that are numerial and are easy to learn from. 

In [None]:
train_df["year"] = train_df["first_active_month"].dt.year
test_df["year"] = test_df["first_active_month"].dt.year
train_df["month"] = train_df["first_active_month"].dt.month
test_df["month"] = test_df["first_active_month"].dt.month

train_df = train_df.drop(["first_active_month"], axis=1)
test_df = test_df.drop(["first_active_month"], axis=1)

Some of the cells are ```NaN``` because no purchase been made so we will convert this ```NaN```s into zeros.

In [None]:
train_df.fillna(value=0, inplace=True)
test_df.fillna(value=0, inplace=True)
train_df.to_csv("train_df.csv", index=False)
test_df.to_csv("test_df.csv", index=False)

For loading the pre process data.

In [None]:
train_df = pd.read_csv(path + 'train_df.csv')
test_df = pd.read_csv(path + 'test_df.csv')

## B Classical ML Algorithm

We will use ```Light GBM``` model. It is a gradient boosting framework that uses tree based learning algorithm. ```Light GBM``` is prefixed as "Light" because of its high speed. In addition this algorithm can handle a large size of data and takes lower memory to run. Another reason of why ```Light GBM``` is popular is because it focuses on accuracy of results. LGBM also supports GPU learning and thus data scientists are widely using LGBM for data science application development.

In [0]:
def create_lgb_model(train_X, train_y, val_X, val_y):
    params = {"objective" : "regression", 
              "metric" : "rmse", 
              "num_leaves" : 30, 
              "min_child_weight" : 50, 
              "learning_rate" : 0.05, 
              "bagging_fraction" : 0.7,
              "feature_fraction" : 0.7,
              "bagging_frequency" : 5,
              "bagging_seed" : 2018,
              "verbosity" : -1}
    
    train = lgb.Dataset(train_X, label=train_y)
    val = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, train, 1000, valid_sets=[val], early_stopping_rounds=100, verbose_eval=100, evals_result=evals_result)
    
    return model, evals_result

In [0]:
cols_to_use = ["feature_1", "feature_2", "feature_3", "year", "month", 
               "num_hist_transactions", "sum_hist_trans", "mean_hist_trans", "std_hist_trans", 
               "min_hist_trans", "max_hist_trans",
               "num_merch_transactions", "sum_merch_trans", "mean_merch_trans", "std_merch_trans",
               "min_merch_trans", "max_merch_trans"]

train_X = train_df[cols_to_use]
test_X = test_df[cols_to_use]
train_y = train_df['target'].values

In [0]:
pred_test = 0
kf = model_selection.KFold(n_splits=5, random_state=2018, shuffle=True)
for dev_index, val_index in kf.split(train_df):
    dev_X, val_X = train_X.loc[dev_index,:], train_X.loc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_test_tmp, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)
    pred_test += pred_test_tmp
pred_test /= 5.

In [0]:
split_index = int(len(train_X)*0.8) # split train to 80% for train and 20% for val
x_train, y_train = train_X.iloc[:split_index, :], train_y[:split_index]
x_val, y_val = train_X.iloc[split_index:, :], train_y[split_index:]

lgb_model, evals_result = create_lgb_model(x_train, y_train, x_val, y_val)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's rmse: 3.87463
[200]	valid_0's rmse: 3.8741
Early stopping, best iteration is:
[170]	valid_0's rmse: 3.87348


In [0]:
pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
preds_df = pd.DataFrame({'card_id': test_df['card_id'].values.tolist(), 'target': pred_test_y.tolist()})
preds_df.to_csv(path + "/preds_df.csv", index=False)

After we submited this result to kaggle we got a score of 3.884.

## C Per Process For Categorical Features Embeddings

We will perform the relevant preprocessing steps to create an embedding for the dataset categorical variables.<br>
Due to machine process-power limit we will not be able to merge the all given csv's so we will merge *train.csv* with *new_merchant_tran.csv* only.
But there is one more problem, there are some categorical fields that are not unique for a given id, for example, ```city_id```.<br>
```city_id``` have $N \to 1$ connection with ```card_id```, so we will take the ```city_id``` with maximum of ```purchase_amount```.<br>
After that, we will take all the fields that have $1\to1$ connection with this ``card_id``-```city_id```.<br>
Once we achived this goal, we will be able to merge with *train.csv*. 

In [3]:
new_trans_df = pd.read_csv(path + "/new_merchant_transactions.csv")

In [4]:
grouped_new_trans = new_trans_df.loc[new_trans_df.groupby('card_id')['purchase_amount'].idxmax()]

In [21]:
grouped_new_trans.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
1793299,Y,C_ID_00007093c1,69,N,1,B,879,M_ID_00a6ca8a8a,2,-0.656749,2018-04-09 16:23:59,1.0,9,29
38628,Y,C_ID_0001238066,333,N,3,C,607,M_ID_a5a61c543e,1,-0.078318,2018-03-24 17:02:00,1.0,9,29
1443420,Y,C_ID_0001506ef0,137,N,0,A,360,,1,-0.715352,2018-03-16 22:21:58,3.0,19,34
29165,Y,C_ID_0001793786,69,N,0,A,456,M_ID_b4f77c9e84,1,3.129932,2017-11-27 00:00:00,1.0,9,21
827140,Y,C_ID_000183fdda,161,N,-1,,489,M_ID_3bf000781a,2,-0.10768,2018-04-30 14:59:53,3.0,3,16


Now we can merge with train.csv

In [8]:
train_and_new_trans = pd.merge(train_df, grouped_new_trans, on="card_id", how="outer")

In [9]:
test_and_new_trans = pd.merge(test_df, grouped_new_trans, on="card_id", how="outer")
test_and_new_trans = test_and_new_trans[:123623]

In [10]:
train_and_new_trans.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,2017-06-01,C_ID_92a2005557,5.0,2.0,1.0,-0.820283,Y,276.0,N,0.0,A,783.0,M_ID_0703c10cc0,1.0,-0.296112,2018-03-12 16:49:03,1.0,9.0,19.0
1,2017-01-01,C_ID_3d0044924f,4.0,1.0,0.0,0.392913,Y,69.0,N,1.0,B,705.0,M_ID_3b3dad217c,2.0,-0.701858,2018-03-30 06:48:26,1.0,9.0,33.0
2,2016-08-01,C_ID_d639edf6cd,2.0,2.0,0.0,0.688056,Y,143.0,N,0.0,A,528.0,M_ID_c84d28e906,2.0,-0.700326,2018-04-28 17:43:11,5.0,5.0,25.0
3,2017-09-01,C_ID_186d6a6901,4.0,3.0,0.0,0.142495,Y,17.0,N,-1.0,,416.0,M_ID_db86b8ff60,2.0,-0.56674,2018-04-14 12:45:54,4.0,22.0,8.0
4,2017-11-01,C_ID_cdbd2c0db2,1.0,3.0,0.0,-0.159749,Y,302.0,N,1.0,B,652.0,M_ID_065d619231,2.0,0.450886,2018-04-14 11:40:10,3.0,7.0,7.0


We will replace every ```NaN``` with zero, even if the column isn't numeric, the embedding will treat zeros with the same meaning.

In [11]:
train_and_new_trans = train_and_new_trans.fillna(value=0)
test_and_new_trans = test_and_new_trans.fillna(value=0)

We will enumerate only the categorical features and merchant_id which is have too much unqiue valeus.

In [12]:
embd_categories = set(train_and_new_trans.keys()) - {'card_id','target','purchase_date','purchase_amount','merchant_id'}
embd_categories

{'authorized_flag',
 'category_1',
 'category_2',
 'category_3',
 'city_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'first_active_month',
 'installments',
 'merchant_category_id',
 'month_lag',
 'state_id',
 'subsector_id'}

In [13]:
numerical_categories = set(train_and_new_trans.keys()) - {'purchase_date','purchase_amount','merchant_id'} 
enumerate_columns_list = {col:{p:i for (i,p) in enumerate(train_and_new_trans[col].unique())} for col in embd_categories}
enumerate_columns_list_for_test = {col:{p:i for (i,p) in enumerate(test_and_new_trans[col].unique())} for col in embd_categories}

In [14]:
processed_data = train_and_new_trans.loc[:,list(numerical_categories)].copy(deep=True)
for column in embd_categories:
    processed_data[column] = [enumerate_columns_list[column][x] for x in processed_data[column]]
    
processed_data_for_test = test_and_new_trans.loc[:,list(numerical_categories)].copy(deep=True)
for column in embd_categories:
    processed_data_for_test[column] = [enumerate_columns_list_for_test[column][x] for x in processed_data_for_test[column]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [15]:
processed_data.head()

Unnamed: 0,city_id,target,state_id,category_1,category_2,feature_2,category_3,feature_1,feature_3,merchant_category_id,month_lag,installments,authorized_flag,card_id,first_active_month,subsector_id
0,0,-0.820283,0,0,0,0,0,0,0,0,0,0,0,C_ID_92a2005557,0,0
1,1,0.392913,0,0,0,1,1,1,1,1,1,1,0,C_ID_3d0044924f,1,1
2,2,0.688056,1,0,1,0,0,2,1,2,1,0,0,C_ID_d639edf6cd,2,2
3,3,0.142495,2,0,2,2,2,1,1,3,1,2,0,C_ID_186d6a6901,3,3
4,4,-0.159749,3,0,3,2,1,3,1,4,1,1,0,C_ID_cdbd2c0db2,4,4


In [47]:
processed_data_for_test.head()

Unnamed: 0,feature_3,target,feature_2,category_3,feature_1,category_2,first_active_month,merchant_category_id,state_id,authorized_flag,category_1,city_id,month_lag,subsector_id,installments,card_id
0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,C_ID_0ab67a22ab
1,1,,0,0,1,1,1,1,1,0,0,1,0,1,0,C_ID_130fd0cbdd
2,0,,1,0,2,2,2,2,2,0,0,2,1,2,1,C_ID_b709037bc5
3,1,,1,0,1,3,3,3,3,0,1,3,0,1,2,C_ID_d27d835a9f
4,0,,1,0,2,1,4,4,1,0,0,4,1,3,3,C_ID_2b5e3df5c2


## D - Predicting the target

Our new embedding vector will be in length of the one-hot vector length $\times 0.3$ (flor).

In [17]:
embeddings = []
inputs = []
for column in embd_categories:
    _input = Input(shape=(1,), dtype='int64')
    inputs.append(_input)
    unique_len = len(np.unique(processed_data[column].values))
    output_dim = int(unique_len * 0.3)
    output_dim = 1 if output_dim == 0 else output_dim
    embeddings.append(Embedding(unique_len,output_dim,input_length=1, embeddings_regularizer=l2(1e-6), name='emb_'+column)(_input))

In [19]:
x = concatenate(embeddings)
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(100,activation='linear')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(10,activation='linear')(x)
x = BatchNormalization()(x)
x = Dense(1)(x)
model = Model(inputs, x)
model.compile(loss = 'mse',optimizer='adamax')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (I

In [17]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [28]:
train = processed_data.drop(['card_id', 'target'], axis=1)
target = processed_data['target']
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.20, random_state=3)

In [29]:
print('x_tarin.shape: {},\ty_train.shape{}\nx_val.shape: {},\ty_val.shape: {}'.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape))

x_tarin.shape: (249545, 14),	y_train.shape(249545,)
x_val.shape: (62387, 14),	y_val.shape: (62387,)


In [30]:
cp = ModelCheckpoint('best_model_weights.h5',save_best_only=True)
es = EarlyStopping(patience=5, monitor='val_loss')

model_info = model.fit([x_train[key] for key in x_train.keys()], y_train, 
                       validation_data=[[x_val[key] for key in x_val.keys()], y_val], 
                       epochs=30,batch_size = 128, callbacks=[cp, es])

Train on 249545 samples, validate on 62387 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


As we can see, the results are not that good as we expected to be. The reason might be that our task is more suitable for classic machine learning models.

## E Add The Rest Of The Features


We will add the only feature which is not a categorical from *new_merchant.csv* - ```purchase amount```.


In [45]:
embd_categories = set(train_and_new_trans.keys()) - {'card_id','target','purchase_date','purchase_amount','merchant_id'}
numerical_categories_2 = set(train_and_new_trans.keys()) - {'purchase_date','merchant_id'} 
enumerate_columns_list_2 = {col:{p:i for (i,p) in enumerate(train_and_new_trans[col].unique())} for col in embd_categories}

In [90]:
processed_data_2.head()

Unnamed: 0,city_id,target,state_id,category_1,category_2,feature_2,category_3,purchase_amount,feature_1,feature_3,merchant_category_id,month_lag,installments,authorized_flag,card_id,first_active_month,subsector_id
0,0,-0.820283,0,0,0,0,0,-0.296112,0,0,0,0,0,0,C_ID_92a2005557,0,0
1,1,0.392913,0,0,0,1,1,-0.701858,1,1,1,1,1,0,C_ID_3d0044924f,1,1
2,2,0.688056,1,0,1,0,0,-0.700326,2,1,2,1,0,0,C_ID_d639edf6cd,2,2
3,3,0.142495,2,0,2,2,2,-0.56674,1,1,3,1,2,0,C_ID_186d6a6901,3,3
4,4,-0.159749,3,0,3,2,1,0.450886,3,1,4,1,1,0,C_ID_cdbd2c0db2,4,4


In [91]:
processed_data_2 = train_and_new_trans.loc[:,list(numerical_categories_2)].copy(deep=True)
for column in embd_categories:
    processed_data_2[column] = [enumerate_columns_list[column][x] for x in processed_data_2[column]]
    
processed_test_data_2 = test_and_new_trans.loc[:,list(numerical_categories_2)].copy(deep=True)
for column in embd_categories:
    processed_test_data_2[column] = [enumerate_columns_list[column].get(x,0) for x in processed_test_data_2[column]]    
    

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


We will remove the non-categorical features to the last column for simplicity

In [47]:
processed_data_2.keys()

Index(['city_id', 'target', 'state_id', 'category_1', 'category_2',
       'feature_2', 'category_3', 'purchase_amount', 'feature_1', 'feature_3',
       'merchant_category_id', 'month_lag', 'installments', 'authorized_flag',
       'card_id', 'first_active_month', 'subsector_id'],
      dtype='object')

In [92]:
col_list = processed_data_2.columns.tolist()
col_list = col_list[:7] + col_list[8:] + [col_list[7]] #non-categorical in index=7
processed_data_2 = processed_data_2[col_list]
processed_test_data_2 = processed_test_data_2[col_list]
processed_data_2.head()

Unnamed: 0,city_id,target,state_id,category_1,category_2,feature_2,category_3,feature_1,feature_3,merchant_category_id,month_lag,installments,authorized_flag,card_id,first_active_month,subsector_id,purchase_amount
0,0,-0.820283,0,0,0,0,0,0,0,0,0,0,0,C_ID_92a2005557,0,0,-0.296112
1,1,0.392913,0,0,0,1,1,1,1,1,1,1,0,C_ID_3d0044924f,1,1,-0.701858
2,2,0.688056,1,0,1,0,0,2,1,2,1,0,0,C_ID_d639edf6cd,2,2,-0.700326
3,3,0.142495,2,0,2,2,2,1,1,3,1,2,0,C_ID_186d6a6901,3,3,-0.56674
4,4,-0.159749,3,0,3,2,1,3,1,4,1,1,0,C_ID_cdbd2c0db2,4,4,0.450886


In [93]:
print(processed_test_data_2.shape)
processed_test_data_2.head()

(123623, 17)


Unnamed: 0,city_id,target,state_id,category_1,category_2,feature_2,category_3,feature_1,feature_3,merchant_category_id,month_lag,installments,authorized_flag,card_id,first_active_month,subsector_id,purchase_amount
0,263,,23,0,0,2,3,4,0,123,1,5,0,C_ID_0ab67a22ab,10,18,-0.383266
1,32,,16,0,2,2,3,2,1,70,1,5,0,C_ID_130fd0cbdd,1,8,-0.506484
2,41,,4,0,1,1,3,0,0,77,0,8,0,C_ID_b709037bc5,7,20,0.904506
3,38,,15,2,4,1,3,2,1,23,1,9,0,C_ID_d27d835a9f,26,8,-0.44788
4,34,,16,0,2,1,3,0,0,61,0,6,0,C_ID_2b5e3df5c2,38,12,14.279604


As we can see, we have the same dataset but purchase_amount added as a feature

In [63]:
features = []
inputs_2 = []
for column in embd_categories:
    _input = Input(shape=(1,), dtype='float32')
    inputs_2.append(_input)
    unique_len = len(enumerate_columns_list[column])
    output_dim = min(int(0.3*unique_len),100)
    output_dim = 1 if output_dim == 0 else output_dim
    features.append(Embedding(unique_len,output_dim,input_length=1, embeddings_regularizer=l2(1e-6), name='emb_'+column)(_input))
p_amoount_inp = Input(shape=(1,), dtype='float32')
p_amoount_reshape = Reshape((1, 1))(p_amoount_inp)
features.append(p_amoount_reshape)
inputs_2.append(p_amoount_inp)

In [66]:
x = concatenate(features)
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(100,activation='linear')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(10,activation='linear')(x)
x = BatchNormalization()(x)
x = Dense(1)(x)
model_2 = Model(inputs_2, x)
model_2.compile(loss = 'mse',optimizer='adamax')
model_2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_106 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_107 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_108 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_109 (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
input_110 

In [67]:
train = processed_data_2.drop(['card_id', 'target'], axis=1)
target = processed_data_2['target']

x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.20, random_state=3)

In [70]:
cp = ModelCheckpoint('best2_model_weights.h5',save_best_only=True)
es = EarlyStopping(patience=5, monitor='val_loss')
model_2_info = model_2.fit([x_train[f] for f in train.keys()], y_train, 
                       validation_data=[[x_val[f] for f in train.keys()], y_val], 
                       epochs=30, batch_size=512, callbacks=[cp, es])

Train on 249545 samples, validate on 62387 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [0]:
save_model(model_2,'model_2')

## F Interesting Insights About The Embeddings


In our competition we do not know what are most of the features. The names of the features are not informative enough that we will be able to guess any intersting dependency between features.<br>
In this case (and many other) it is good idea to use embedding representation. The embedding is able to learn new information and include this information inside the new  representation.<br>
Another expirment that was nice to try is to represent some of the features by embedding representation that learns by other targets but ```loyalty score```.<br>
The connection between other features might be reflected in the embedding and achieves better results.<br>
For example, for each ```card_id``` we could take all the ```city_id``` which this card ever bought there and create a new embedding represent by training the embedding with ```purchase_amount``` as a target. Instead of using the ```city_id``` that the given ```card_id``` bought there mostly, we could use the new embedding vector as a feature.<br>
Unfortunately, our expirment with the embeddings layers did not produce good results compare to our baise line model. We belive that the nature of the competition didn't match nn-architecture and that's might be the main reason for the poor results. 

## G Feature Extractor

We will  use the embeddings we got in $3E$ as a “feature extractor” with XGBRegressor model.

In [74]:
fe_model = Model(model_2.inputs, model_2.layers[-1].output)
fe_model.load_weights('best2_model_weights.h5')
fe_model = Model(model_2.inputs, model_2.layers[-3].output)

fe_X_train = fe_model.predict([x_train[f] for f in train.keys()])


In [79]:
model = xgb.XGBRegressor()
model.fit(fe_X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

validation results

In [80]:
fe_X_val = fe_model.predict([x_val[f] for f in train.keys()])
val_predictions = model.predict(fe_X_val)

In [83]:
def rmse(y1, y2):
    return sqrt(mean_squared_error(y1, y2)) 

In [85]:
print("rmse {}".format(rmse(val_predictions,y_val)))
print("mse {}".format(mean_squared_error(val_predictions,y_val)))

rmse 3.009186824928832
mse 9.055205347325264


Very good improvment!

Now we will check the feature extraction model results on the test

In [95]:
fe_X_test = fe_model.predict([processed_test_data_2[f] for f in train.keys()])

In [96]:
test_predictions = model.predict(fe_X_test)

In [109]:
train_df.keys()

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'target'],
      dtype='object')

In [112]:
pd.DataFrame(np.stack((test_df['card_id'].values, test_predictions)).T, columns=['card_id', 'target']).to_csv('elo_preds.csv', index=False)

after submitting in kaggale our results are : 3.9.. 

## Summary

At the beginning of the mission we tried to accumulate as much knowledge as possible about the data. By looking at each file and trying to figure out what each feature is. In addition, we tried to search for information that people who were involved in the subject shared, for example, in the *Kaggel* discussion.<br>

In the first step we built a classic ML model that used a relatively small amount of data. And to our surprise the model did a pretty good job, $3.8$ *RMSE* over the competition test set while as for right now ($22$ of December at $23:42$) first place in the competition is with $3.656$ *RMSE*.<br>

We then began experimenting with embeddings for categorical features. And there we encountered a problem. How can we aggregate, correctly,  the categorical features? 
For example, make an average or sum between different records of```city_id``` will yield a meaningless result. So we decided to take one record which has the ```city_id``` where the customer (```card_id```) bought the most.<br>

After the end of the pre processing phase we have trained a model that include embedding layers. Unfortunately the results failed compared to the previous model.<br>

So we try to make use of categorical features combined with some other features. We saw the this model is better than the one using only categorical features. But not by much $9.2$ *MSE* loss over the validation set compared to $9.6$.<br>

So we realized that probably for this problem embedding is not a good enough method for solving the problem. And that the features that are not categorical are more significant.<br>

We wanted to try to use the last model as “feature extractor”, but we encountered a lack of time $:($ and bugs $:( \times \infty$. But since the embedding itself did not improve the classic model, we expect that the improvement, If any, will not be significant either.<br>

In conclusion, we think that embedding is a good way to perceive and represent certain features. And that embedding can capture within them meanings and connections between different variables. But unfortunately for this task we could not applied it.<br>

We also wanted to note that we have noticed that there is a small part of the target values which are very different (around $-33$). In our opinion, it will be difficult to train a DL model that can distinguish between such a case. In addition, we noticed that we are measured according to an *RMSE* metric, so if, for example, we predict the average of the target variable for these cases, the *RMSE* value will be very affected by it. Therefore, we recommended (for those who continue in the competition) to delve into these samples and create a model that can identify them and predict a target value which is closer to the $-33$ area, In order to advance in the competition.