In [None]:
%%shell
# Convert the notebook at the path below to an HTML file with nbconvert.
jupyter nbconvert --to html /content//GFK_AnkitAgr_ML.ipynb


```
# Notebook for *Multi-label Classification* using Multi-layer perceptron (MLP)
```

Choice of embeddings:  
    Bert (Pre-trained)  
    word2vec (custom trained)  
    (set them in global variables)


For multi-label classification:  
The model is trained to predict:  
item_id, mdm_model_text and mdm_brand_text


In [None]:
import pandas as pd
import numpy as np
import torch
import random
import re
import os
import time
import unicodedata
from gensim.models import Word2Vec
from tqdm import tqdm
import seaborn as sns
from pandas._libs.hashtable import value_count
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from pandas._libs.hashtable import value_count

mlbe = preprocessing.MultiLabelBinarizer()
import matplotlib.pyplot as plt
%matplotlib inline
tqdm.pandas()

# Seed every random number generator used in this notebook (torch CPU/GPU,
# NumPy and Python's `random`) with the same value for reproducibility.
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)  # NOTE(review): repeats the call a few lines above; redundant
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Global variables/flags read by the cells below.
BALANCED_SAMPLING = False  # if True, cap the number of rows kept per frequent item_id
BALANCED_SAMPLING_ATMOST = 25  # row-count threshold and per-item_id sample size for balanced sampling
WORD2VEC_EMBEDDING_DIMENSION = 100 # vector size of the custom word2vec model
CREATE_ONEHOT_COUNTRY = False  # if True, country_name is replaced by one-hot `country_*` columns
# Choose the embedding type below. NOTE(review): if both are True, the
# word2vec feature cell runs later and overwrites the BERT x_train/x_test.
APPLY_WORD2VEC_EMBEDDINGS = False
APPLY_BERT_EMBEDDINGS = True


# Load the CSV file into a dataframe using utf-8 encoding.
orig_df = pd.read_csv('train_lenses_ds_task.csv', encoding='utf-8')

# NOTE: if read_csv gives an encoding error, open the csv file in sublime or
# another text editor, save it with utf-8 encoding and try again.

df = orig_df.drop_duplicates(keep='first') # drop duplicates, keeping the first occurrence
df = df.dropna() # drop rows containing NaN
df = df.apply(lambda x: x.astype(str).str.upper()) # cast every column to str and upper-case it

# Balanced sampling: item_ids with at least BALANCED_SAMPLING_ATMOST rows are
# down-sampled to exactly BALANCED_SAMPLING_ATMOST rows; rarer item_ids are
# kept unchanged.
if BALANCED_SAMPLING:
  # per-row boolean masks built from the number of rows sharing the row's item_id
  df_high = df[df.groupby('item_id').transform('count').ge(BALANCED_SAMPLING_ATMOST)['main_text']]
  df_low = df[df.groupby('item_id').transform('count').lt(BALANCED_SAMPLING_ATMOST)['main_text']]
  df_high = df_high.groupby('item_id', group_keys=False).apply(lambda x: x.sample(BALANCED_SAMPLING_ATMOST))
  df = pd.concat([df_low, df_high], axis=0)


# Multilabel: the label space is the concatenation of the unique values of
# the three target columns.
classes = []
classes.extend(pd.unique(df['item_id']))
classes.extend(pd.unique(df['mdm_brand_text']))
classes.extend(pd.unique(df['mdm_model_text']))
print(len(classes))
# Width of the network's output layer.
# NOTE(review): this equals the number of columns produced by `mlbe` below
# only if no value is shared between the three columns - confirm.
output_layer = len(classes)

# List of unique values for each category (used later to tell which kind of
# label a predicted class is).
all_item_ids = pd.unique(df['item_id']).tolist()
all_mdm_brand_text = pd.unique(df['mdm_brand_text']).tolist()
all_mdm_model_text = pd.unique(df['mdm_model_text']).tolist()
print('all_item_ids', all_item_ids)
print('length all_item_ids', len(all_item_ids))

# Will be used later to map from the predicted mdm brand/model pair to item_id.
df['mdm_combine'] = df['mdm_brand_text'] + ' ' + df['mdm_model_text']

# Transform to multi-label encoding: one [item_id, brand, model] list per row.
multilabelclasses = []
for ind, row in df.iterrows():
  local_list = []
  local_list.extend([row['item_id'], row['mdm_brand_text'], row['mdm_model_text']])
  multilabelclasses.append(local_list)

# Fit the binarizer and store each row's multi-hot vector as a Python list.
y_label = mlbe.fit_transform(multilabelclasses)
print(len(y_label))
df['new_item_id'] = y_label.tolist()

# Drop the columns below, as they are already encoded as classes in new_item_id.
# (`axis=1` is redundant when `columns=` is given.)
df = df.drop(columns=['mdm_brand_text', 'mdm_model_text'], axis=1)

print(df.head())
print('total unique entries:', len(df)) # 2868

# Create a one-hot encoding for the categorical variable country_name.
# NOTE(review): this removes `country_name`, which preprocess_main reads
# later - confirm the flag is usable as-is.
if CREATE_ONEHOT_COUNTRY:
  df_categorical = pd.get_dummies(df.country_name, prefix='country')
  df = pd.concat([df, df_categorical], axis=1)
  df = df.drop('country_name', axis=1)

# df.head(30)



536
all_item_ids ['138176095', '101261697', '140544215', '88210952', '112904161', '86804350', '139354017', '121721616', '24493444', '18378494', '122856012', '94920964', '87099837', '82981040', '102285076', '123927094', '136736656', '92361114', '18378561', '18378572', '94403654', '26726405', '80033948', '127901561', '25278162', '19026396', '79720920', '124606451', '104432648', '115519055', '123883561', '132843807', '103340122', '49498188', '81044073', '121626287', '117860560', '143471294', '141351006', '72145121', '119793326', '127487870', '105092835', '111044504', '76957414', '85892464', '104052744', '58328086', '104052322', '92562474', '105090127', '104386541', '79560221', '138543939', '115239599', '108586370', '80508911', '79452476', '54274515', '112306880', '112904599', '125771101', '125751655', '107335209', '85327619', '124632457', '108102591', '87121775', '98774856', '123805814', '119539790', '95336029', '73320960', '96266916', '79429232', '98426893', '124633843', '33336411', '143

Total unique entries in the data: 2868

Total different item_ids: 260  
Total different mdm_brand_text: 23  
Total different mdm_model_text: 253  
Final classes length:  536  

In [None]:
# code for visualization of class distribution 
# df2 = df.groupby('item_id').count().reset_index()[['item_id', 'main_text']]
# df2 = df2.sort_values('main_text', ascending=False)
# df2 = df2[df2['main_text']<500]

# df2.plot(kind='bar', x='item_id', figsize=(20,10))
# plt.xticks(rotation='vertical')
# plt.show()

In [None]:
# Make the train/test split with a fixed random_state:
# train split = 80%
# test split = 20%
# NOTE(review): the names `train` and `test` are rebound to functions in the
# model cell further down, so this cell and the feature cells must run before
# that cell (and be re-run after it when iterating).

train, test = train_test_split(df, test_size=0.2, random_state=12)

print(len(train)) #2294
print(len(test)) #574

2294
574


In [None]:
# Preprocessing/cleaning of data.
# Module-level accumulators filled by `preprocess` when is_train is True.

word2vec_processed_array = []  # one token list per training sentence (word2vec training input)
vocabulary = []  # flat list of every training token (duplicates included)

def strip_accents(text):
  """
  Remove accents from a string.

  The text is decomposed with Unicode NFD normalization and every character
  that cannot be represented in ASCII (combining marks included) is dropped.

  :param text: The input string.
  :type text: String.

  :returns: The ASCII-only string.
  :rtype: String.
  """
  decomposed = unicodedata.normalize('NFD', text)
  # 'ignore' silently discards everything outside the ASCII range
  return decomposed.encode('ascii', 'ignore').decode('ascii')


def preprocess(text, is_train):
  """
  Clean one raw text string.

  Instead of tokenizing by removing all punctuation directly, numerical data
  is handled first: a ',', '.' or '-' that sits between two digits is
  replaced by '_' so the relationship between the numbers is kept.
  Example:
    02.25 = 02_25
    02,25 = 02_25
    02-25 = 02_25
    2.2   = 2_2
    1.2.3 = 1_2_3

  Note: there must be a digit on both sides of the separator.

  Every remaining run of special characters/punctuation (anything except
  word characters, which include '_') is then replaced by a single space,
  the result is stripped, and accents are removed with `strip_accents`.

  When is_train is True, the tokens of the cleaned text are also appended to
  the module-level `word2vec_processed_array` and `vocabulary`, so that the
  custom word2vec model is trained only on training data.

  :param text: The input string.
  :type text: String.

  :param is_train: record the tokens for word2vec training if True
  :type is_train: Boolean.

  :returns: The processed String.
  :rtype: String.
  """
  # Lookarounds test the neighbouring digits without consuming them, so every
  # separator in a chain such as "1.2.3" is converted. '|' is deliberately not
  # in the character class: it is not a numeric separator.
  processed_text = re.sub(r'(?<=[0-9])[,.\-](?=[0-9])', '_', text)
  processed_text = re.sub(r'\W+', ' ', processed_text).strip()

  processed_text = strip_accents(processed_text)

  if is_train: # the embedding model is trained only with training data
    processed_words = processed_text.split()
    word2vec_processed_array.append(processed_words)
    vocabulary.extend(processed_words)

  return processed_text


In [None]:
def preprocess_main(data, is_train=False):
  """
  Main function to preprocess both train and test data.

  The text columns 'main_text', 'retailer_pg' and - when it is still present -
  'country_name' are joined with single spaces, cleaned by `preprocess`, and
  stored in a new 'feature_text' column. The source text columns are dropped
  from the returned frame.

  'country_name' is optional because it is replaced by one-hot columns when
  CREATE_ONEHOT_COUNTRY is enabled. The input frame is not modified; a new
  frame is returned.

  :param data: Dataframe of the train or test split (text columns must hold strings)
  :type data: Dataframe.
  :param is_train: forwarded to `preprocess`; records tokens for word2vec if True
  :type is_train: Boolean.

  :returns: New dataframe with 'feature_text' appended and the text columns removed
  :rtype: Dataframe.
  :raises KeyError: if 'main_text' or 'retailer_pg' is missing
  """
  text_columns = ['main_text', 'retailer_pg']
  if 'country_name' in data.columns:
    text_columns.append('country_name')

  # Rows are processed in frame order, so the word2vec sentences recorded by
  # `preprocess` keep the same order as the data.
  feature_text = [
      preprocess(' '.join(parts), is_train)
      for parts in data[text_columns].itertuples(index=False, name=None)
  ]

  return data.drop(columns=text_columns).assign(feature_text=feature_text)


In [None]:
# Pass the train and test splits (dataframes) to the preprocess method.
# Only the training call records tokens for the word2vec model.
train = preprocess_main(train, is_train=True)
test = preprocess_main(test)

# Separate the training labels: pop the multi-hot list column and stack it
# into a 2-D array (one row per sample).
y_train = train.pop('new_item_id')
y_train = np.array(y_train.values.tolist())
print(y_train.shape)

# Separate the test labels in the same way.
y_test = test.pop('new_item_id')
y_test = np.array(y_test.values.tolist())
print(y_test.shape)

(2294, 536)
(574, 536)


# Training Custom Word2Vec




In [None]:
# Train a custom word2vec model with Gensim.
# Sentences are the token lists collected by `preprocess` from the cleaned
# feature text ('main_text' + 'retailer_pg' + 'country_name').
# Sentences only involve training data.
# NOTE(review): `size=` and `wv.vocab` look like the gensim < 4 API
# (gensim 4 renamed them to `vector_size` / `wv.key_to_index`) - confirm the
# installed version.

if APPLY_WORD2VEC_EMBEDDINGS:
  start_time = time.time()

  word2vec_model = Word2Vec(sentences=word2vec_processed_array,
                  sg=1, # 1 for skip_gram, 0 for CBOW
                  min_count=1,
                  size=WORD2VEC_EMBEDDING_DIMENSION,  
                  workers=4)

  print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

  keys = set(word2vec_model.wv.vocab.keys())
  print('Total vocab of trained word2vec model:', len(keys))
  print('Dimension of trained word2vec vectors:',word2vec_model.wv.vector_size)
  # print(model.wv.get_vector('CRISTALES')) 

  # Save the word vectors in word2vec text format.
  # NOTE(review): the file name says "100d" regardless of
  # WORD2VEC_EMBEDDING_DIMENSION.
  word2vec_model.wv.save_word2vec_format('custom_word2vec_model_100d.txt')
  # `apply_word2vec` reads the trained model through this global name.
  word_vectors = word2vec_model

In [None]:
def apply_word2vec(data):
  """
  Turn each sentence into one fixed-size vector with the trained word2vec model.

  Every sentence is split on whitespace, each token is looked up in the
  global `word_vectors` model, and the vectors that were found are averaged.
  Tokens missing from the model are skipped; a sentence with no known token
  keeps an all-zero vector and is counted in the printed summary.

  :param data: iterable of sentence strings (must support len())
  :returns: array of shape (len(data), WORD2VEC_EMBEDDING_DIMENSION)
  """
  sentence_vectors = np.zeros((len(data), WORD2VEC_EMBEDDING_DIMENSION))
  missing = 0

  for row_idx, sentence in enumerate(data):
    found = []
    for token in sentence.split():
      try:
        # get_vector raises KeyError for out-of-vocabulary tokens
        found.append(word_vectors.wv.get_vector(token))
      except KeyError:
        continue

    if found:
      sentence_vectors[row_idx] = np.array(found).mean(axis=0)
    else:
      missing += 1

  print("Numer of samples with no words found: %s / %s" % (missing, len(data)))
  return sentence_vectors


# Bert Embedding:

In [None]:
# Install and load the sentence-transformers model only when BERT embeddings
# are selected. `bert_model` is read as a global by `get_bert_embeddings`.
# NOTE(review): the install is unpinned (`-U`), so the library version can
# differ between runs.
if APPLY_BERT_EMBEDDINGS:
  !pip install -U sentence-transformers
  from sentence_transformers import SentenceTransformer

  bert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

def get_bert_embeddings(data):
  '''
  Encode a series of sentences with the global `bert_model`.

  :param data: Series of sentences
  :type data: Dataframe Series

  :returns: whatever `bert_model.encode` returns for the list of sentences;
            the calling cell reads `.shape` on it and prints
            (num_of_sentences, 768)
  '''
  sentences = data.to_list()
  return bert_model.encode(sentences)





 
For Bert embeddings:  
We directly get the embedding of the complete sentence.


In [None]:
# Generate the train/test feature matrices with BERT sentence embeddings.

if APPLY_BERT_EMBEDDINGS:
  # Train:
  # Get the features (embedding vectors) for the cleaned training text.
  x_train = get_bert_embeddings(train['feature_text'])
  print(x_train.shape)

  # Test:
  # Get the features (embedding vectors) for the cleaned test text.
  x_test = get_bert_embeddings(test['feature_text'])
  print(x_test.shape)

(2294, 768)
(574, 768)


For word2vec embeddings:   
We can get the embeddings of complete sentence in feature_text by taking mean of all embedding vectors of its words.

In [None]:
# Generate the train/test feature matrices with the custom word2vec model.
# NOTE(review): this cell assigns the same x_train/x_test names as the BERT
# cell, so it overwrites them if both embedding flags are enabled.

if APPLY_WORD2VEC_EMBEDDINGS:
  # Train:
  # Get the features (mean embedding vectors) for train data.
  x_train = apply_word2vec(train['feature_text'])
  print(x_train.shape)

  if CREATE_ONEHOT_COUNTRY:
    # Append the one-hot country columns as two extra features.
    # NOTE(review): the column names are hard-coded and assume the data
    # contains exactly the countries GERMANY and SPAIN - confirm.
    country_train = train[['country_GERMANY', 'country_SPAIN']].to_numpy()
    print(country_train.shape)

    x_train = np.concatenate((x_train, country_train), axis=1)
    print(x_train.shape)

  # Test:
  # Get the features (mean embedding vectors) for test data.
  x_test = apply_word2vec(test['feature_text'])
  print(x_test.shape)

  if CREATE_ONEHOT_COUNTRY:
    # Append the same one-hot country columns to the test features.
    country_test = test[['country_GERMANY', 'country_SPAIN']].to_numpy()
    print(country_test.shape)

    x_test = np.concatenate((x_test, country_test), axis=1)
    print(x_test.shape)

## Deep Learning Multi-Label Classifier

We train a neural network MLP model to classify multi-label i.e. to predict ('item_id', 'mdm_brand_text', 'mdm_model_text')

Currently, for some data points the prediction features ('main_text', 'retailer_pg', 'country_name')
are the same but the classes (item_id) are different; the class then depends on 'mdm_brand_text' and 'mdm_model_text'.
As these two columns are not available at prediction time, it is hard for the model to return the correct item_id in this ambiguous scenario. 

So we train the model as a multi-label classifier that predicts all of ('item_id', 'mdm_brand_text', 'mdm_model_text'), so that by learning to predict all three it learns to distinguish the ambiguous cases above.

Implementation:  
For test-time accuracy, we only compare the item_id, as the other fields are not available at prediction time.  

We compute three accuracy metrics here:
1. For the item_id directly predicted by the model
2. For the item_id generated using the mapping from the predicted values of mdm_brand_text and mdm_model_text
3. Multi-label accuracy (this is not a valid metric for the task, as it uses the true mdm_brand and mdm_model labels, which are not available at prediction time)

Parameters of the network:  

if using Word2vec embeddings:  
1. Input Dimension: 102
2. Hidden Layer 1: 512
3. Hidden Layer 2: 1024
3. Output Layer 536 

if using Bert embeddings:  
1. Input Dimension: 768
2. Hidden Layer 1: 1024
3. Hidden Layer 2: 768
4. Output Layer 536 


Label:  
(combining all unique item_id, mdm_brand, mdm_model) and creating 1-hot encoding.

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import random
# Re-seed all random number generators so that model initialisation and
# batch shuffling start from a known state when this cell is run.
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)  # NOTE(review): repeats the call a few lines above; redundant
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using: {device} device")

class AnkitDataset(Dataset):
    """Pairs a sequence of feature vectors with a parallel sequence of targets."""

    def __init__(self, feature_list, target_list):
        """Store the two index-aligned sequences as given (no copy, no validation)."""
        self.feature_list = feature_list
        self.target_list = target_list

    def __getitem__(self, index: int):
        """Return the (feature, target) pair stored at position `index`."""
        return self.feature_list[index], self.target_list[index]

    def __len__(self):
        """Number of samples, taken from the feature sequence."""
        return len(self.feature_list)


class AnkitNet(nn.Module):
  """
  Three-layer MLP (Linear -> ReLU -> Linear -> ReLU -> Linear, no biases)
  that outputs one raw logit per label.

  The hidden sizes depend on the embedding flags:
    word2vec: input -> 512 -> 1024 -> output
    BERT:     768   -> 1024 -> 768 -> output

  If both flags are set, the word2vec layout is used, because the word2vec
  feature cell runs last and overwrites the BERT features.

  :param input_dim: number of input features. Defaults to 768 for BERT, or to
      WORD2VEC_EMBEDDING_DIMENSION (+2 for the two one-hot country columns
      when CREATE_ONEHOT_COUNTRY is set) for word2vec.
  :param output_dim: number of labels. Defaults to the global `output_layer`.
  :raises ValueError: if neither embedding flag is enabled.
  """

  def __init__(self, input_dim=None, output_dim=None):
    super().__init__()

    if APPLY_WORD2VEC_EMBEDDINGS:
      default_input_dim = WORD2VEC_EMBEDDING_DIMENSION
      if CREATE_ONEHOT_COUNTRY:
        # country_GERMANY and country_SPAIN are concatenated to the embedding
        default_input_dim += 2
      hidden1, hidden2 = 512, 1024
    elif APPLY_BERT_EMBEDDINGS:
      default_input_dim, hidden1, hidden2 = 768, 1024, 768
    else:
      raise ValueError(
          'Enable APPLY_BERT_EMBEDDINGS or APPLY_WORD2VEC_EMBEDDINGS '
          'before building AnkitNet')

    if input_dim is None:
      input_dim = default_input_dim
    if output_dim is None:
      output_dim = output_layer

    self.fc1 = nn.Linear(input_dim, hidden1, bias=False)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden1, hidden2, bias=False)
    self.fc3 = nn.Linear(hidden2, output_dim, bias=False)

  def forward(self, x):
    """Return the raw logits (no sigmoid applied) for a batch of features."""
    out = self.fc1(x)
    out = self.relu(out)
    out = self.fc2(out)
    out = self.relu(out)
    out = self.fc3(out)
    return out


def _top_scoring_label(candidates, row_scores, class_index):
  """Return the candidate label with the highest score in this row, or None if there is none."""
  if not candidates:
    return None
  # the label text is only a tie-breaker so the choice never depends on set order
  return max(candidates,
             key=lambda label: (row_scores[class_index[label]].item(), str(label)))


def Accuracy(y_true, y_pred):
  """
  This function calculates 3 types of accuracy:
  1. Overall accuracy for the item_id predicted by the multi-label model
  2. Overall accuracy for the item_id converted from the mdm_model and
     mdm_brand predicted by the multi-label model
  3. Multi-label overall accuracy: mean per-row intersection-over-union of
     true and predicted labels (this metric is not valid for the task, as it
     uses the true mdm values at prediction time)

  for 1:
    we directly take the item_id predicted by the model and compare to y_label

  for 2:
    we take the mdm_model and mdm_brand values predicted by the model, map
    the pair to an item_id through df['mdm_combine'] (first matching row) and
    compare to y_label. If several brands or models are among the predicted
    classes, the one with the highest score is used. If no brand/model pair
    is predicted, or the pair is unknown, the directly predicted item_ids are
    used instead.

  NOTE: we dont use mdm_model and mdm_brand in prediction time

  For every row the predicted classes are those whose sigmoid score is at
  least the row's third-highest score (so normally the top 3 classes).

  :param y_true: multi-hot ground-truth tensor, one row per sample
  :param y_pred: raw model logits with the same shape as y_true
  :returns: dict with keys 'automatic_item_id_accuracy',
      'calculated_item_id_accuracy' (floats) and 'multilabel_accuracy'
      (0-dim tensor)
  """
  y_true = y_true.detach().cpu()
  scores = torch.sigmoid(y_pred.detach()).cpu()
  num_rows = y_true.shape[0]

  # keep every class that reaches the third-highest score of its row
  top_values, _ = torch.topk(scores, k=3, dim=1)
  third_best = top_values.min(dim=1, keepdim=True).values
  pred_mask = (scores >= third_best).to(scores.dtype)

  # transforming back from multilabel encoding to the original classes
  true_classes = mlbe.inverse_transform(y_true)
  pred_classes = mlbe.inverse_transform(pred_mask)

  # loop-invariant lookups, built once per call
  class_index = {label: idx for idx, label in enumerate(mlbe.classes_)}
  item_id_set = set(all_item_ids)
  brand_set = set(all_mdm_brand_text)
  model_set = set(all_mdm_model_text)
  combine_to_item_id = {}
  for combined, item_id in zip(df['mdm_combine'], df['item_id']):
    # setdefault keeps the first row for each brand/model pair
    combine_to_item_id.setdefault(combined, str(item_id))

  automatic_match_count = 0
  calculated_match_count = 0

  for i in range(num_rows):
    # from the true label, only the item_id is available at prediction time
    true_item_id = item_id_set.intersection(true_classes[i])

    predicted = set(pred_classes[i])
    # item ids predicted by the model directly
    pred_item_ids = predicted & item_id_set

    # best-scoring mdm_brand and mdm_model among the predicted classes
    pred_mdm_brand_text = _top_scoring_label(predicted & brand_set, scores[i], class_index)
    pred_mdm_model_text = _top_scoring_label(predicted & model_set, scores[i], class_index)

    # map the brand/model pair to an item_id (converted item_id)
    converted_pred_item_id = pred_item_ids
    if pred_mdm_brand_text and pred_mdm_model_text:
      search_text = pred_mdm_brand_text + ' ' + pred_mdm_model_text
      mapped_item_id = combine_to_item_id.get(search_text)
      if mapped_item_id is not None:
        converted_pred_item_id = {mapped_item_id}

    if true_item_id & pred_item_ids:
      automatic_match_count += 1

    if true_item_id & converted_pred_item_id:
      calculated_match_count += 1

  # per-row |true AND pred| / |true OR pred|, averaged over rows
  true_bool = y_true.bool()
  pred_bool = pred_mask.bool()
  intersection = (true_bool & pred_bool).sum(dim=1).float()
  union = (true_bool | pred_bool).sum(dim=1).float()
  multilabel_accuracy = (intersection / union).mean()

  return {'automatic_item_id_accuracy': automatic_match_count / num_rows,
          'calculated_item_id_accuracy': calculated_match_count / num_rows,
          'multilabel_accuracy': multilabel_accuracy}
  

def train(model, train_loader, criterion, optimizer, epochs, verbose=False):
  """
  Train `model` in place for a fixed number of epochs.

  NOTE: defining this function rebinds the notebook-level name `train`,
  which earlier cells use for the training dataframe.

  :param model: network to optimise; moved to the global `device` once
  :param train_loader: iterable yielding (features, labels) batches
  :param criterion: loss called as criterion(output, labels)
  :param optimizer: optimizer already bound to the model's parameters
  :param epochs: number of passes over `train_loader`
  :param verbose: if True, print the mean batch loss after every epoch
  :returns: the same model object, in training mode
  """
  # move once, not on every batch
  model.to(device)
  model.train()

  for epoch in range(epochs):
    losses = []
    for data, label in train_loader:
      data = data.float().to(device)
      label = label.float().to(device)

      output = model(data)
      loss = criterion(output, label)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      if verbose:
        # keep the tensor: calling .item() here would force a device sync per batch
        losses.append(loss.detach())

    # an empty loader leaves `losses` empty and is simply skipped
    if verbose and losses:
      print(f'epoch {epoch + 1}/{epochs} mean loss: {torch.stack(losses).mean().item():.6f}')
  return model

def test(model, test_loader):
  model.eval()
  gt = []
  pred = []

  for idx, (data, label) in enumerate(test_loader):
    data = data.float().to(device)
    label = label.float().to(device)

    inputv = Variable(data)
    labelsv = Variable(label)
    
    output = model(inputv)
    gt.append(labelsv)
    pred.append(output)
  
  gt = torch.cat(gt, dim=0)
  pred = torch.cat(pred, dim=0)
  return Accuracy(gt, pred)


def seed_worker(worker_id):
    """Seed NumPy and Python's `random` from torch's initial seed, reduced to 32 bits.

    `worker_id` is accepted (as a DataLoader `worker_init_fn` would pass it)
    but is not used.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    for seeder in (np.random.seed, random.seed):
        seeder(derived_seed)


def ankit_trainer(train_input:list=None, train_target:list=None, 
                  test_input:list=None, test_target:list=None, 
                  epoch:int=200, bs:int=16, lr:float=1e-3):
  """
  Train a fresh AnkitNet on the training data and score it on the test data.

  :param train_input: training feature vectors (converted with np.array)
  :param train_target: training multi-hot labels (converted with np.array)
  :param test_input: test feature vectors (converted with np.array)
  :param test_target: test multi-hot labels (converted with np.array)
  :param epoch: number of training epochs
  :param bs: training batch size (the test loader always uses batches of 16)
  :param lr: learning rate for Adam
  :returns: the accuracy dict produced by `test`
  """
  training_set = AnkitDataset(np.array(train_input), np.array(train_target))
  evaluation_set = AnkitDataset(np.array(test_input), np.array(test_target))

  # only the training batches are shuffled; loading stays in the main process
  training_batches = DataLoader(training_set, batch_size=bs, shuffle=True, num_workers=0)
  evaluation_batches = DataLoader(evaluation_set, batch_size=16, shuffle=False, num_workers=0)

  net = AnkitNet()
  adam = optim.Adam(net.parameters(), lr=lr)
  # the network emits raw logits, so the sigmoid is part of the loss
  bce_with_logits = nn.BCEWithLogitsLoss()

  trained_net = train(net, training_batches, bce_with_logits, adam, epoch)
  return test(trained_net, evaluation_batches)

if __name__ == "__main__":
  # Hyperparameter search: train and evaluate one fresh model for every
  # (epochs, batch size) combination and print its accuracy dict.
  # NOTE(review): every configuration is scored on the test split, so picking
  # the best one from this output uses the test data for model selection.
  epochs = [50, 100, 150, 200, 300, 400]
  batch_size = [8, 12, 32, 64, 128]
  # epochs = [2,2,2]
  for epoch in epochs:
    for bs in batch_size:
      # skip the combinations of many epochs with very small batches
      if epoch in (300,400) and bs in (8,12):
        continue
      print('epoch value =:', epoch)
      print('Batch Size =:', bs)
      # the learning rate is left at ankit_trainer's default
      results = ankit_trainer(x_train, y_train, x_test, y_test, epoch, bs)
      print('Accuracy:', results)

  ## debug: a single short run
  # results = ankit_trainer(x_train, y_train, x_test, y_test, 5, 8)
  # print('Accuracy:', results)


  

Using: cuda device
epoch value =: 50
Batch Size =: 8
Accuracy: {'automatic_item_id_accuracy': 0.7979094076655052, 'calculated_item_id_accuracy': 0.8031358885017421, 'multilabel_accuracy': tensor(0.8216)}
epoch value =: 50
Batch Size =: 12
Accuracy: {'automatic_item_id_accuracy': 0.8013937282229965, 'calculated_item_id_accuracy': 0.8083623693379791, 'multilabel_accuracy': tensor(0.8221)}
epoch value =: 50
Batch Size =: 32
Accuracy: {'automatic_item_id_accuracy': 0.8310104529616724, 'calculated_item_id_accuracy': 0.8257839721254355, 'multilabel_accuracy': tensor(0.8486)}
epoch value =: 50
Batch Size =: 64
Accuracy: {'automatic_item_id_accuracy': 0.8083623693379791, 'calculated_item_id_accuracy': 0.8118466898954704, 'multilabel_accuracy': tensor(0.8348)}
epoch value =: 50
Batch Size =: 128
Accuracy: {'automatic_item_id_accuracy': 0.7700348432055749, 'calculated_item_id_accuracy': 0.7839721254355401, 'multilabel_accuracy': tensor(0.7951)}
epoch value =: 100
Batch Size =: 8
Accuracy: {'auto