# Importing Modules and Loading Data

In [None]:
import numpy as np
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, BertForMaskedLM

In [None]:
# Read the training split with quoting disabled and a backslash as the
# escape character.
train_dataset = pd.read_csv(
    'train.csv',
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Read the test split with the same parsing options as the training split:
# quoting disabled, backslash as the escape character.
test_dataset = pd.read_csv(
    'test.csv',
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
)

# Data Exploration

In [None]:
# Row counts of both splits.
TRAIN_DATA_SIZE = len(train_dataset)
TEST_DATA_SIZE = len(test_dataset)


def _two_decimals(pair):
    """Return the (train, test) pair with each member rounded to two decimals."""
    return tuple([float("{0:.2f}".format(value)) for value in pair])


# Number of distinct non-null values per text column (train split).
unique_train_bp = train_dataset["BULLET_POINTS"].nunique()
unique_train_ds = train_dataset["DESCRIPTION"].nunique()
unique_train_br = train_dataset["BRAND"].nunique()
unique_train_tt = train_dataset["TITLE"].nunique()

# Same measure on the test split.
unique_test_bp = test_dataset["BULLET_POINTS"].nunique()
unique_test_ds = test_dataset["DESCRIPTION"].nunique()
unique_test_br = test_dataset["BRAND"].nunique()
unique_test_tt = test_dataset["TITLE"].nunique()

# Distinct values divided by row count, as (train, test) pairs.
# NOTE(review): these are fractions in [0, 1] although the labels printed
# below say "percentage" - confirm which one is intended.
uniq_percent_bp = (unique_train_bp / TRAIN_DATA_SIZE, unique_test_bp / TEST_DATA_SIZE)
uniq_percent_ds = (unique_train_ds / TRAIN_DATA_SIZE, unique_test_ds / TEST_DATA_SIZE)
uniq_percent_br = (unique_train_br / TRAIN_DATA_SIZE, unique_test_br / TEST_DATA_SIZE)
uniq_percent_tt = (unique_train_tt / TRAIN_DATA_SIZE, unique_test_tt / TEST_DATA_SIZE)

print("To show uniqueness of data\n")

for _label, _pair in (
    ("Total Unique BULLET_POINTS               :", (unique_train_bp, unique_test_bp)),
    ("Total Unique DESCRIPTION                 :", (unique_train_ds, unique_test_ds)),
    ("Total Unique BRAND                       :", (unique_train_br, unique_test_br)),
    ("Total Unique TITLE                       :", (unique_train_tt, unique_test_tt)),
):
    print(_label, _pair)

for _label, _pair in (
    ("Total Unique BULLET_POINTS in percentage :", uniq_percent_bp),
    ("Total Unique DESCRIPTION in percentage   :", uniq_percent_ds),
    ("Total Unique BRAND in percentage         :", uniq_percent_br),
    ("Total Unique TITLE in percentage         :", uniq_percent_tt),
):
    print(_label, _two_decimals(_pair))

In [None]:
# Count missing values with isna().sum(). The previous
# isna().value_counts()[1] looked up an integer key on a boolean index, which
# can resolve positionally (returning the smaller bucket, not the NaN count)
# and raises when a column contains no NaN at all.
print("Total NaN values in train data's DESCRIPTION column :", int(train_dataset["DESCRIPTION"].isna().sum()))
print("Total NaN values in train data's BULLET_POINTS column :", int(train_dataset["BULLET_POINTS"].isna().sum()))
print("Total NaN values in train data's TITLE column :", int(train_dataset["TITLE"].isna().sum()))

# Rows in which both free-text fields are missing at the same time.
_both_missing = train_dataset["BULLET_POINTS"].isna() & train_dataset["DESCRIPTION"].isna()
print("\nTotal rows where both DESCRIPTION and BULLET_POINTS are missing (NaN):", int(_both_missing.sum()))

In [None]:
# Preview the first 25 rows of the raw training frame.
train_dataset.head(25)

In [None]:
# Per-column count of missing values in the training frame.
train_dataset.isna().sum()

# Data Cleaning

In [None]:
# Fill the gaps by explicit column assignment. The previous chained
# `train_dataset.COL.fillna(..., inplace=True)` calls operate on a column
# object that pandas may treat as a temporary copy (notably after the row
# filter below), in which case the parent frame is left unchanged.

# A missing DESCRIPTION falls back to the TITLE of the same row.
train_dataset["DESCRIPTION"] = train_dataset["DESCRIPTION"].fillna(train_dataset["TITLE"])

# Rows that still have no DESCRIPTION had no TITLE either; drop them.
# .copy() makes the result an independent frame so later assignments are safe.
train_dataset = train_dataset[train_dataset["DESCRIPTION"].notna()].copy()

# Remaining gaps in TITLE and BULLET_POINTS are filled from DESCRIPTION.
train_dataset["TITLE"] = train_dataset["TITLE"].fillna(train_dataset["DESCRIPTION"])
train_dataset["BULLET_POINTS"] = train_dataset["BULLET_POINTS"].fillna(train_dataset["DESCRIPTION"])

# Every NaN left in any column is replaced by the 'Unbranded' placeholder.
train_dataset = train_dataset.fillna('Unbranded')

# Renumber rows 0..n-1 (clean_col indexes rows by these labels). The previous
# labels are kept in a new 'index' column, as before.
train_dataset = train_dataset.reset_index()

In [None]:
# Keep only the four text columns for inspection; the bare `df` on the last
# line displays the frame as the cell output.
df = train_dataset[['TITLE','DESCRIPTION','BULLET_POINTS','BRAND']]
df

In [None]:
# NLTK tokenizer instance; in the cells shown it is only referenced from a
# commented-out line inside clean_text.
tok = WordPunctTokenizer()
# Regular-expression building blocks used by clean_text().
pattern1 = r'@[A-Za-z0-9]+'              # "@" followed by letters/digits
pattern2 = r'https?://[A-Za-z0-9./]+'    # http:// or https:// URLs
pattern3 = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'  # <tags> and &entities;
pattern4 = r'[\|\([{}\])\|]'             # pipes, parentheses, brackets, braces
pattern5 = r'[^\w\s]'                    # anything that is neither a word char nor whitespace

# Mentions and URLs are removed in a single pass.
pattern = r'|'.join((pattern1,pattern2))

# Compiled once when the cell runs instead of on every clean_text() call.
# NOTE(review): the ranges U+24C2..U+1F251 and U+10000..U+10FFFF are very wide
# and also cover CJK and other non-emoji text - confirm that is intended.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # variation selector-16
                            u"\u3030"
                            "]+", flags=re.UNICODE)


def clean_text(text):
    """Normalise one piece of product text and return it lower-cased.

    Steps, in order: HTML tags and entities become a space; @mentions and
    http(s) URLs are deleted; pipes and bracket characters are deleted; every
    remaining non-word, non-whitespace character becomes a space; characters
    matched by the emoji pattern are deleted; the result is lower-cased.

    :param text: the string to clean (``re.sub`` raises ``TypeError`` for
        non-string input such as NaN).
    :return: the cleaned, lower-cased string. Whitespace is not collapsed.
    """
    without_markup = re.sub(pattern3, " ", text)
    without_links = re.sub(pattern, '', without_markup)
    without_brackets = re.sub(pattern4, '', without_links)
    without_punct = re.sub(pattern5, ' ', without_brackets)
    without_emoji = _EMOJI_PATTERN.sub(r'', without_punct)
    # The former `.decode(...)` attempt was removed: Python 3 `str` has no
    # decode(), so it always raised and the bare `except` just kept the
    # undecoded string - the result is unchanged.
    return without_emoji.lower()

def clean_col(df,col, nums):
    """Apply clean_text to a range of rows of one column.

    :param df: frame holding the column; rows are looked up by the labels
        ``nums[0]`` .. ``nums[1] - 1``, so the index is expected to contain
        those integers (e.g. after ``reset_index``).
    :param col: name of the column to clean. Any column of ``df`` is accepted;
        the frame is no longer copied down to a fixed column list first.
    :param nums: two-item sequence ``[start, stop]`` (stop exclusive).
    :return: list with the cleaned string for each row, in row order.
    """
    start, stop = nums[0], nums[1]
    print("Cleaning and processing {}:\n".format(col))
    column = df[col]  # fetched once instead of on every iteration
    cleaned = []
    for i in range(start, stop):
        cleaned.append(clean_text(column[i]))
        # Progress message every 100000 rows.
        if (i + 1) % 100000 == 0:
            print("%d of %d have been processed" % (i+1,stop))
    return cleaned

# Feature Engineering

In [None]:
# Combined text features, joined with a single space:
#   F1 = BRAND + TITLE, F2 = TITLE + BULLET_POINTS.
# In the cells shown only F2 is used afterwards.
train_dataset['F1'] = train_dataset['BRAND'] + " " + train_dataset['TITLE']
train_dataset['F2'] = train_dataset['TITLE'] + " " + train_dataset['BULLET_POINTS']

In [None]:
# Clean every row of each text column. The row span is derived from the frame
# itself instead of the former hard-coded 2903010, which only matched one
# particular copy of the data and would fail (or skip rows) on any other size.
_all_rows = [0, len(train_dataset)]

title_cleaned = clean_col(train_dataset, "TITLE", nums=_all_rows)
desc_cleaned = clean_col(train_dataset, "DESCRIPTION", nums=_all_rows)
bullet_cleaned = clean_col(train_dataset, "BULLET_POINTS", nums=_all_rows)
brand_cleaned = clean_col(train_dataset, "BRAND", nums=_all_rows)
f2_cleaned = clean_col(train_dataset, "F2", nums=_all_rows)

In [None]:
# Assemble the cleaned text lists next to the original target column.
_cleaned_columns = {
    'TITLE': title_cleaned,
    'DESCRIPTION': desc_cleaned,
    'BULLET_POINTS': bullet_cleaned,
    'BRAND': brand_cleaned,
    'F2': f2_cleaned,
    'BROWSE_NODE_ID' : train_dataset['BROWSE_NODE_ID'],
}
cleaned_train_dataset = pd.DataFrame(_cleaned_columns)

In [None]:
# Persist the cleaned frame to 'cleaned_train_dataset.csv' in the working directory.
# NOTE(review): the value bound to cleaned_train_data is whatever to_csv returns
# when given a path (presumably None) - confirm before relying on this name.
cleaned_train_data = cleaned_train_dataset.to_csv('cleaned_train_dataset.csv')

# Shuffling data for better sampling

In [None]:
def shuffler(df):
  """Return ``df`` with its rows in a random order.

  The index labels are permuted with NumPy's global random state and the
  frame is reindexed by that permutation, so each row keeps its own label.
  No seed is set here; the order differs between runs unless the caller
  seeds ``np.random`` first.
  """
  shuffled_labels = np.random.permutation(df.index)
  reordered = df.reindex(shuffled_labels)
  return reordered
# Shuffled copy of the cleaned frame; new_data is not referenced again in the
# cells shown.
new_data = shuffler(cleaned_train_dataset)

In [None]:
# Shuffle all rows (frac=1) and renumber them from 0; this rebinds `df`, which
# previously held the four-column view of train_dataset. No random_state is
# passed, so the order differs between runs.
df = cleaned_train_dataset.sample(frac=1).reset_index(drop=True)

In [None]:
# train_dataset = pd.read_csv('dataset/cleaned_train_dataset.csv')
# test_dataset = pd.read_csv('dataset/cleaned_test_dataset.csv')
# train_dataset.fillna("Null", inplace=True)
# test_dataset.fillna("Null", inplace=True)
# train_dataset = train_dataset[['TITLE','BULLET_POINTS','BROWSE_NODE_ID']]
# test_dataset = test_dataset[['TITLE','BULLET_POINTS']]
# train_dataset["CODE"] = pd.Series(pd.factorize(train_dataset["BROWSE_NODE_ID"])[0])

In [None]:
# Experiment configuration.
COLUMN = "TITLE"          # text column fed to the vectorizer
TRAINER_SIZE = 100000     # number of shuffled rows kept for training/evaluation
SELECTED_VEC = "CV"       # "CV" (CountVectorizer) or "TFIDF" (TfidfVectorizer)
# Defined (it used to be commented out) because the TFIDF branch of the
# vectorizer cell reads it and would otherwise fail with NameError.
TFIDF_MAX_FEATURES = 500

# Importing and Loading Embedding Files


In [None]:
# Locations of the pre-trained embedding files.
# The Windows paths are raw strings: in a normal literal "\a" is the BEL
# control character (it corrupted the fastText path) and "\d" / "\g" / "\c"
# are invalid escapes that newer Python versions warn about.
# NOTE(review): these are machine-specific absolute paths - consider a
# configurable data directory.
GLOVE_EMBEDDING_FILE = r'E:\dataset\glove.840B.300d.txt\glove.840B.300d.txt'
FASTTEXT_EMBEDDING_FILE = r'E:\dataset\archive\crawl-300d-2M.vec'
WIKI_EMBEDDING_FILE =     '/kaggle/input/wikinews300d1mvec/wiki-news-300d-1M.vec'

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, vector)``.

    :param word: the token (first field of the line).
    :param arr: the remaining fields, converted to a float32 NumPy array.
    :return: tuple ``(word, np.ndarray[float32])``.
    """
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    """Read a space-separated text embedding file into a dict.

    Each line is stripped, split on single spaces and passed to get_coefs.
    Lines with fewer than three fields are skipped: that covers blank lines
    and a leading "<vocab_size> <dimension>" header line, which would
    otherwise be stored as a bogus word with a one-element vector.

    :param path: path of the UTF-8 encoded embedding file.
    :return: dict mapping word -> float32 vector; for a repeated word the
        last occurrence wins.
    """
    embeddings = {}
    with open(path,encoding="utf8") as f:
        for line in f:
            fields = line.strip().split(' ')
            if len(fields) < 3:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

In [None]:
def check_coverage(vocab,embeddings_index):
    """Report how much of a vocabulary is covered by an embedding index.

    Prints the share of distinct words that have an embedding and the share
    of all word occurrences they account for (both 0.00% for an empty
    vocabulary, which previously raised ZeroDivisionError).

    :param vocab: dict mapping word -> occurrence count.
    :param embeddings_index: mapping word -> vector; a missing word must
        raise ``KeyError`` on lookup.
    :return: list of ``(word, count)`` pairs for the out-of-vocabulary words,
        most frequent first.
    """
    # tqdm was used here without ever being imported; show a progress bar
    # when it is installed and fall back to plain iteration otherwise.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    covered = {}
    oov = {}
    covered_occurrences = 0
    oov_occurrences = 0
    for word in progress(vocab):
        # Only a missing key means "out of vocabulary"; the former bare
        # `except` also hid unrelated errors (and KeyboardInterrupt).
        try:
            covered[word] = embeddings_index[word]
            covered_occurrences += vocab[word]
        except KeyError:
            oov[word] = vocab[word]
            oov_occurrences += vocab[word]

    vocab_share = len(covered) / len(vocab) if len(vocab) else 0.0
    total_occurrences = covered_occurrences + oov_occurrences
    text_share = covered_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending sort by count, then reversed (same ordering as before, without
    # needing the `operator` module, which was never imported).
    sorted_x = sorted(oov.items(), key=lambda item: item[1])[::-1]

    return sorted_x

def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: show a progress bar while counting (only when tqdm is
        installed)
    :return: dictionary of words and their count
    """
    # tqdm was used here without ever being imported; use it when available
    # and fall back to plain iteration otherwise.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    vocab = {}
    for sentence in progress(sentences, disable = (not verbose)):
        for word in sentence:
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

def build_matrix(word_index, path):
    """Build an embedding matrix for a tokenizer vocabulary.

    :param word_index: dict mapping word -> integer row number.
    :param path: embedding file, read with load_embeddings.
    :return: array of shape ``(len(word_index) + 1, 300)``; rows of words
        that are absent from the embedding file stay all-zero.
    """
    known_vectors = load_embeddings(path)
    row_count = len(word_index) + 1
    matrix = np.zeros((row_count, 300))
    for token, row in word_index.items():
        if token in known_vectors:
            matrix[row] = known_vectors[token]
    return matrix

# Modelling data

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): this cell needs a `CODE` column on train_dataset; in the cells
# shown it is only created by the commented-out loading cell
# (pd.factorize of BROWSE_NODE_ID) - confirm that cell is run first.
counts = train_dataset['CODE'].value_counts()

# Keep at most 50000 rows each for codes 677 and 5; all other rows are kept.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
new = pd.concat([
    train_dataset[train_dataset["CODE"] == 677][:50000],
    train_dataset[train_dataset['CODE']!=677],
])
new = pd.concat([
    new[new["CODE"] == 5][:50000],
    new[new["CODE"]!=5],
])
# Drop codes that occur two times or fewer in the full training frame.
new = new[new['CODE'].isin(counts[counts > 2].index)]

# Shuffle, then keep the first TRAINER_SIZE rows as the working sample.
new = new.sample(frac=1).reset_index(drop=True)
X = new.drop(["BROWSE_NODE_ID","CODE"], axis=1)[:TRAINER_SIZE]
y = new["CODE"][:TRAINER_SIZE]
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Number of distinct labels in the sampled target column; shown as the cell
# output and used later as the width of the BERT model's output layer.
lengthy = y.nunique()
lengthy

5736

## KNN

In [None]:
# Turn the selected text column into feature matrices for the train split,
# the hold-out split and the real test set. The vectorizer is fitted on the
# train split only.
if SELECTED_VEC == "CV":
    print("Vectorizer : CV")
    cv = CountVectorizer()
    X_trans_train = cv.fit_transform(X_train[COLUMN])
    X_trans_test = cv.transform(X_test[COLUMN])
    X_trans_test_real = cv.transform(test_dataset[COLUMN])
elif SELECTED_VEC == "TFIDF":
    print("Vectorizer : TFIDF")
    td = TfidfVectorizer(max_features = TFIDF_MAX_FEATURES)
    X_trans_train = td.fit_transform(X_train[COLUMN]).toarray()
    X_trans_test=td.transform(X_test[COLUMN]).toarray()
    X_trans_test_real = td.transform(test_dataset[COLUMN])
else:
    # Previously an unknown value fell through silently and later cells kept
    # using whatever matrices were left over from an earlier run.
    raise ValueError("SELECTED_VEC must be 'CV' or 'TFIDF', got {!r}".format(SELECTED_VEC))

In [None]:
import os
import pickle

# Train one KNN classifier per neighbour count 3..24, keep each model in
# memory (knn_models / modelnames share the same order) and pickle it.
# The output directory is created first; open() would otherwise fail on a
# fresh checkout.
os.makedirs("knn_models", exist_ok=True)

knn_models = []
modelnames=[]
for i in range(3,25):
    modelname = "knn_cv_n"+str(i)
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_trans_train,y_train)
    knn_models.append(knn)
    modelnames.append(modelname)
    model_path = "knn_models/"+modelname+'.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(knn, f)
    # Report the path that was actually written (the old message named
    # "knn_model/" and omitted the extension).
    print("Wrote pkl file to "+model_path)

In [None]:
def predict_on_real(i, chunk_size=50000):
    """Predict the real test set with the KNN model trained for ``i`` neighbours.

    The model is taken from ``knn_models[i - 3]`` (the training loop starts at
    3 neighbours). Predictions are made in chunks of ``chunk_size`` rows and
    written to ``answers/ans_<modelname>.csv`` with a 1-based ``PRODUCT_ID``
    index.

    NOTE(review): the models are fitted on the `CODE` labels, so the values
    written under the `BROWSE_NODE_ID` header are codes - confirm whether
    they must be mapped back to the original node ids.

    :param i: neighbour count of the model to use.
    :param chunk_size: rows per ``predict`` call (default 50000, as before).
    :return: ``pd.Series`` of predictions indexed by PRODUCT_ID.
    """
    import os

    model = knn_models[i-3]
    modelname = modelnames[i-3]

    # Chunked prediction. The previous fixed two-way split at row 50000 sent
    # an empty slice to predict() for test sets of 50000 rows or fewer.
    total_rows = X_trans_test_real.shape[0]
    parts = [
        model.predict(X_trans_test_real[start:start + chunk_size])
        for start in range(0, total_rows, chunk_size)
    ]
    # np.concatenate + Series replaces Series.append (removed in pandas 2.0).
    ans = pd.Series(np.concatenate(parts))
    ans.index = range(1,len(test_dataset)+1)
    ans.index.name = 'PRODUCT_ID'

    os.makedirs("answers", exist_ok=True)
    ans.to_csv("answers/ans_"+modelname+".csv", header=['BROWSE_NODE_ID'])
    print("Wrote answers to "+"answers/ans_"+modelname+".csv")
    return ans

In [None]:
# Write a prediction file for every odd neighbour count from 3 to 23.
for i in range(3,25,2):
    predict_on_real(i)

# Deep Learning

In [None]:
from keras.layers import add, concatenate, Conv1D, MaxPooling1D, merge
from keras.layers import Embedding 
# from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
# from keras.utils import to_categorical

In [None]:
from keras.preprocessing.text import Tokenizer
# Vocabulary cap passed to Tokenizer(num_words=...).
MAX_NB_WORDS = 50000
# Length passed to pad_sequences(maxlen=...) for every title.
MAX_SEQUENCE_LENGTH = 250
# Not referenced in the cells shown.
EMBEDDING_DIM = 100

# The backslash in the filter list is written as "\\": the former "\]" was an
# invalid escape sequence (same resulting string, but newer Python warns).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
# The vocabulary is learned from the training titles only.
tokenizer.fit_on_texts(X_train.TITLE.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


def _to_padded_sequences(texts):
    """Map texts to integer sequences and pad them to MAX_SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)


X_trans_train = _to_padded_sequences(X_train.TITLE.values)
print('Shape of data tensor:', X_trans_train.shape)

X_trans_test = _to_padded_sequences(X_test.TITLE.values)
print('Shape of data tensor:', X_trans_test.shape)

X_trans_test_real = _to_padded_sequences(test_dataset.TITLE.values)
# Report the matrix that was just built (this line used to print the shape of
# X_trans_train again).
print('Shape of data tensor:', X_trans_test_real.shape)


Y = y
print('Shape of label tensor:', Y.shape)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense

# Fully connected classifier over the padded token-id sequences.
# NOTE(review): the first layer has no activation and the output width 9919 is
# hard-coded - confirm both against the label set actually used for training.
model = Sequential()
model.add(Dense(500, input_shape=(X_trans_train.shape[1],)))
model.add(Dense(400, activation = 'relu'))
model.add(Dense(300, activation = 'relu'))
model.add(Dense(9919, activation = 'softmax'))

# Sparse loss: labels are integer class ids, not one-hot vectors.
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

# Callbacks are imported from tensorflow.keras (top of this cell) so they come
# from the same Keras implementation as the model, not the standalone package.
# Stop after 5 epochs without a val_loss improvement of at least 0.0001 and
# keep only the best checkpoint on disk.
earlystop = EarlyStopping(monitor='val_loss',patience=5, min_delta=0.0001)
model_checkpoint = ModelCheckpoint(filepath='./model-weights.hdf5', save_best_only=True, monitor='val_loss')

callbacks = [
    earlystop,
    model_checkpoint
]



In [None]:
# Upper bound on epochs; the EarlyStopping callback may end training sooner.
epochs = 100
# Rows per gradient step for the dense model (batch_size is rebound to 16 in
# the Transformers section).
batch_size = 1024

# 20% of the training rows are held back by Keras for validation, which feeds
# the val_loss monitored by the callbacks.
history = model.fit(X_trans_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=callbacks)

# Transformers

In [None]:
seq_len = 512
num_samples = len(X)

# Token ids and attention masks are integers. Allocate int32 arrays to match
# the int32 Input layers of the BERT model instead of NumPy's float64 default
# (which also doubles the memory used by these two large arrays).
Xids = np.zeros((num_samples, seq_len), dtype='int32')
Xmask = np.zeros((num_samples, seq_len), dtype='int32')

# initialize tokenizer (this rebinds `tokenizer`, previously the Keras Tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, phrase in enumerate(X['TITLE']):
    # Every title is truncated/padded to exactly seq_len tokens. NumPy output
    # avoids creating a TensorFlow tensor per row only to copy it into NumPy.
    tokens = tokenizer.encode_plus(phrase, max_length=seq_len, truncation=True,
                                   padding='max_length', add_special_tokens=True,
                                   return_tensors='np')
    # The encoder returns a batch of one; take its single row.
    Xids[i, :] = tokens['input_ids'][0]
    Xmask[i, :] = tokens['attention_mask'][0]

In [None]:
# One dataset element per sample: (input ids, attention mask, label).
# `y` must still be the label column here; a later cell rebinds `y` to the
# model's output tensor.
labels=y
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(input_ids, masks, labels):
    """Repack one ``(ids, mask, label)`` element as ``(features, label)``.

    The features are a dict keyed by 'input_ids' and 'attention_mask', which
    are the names of the model's two input layers.
    """
    features = {}
    features['input_ids'] = input_ids
    features['attention_mask'] = masks
    return features, labels

# Apply map_func to every element so the dataset yields
# ({'input_ids': ..., 'attention_mask': ...}, label) pairs.
dataset = dataset.map(map_func)

In [None]:
batch_size = 16

# Shuffle with a 10000-element buffer, then batch, dropping any remaining
# samples that don't cleanly fit into a batch of 16.
# reshuffle_each_iteration=False keeps the order fixed across iterations:
# the dataset is later divided with take()/skip(), and with the default
# reshuffling every pass would draw a different split, leaking validation
# samples into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

In [None]:
# Fraction of the data used for training.
split = 0.9
# take()/skip() below are applied to the dataset after batching, so `size` is
# expressed in batches: (rows / batch_size) * split, truncated to an int.
size = int((Xids.shape[0]/batch_size)*split)

# get training and validation sets: the first `size` batches train, the rest
# validate
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [None]:
from transformers import TFAutoModel

# Load the pre-trained 'bert-base-cased' checkpoint (same name as the
# tokenizer used to encode the titles).
bert = TFAutoModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Two int32 inputs of length 512; their names match the keys of the feature
# dict produced by map_func.
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)
embeddings = bert.bert(input_ids, attention_mask=mask)[1]  # access pooled activations with [1]

# classification head: one hidden layer, then a softmax over `lengthy` classes
# NOTE(review): this rebinds `y`, which until now held the label column; the
# `labels=y` cell must not be re-run after this one.
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(lengthy, activation='softmax', name='outputs')(x)

In [None]:
# Wire the two inputs to the softmax head (this rebinds `model`, previously
# the dense classifier).
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# `lr=` is a deprecated alias and `decay=` is rejected by current Keras
# optimizers. InverseTimeDecay with decay_steps=1 gives the same schedule the
# old `decay` argument did: lr = 1e-5 / (1 + 1e-6 * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-5,
    decay_steps=1,
    decay_rate=1e-6,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Sparse loss/metric: labels are integer class ids, not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
acc = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
# Train for a single epoch on the training batches.
# NOTE(review): val_ds is built earlier but is not passed here, so no
# validation metrics are produced - confirm whether validation_data=val_ds
# was intended.
history = model.fit(
    train_ds,
    epochs=1,
    verbose=1
)

   2/5625 [..............................] - ETA: 90:16:37 - loss: 8.7278 - accuracy: 0.0000e+00