In [29]:
import pandas as pd
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import sparse_categorical_crossentropy
import re

In [11]:
#read master data
master_data_path = "/Users/arpitkjain/Desktop/Data/POC/ShopNXT/backend/data/product_master_data.csv"
df = pd.read_csv(master_data_path)
print(df.head())
print(df.count())

   Unnamed: 0        product_id                  category          ...           product_brand                                            img_src                  combo
0           0  1604988230_35748  Foodgrains, Oil & Masala          ...                  BORGES  https://www.bigbasket.com/media/uploads/p/s/12...     ['2 L', '1040.00']
1           1  1604988230_35748  Foodgrains, Oil & Masala          ...                  BORGES  https://www.bigbasket.com/media/uploads/p/s/12...      ['1 L', '675.00']
2           2  1604988230_35748  Foodgrains, Oil & Masala          ...                  BORGES  https://www.bigbasket.com/media/uploads/p/s/12...   ['2x2 L', '1981.00']
3           0  1604988231_57988  Foodgrains, Oil & Masala          ...                 Fortune  https://www.bigbasket.com/media/uploads/p/s/12...      ['1 L', '123.00']
4           1  1604988231_57988  Foodgrains, Oil & Masala          ...                 Fortune  https://www.bigbasket.com/media/uploads/p/s/12...  ['3x910 

In [14]:
unique_data = df.drop_duplicates(subset=["product_name", "category","sub_category_name"])

In [10]:
unique_data.count()

Unnamed: 0           1821
product_id           1821
category             1821
sub_category_name    1821
product_name         1820
product_brand        1820
img_src              1821
combo                1821
dtype: int64

In [15]:
unique_data["product_name_concatenated"] = unique_data["product_brand"] + "_" + unique_data["product_name"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
unique_data.tail()

Unnamed: 0.1,Unnamed: 0,product_id,category,sub_category_name,product_name,product_brand,img_src,combo,product_name_concatenated
10176,0,1604851283_11044,Gourmet & World Food,Oils & Vinegar,Olive Oil,Colavita,https://www.bigbasket.com/media/uploads/p/mm/4...,"['250 ml', '395.00']",Colavita_Olive Oil
10181,0,1604851297_52164,Gourmet & World Food,Oils & Vinegar,Aura Extra Virgin Olive & Flaxseed Oil,Saffola,https://www.bigbasket.com/media/uploads/p/mm/4...,"['500 ml', '420.00']",Saffola_Aura Extra Virgin Olive & Flaxseed Oil
10200,0,1604851304_96484,Gourmet & World Food,Oils & Vinegar,100% Organic Pomace Olive oil,Organic Origins,https://www.bigbasket.com/media/uploads/p/mm/4...,"['1 L', '891.00']",Organic Origins_100% Organic Pomace Olive oil
10214,0,1604851344_30084,Gourmet & World Food,Oils & Vinegar,White Vinegar,Neo,https://www.bigbasket.com/media/uploads/p/mm/4...,"['370 ml', '112.50']",Neo_White Vinegar
10218,0,1604851347_28644,Gourmet & World Food,Oils & Vinegar,Balsamic Vineger,Neo,https://www.bigbasket.com/media/uploads/p/mm/4...,"['2x370 ml', '585.00']",Neo_Balsamic Vineger


In [22]:
prod_name = unique_data["product_name_concatenated"].tolist()

In [65]:
def preprocess_text(data:list):
    processed_data = []
    for value in data:
        cleaned_value = re.sub('[^a-zA-Z]+'," ", str(value))
        cleaned_value = re.sub("\s\s+"," ", str(cleaned_value))
        cleaned_value = cleaned_value.lower()
        processed_data.append(cleaned_value)
    return processed_data
        

In [66]:
prod_name = preprocess_text(prod_name)

In [67]:
word_token = Tokenizer(num_words=20000, oov_token="UNK")
word_token.fit_on_texts(prod_name)
sequence_train = word_token.texts_to_sequences(prod_name)
sequence_train

[[522, 61, 5, 52, 126],
 [362, 320, 168, 5, 437, 363],
 [438, 33, 321, 218],
 [113, 87, 154, 102, 1185, 364, 5],
 [1186, 5, 841],
 [523, 168, 320, 5],
 [362, 365, 5, 842, 843],
 [1187, 844, 5],
 [362, 168, 5, 25, 845],
 [113, 366, 154, 1188, 656, 364, 5],
 [12, 33, 321, 367, 218],
 [1189, 1190, 80, 5],
 [524, 1191, 80, 5],
 [1192, 218],
 [523, 5, 846, 322],
 [523, 5, 365, 842, 843],
 [113, 244, 364, 5],
 [55, 33, 218],
 [1193, 102, 366, 168, 320, 5],
 [169, 117, 33, 321, 218],
 [524, 841, 5, 847, 33],
 [323, 1194, 219, 320, 5],
 [848, 168, 320, 5, 22, 276, 849, 322],
 [362, 168, 5, 118, 657],
 [523, 168, 5, 25, 845, 10, 1195, 245, 439],
 [323, 5, 440],
 [524, 847, 658, 5],
 [1196, 80, 5, 170, 94, 127],
 [323, 5, 87, 363, 322],
 [850, 850, 851, 1197, 218],
 [524, 80, 5, 1198, 94, 1199, 170],
 [852, 61, 368, 5],
 [853, 87, 1200, 844, 5],
 [362, 1201, 154, 37, 656, 364, 5],
 [854, 94, 127, 170, 80, 5],
 [1202, 5, 1203, 168, 846],
 [369, 1204, 364, 5, 855, 22, 245, 246, 247, 439, 856],
 [8