In [34]:
import warnings
warnings.filterwarnings('ignore')

In [36]:
# Installing Kaggle and uploading token so that data can be pulled from Kaggle directly
!pip install kaggle -q

# Upload the kaggle.json file
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file

# Move the kaggle.json file to the correct location
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [37]:
# Download the dataset
!kaggle datasets download -d PromptCloudHQ/flipkart-products

# Unzip the dataset
!unzip flipkart-products.zip

Dataset URL: https://www.kaggle.com/datasets/PromptCloudHQ/flipkart-products
License(s): CC-BY-SA-4.0
Archive:  flipkart-products.zip
replace flipkart_com-ecommerce_sample.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: flipkart_com-ecommerce_sample.csv  


In [38]:
import pandas as pd

# Load the dataset
dfProductData = pd.read_csv('flipkart_com-ecommerce_sample.csv')

# Display the first 5 rows
dfProductData.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


In [44]:
# Checking relevant columns
dfProductData.columns
dfInputData = dfProductData[["pid", "product_name",  "description", "brand", "product_specifications"]]

In [45]:
# The Product_specification columns is in a Dictionary format. The following function creates
# a text only version of the 'value' part of the dictionary
import ast

# Function to convert specifications to text
def specs_to_text(specs_str):
    # Handle NaN or missing values
    if pd.isna(specs_str) or specs_str == '':
        return ''

    try:
        # Convert string to dictionary
        specs_dict = ast.literal_eval(specs_str.replace('=>', ':'))
        # Extract values
        values = [item['value'] for item in specs_dict['product_specification']]
        # Combine values into a single string
        return ', '.join(values)
    except (ValueError, KeyError, TypeError, SyntaxError) as e:
        # print(f"Error processing specifications: {e}")
        return ''  # Return empty string for invalid data

# Apply the function to the DataFrame
dfInputData['specs_text'] = dfInputData['product_specifications'].apply(specs_to_text)

# Display the updated DataFrame
dfInputData.head()

Unnamed: 0,pid,product_name,description,brand,product_specifications,specs_text
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ...","Pack of 3, Cotton Lycra, Cycling Shorts, Solid..."
1,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati...",Installation and demo for this product is done...
2,SHOEH4GRSUBJGZXE,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,AW,"{""product_specification""=>[{""key""=>""Ideal For""...","Women, Casual, Red, Patent Leather, 1 inch, Pa..."
3,SRTEH2F6HUZMQ6SJ,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ...","Pack of 2, Cotton Lycra, Cycling Shorts, Solid..."
4,PSOEH3ZYDMSYARJ5,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",...","Dog, Sicons, 500 ml, SH.DF-14, All Purpose, Ar..."


In [46]:
# Since we have a clean version of this column, getting rid of it
dfInputData.drop(["product_specifications"], inplace=True, axis=1)

In [47]:
# Checking for Nulls
dfInputData.isna().sum()

Unnamed: 0,0
pid,0
product_name,0
description,2
brand,5864
specs_text,0


In [48]:
# The Brand column seems to have 29% null values and doesn't seem to be important for Keyword
# extraction as the product description will contain it anyway.
null_perc = dfInputData.isna().sum().loc["brand"]/dfInputData.shape[0]
print(f"The column 'brand' has {null_perc} % Nulls")

The column 'brand' has 0.2932 % Nulls


In [49]:
# We can safely drop this field.
# The description column has 2 nulls and product specifications has 14 nulls.
# Since both columns are important for our analysis, dropping these products as well.
dfInputData.dropna(inplace=True)

In [50]:
print(f"The dataset has {dfInputData.shape[0]} records.")
dfInputData.isna().sum()

The dataset has 14135 records.


Unnamed: 0,0
pid,0
product_name,0
description,0
brand,0
specs_text,0


In [51]:
dfInputData.head(2)

Unnamed: 0,pid,product_name,description,brand,specs_text
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,Alisha,"Pack of 3, Cotton Lycra, Cycling Shorts, Solid..."
1,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,Installation and demo for this product is done...


In [52]:
# Description already has a brand.
dfInputData.drop('brand', inplace=True, axis=1)

In [53]:
dfInputData.head(1)

Unnamed: 0,pid,product_name,description,specs_text
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,"Pack of 3, Cotton Lycra, Cycling Shorts, Solid..."


In [54]:
# The prpduct description seems to be too long and has a lot of room for text cleaning
# The specs_text column when appended with the product name will have a similar effect.
dfInputData["input_text"] = dfInputData["product_name"] + ". " + dfInputData["specs_text"]

In [55]:
dfInputData = dfInputData[["pid", "product_name", "input_text"]]

In [56]:
import nltk

# Download required resources
nltk.download('punkt')  # For tokenization
nltk.download('stopwords')  # For stopwords
nltk.download('averaged_perceptron_tagger_eng')  # For POS tagging
nltk.download('wordnet')  # For lemmatization
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [57]:
import re
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def fnpreprocess_text(text):
    """
    Preprocesses the input text by performing the following steps:
    1. Lowercasing
    2. Removing punctuation (Maintaining them as we will use some encoder model where punctuation has a meaning)
    3. Removing stopwords
    4. Removing numbers (Maintaining them as they can be important)
    5. Removing special characters and codes (Maintaining them as they can be important)
    6. POS tagging
    7. Lemmatization (using POS tags)
    8. Removing extra spaces

    Args:
        text (str): The input text to preprocess.

    Returns:
        str: The preprocessed text.
    """
    # Step 1: Lowercasing
    text = text.lower()

    # Step 2: Remove punctuation
    # text = re.sub(r'[^\w\s]', '', text)

    # Step 3: Remove stopwords
    # stop_words = set(stopwords.words('english'))
    # text = ' '.join([word for word in text.split() if word not in stop_words])

    # Step 4: Remove numbers
    # text = re.sub(r'\d+', '', text)

    # Step 5: Remove special characters and codes (e.g., ALTHT_3P_21)
    # text = re.sub(r'\b\w*_\w*\b', '', text)
    # text = re.sub(r'\s+', ' ', text).strip()

    # Step 6: Tokenization and POS tagging
    # tokens = word_tokenize(text)
    # pos_tags = pos_tag(tokens)

    # Step 7: Lemmatization (using POS tags)
    # lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(treebank_tag):
        """
        Converts Treebank POS tags to WordNet POS tags for lemmatization.

        Args:
            treebank_tag (str): The Treebank POS tag.

        Returns:
            str: The corresponding WordNet POS tag.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # Default to noun if no match

    # lemmatized_tokens = []
    # for word, tag in pos_tags:
    #     wordnet_pos = get_wordnet_pos(tag)
    #     lemmatized_word = lemmatizer.lemmatize(word, pos=wordnet_pos)
    #     lemmatized_tokens.append(lemmatized_word)

    # Step 8: Join tokens and remove extra spaces
    # preprocessed_text = ' '.join(lemmatized_tokens)
    # preprocessed_text = ' '.join(tokens)
    # preprocessed_text = re.sub(r'\s+', ' ', preprocessed_text).strip()

    # return preprocessed_text
    return text

In [58]:
dfInputData["cleaned_text"] = dfInputData["input_text"].apply(fnpreprocess_text)

In [59]:
dfInputData.head(1)

Unnamed: 0,pid,product_name,input_text,cleaned_text
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts,Alisha Solid Women's Cycling Shorts. Pack of 3...,alisha solid women's cycling shorts. pack of 3...


In [60]:
# Importing transformers to load LLMs from HuggingFace and torch because LLMs are built on Torch
!pip install transformers torch -q

In [61]:
!pip install -U bitsandbytes -q

In [62]:
import torch

In [63]:
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16, # Perform computations in 16-bit floating point precision to save memory
    bnb_4bit_quant_type="nf4",            # Use 'nf4' type of 4-bit quantization
    bnb_4bit_use_double_quant=True,       # Enable double quantization
)



In [64]:
# Need access to huggingface token because model usage requires authentication
hf_token = '<enter your token here>'

from huggingface_hub import login
login(token=hf_token)

In [65]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [66]:
# To move the LLM on the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [67]:
def generate_text(prompt, max_new_tokens=100):
    # The function takes as input the prompt
    # max_length = 50 specifies the number of keywords that the model should generate
    # Uses the T5Tokenizer to tokenize the input.
    # return_tensors = "pt" implies telling the llm to return pytorch tensors
    # to(device) moves to GPU
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # model.generates : given the tokenized inputs, generates a completion
    # max_length: gives the number of tokens the model will generate
    # num_beams: beam width - generates 5 different combinations and returns
    # the one with best cumulative probability
    outputs = model.generate(inputs["input_ids"], max_new_tokens = max_new_tokens, num_beams=2, early_stopping=True)

    # outputs: completion generated by the LLM
    # skip_special_tokens: asks the LLM to not include tokens such as <unk> or <eos> in the completion
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [68]:
# # Defining a function to get rid of instruction part from the output sequence
# (The output sequence has instruction + generated text)
def remove_instructions(text):
    """
    Remove the instruction part from the input text (which is the output sequence from LLM)
    and return the cleaned text (generated response only).

    Parameters:
    text (str): The input text containing instructions and generated content.

    Returns:
    str: The text (output sequence/response) with instructions removed, trimmed of leading and trailing whitespace and newlines.
    """
    # Remove instructions + leading and trailing whitespace and newlines
    cleaned_text = text.split("[/INST]")[-1].strip()
    return cleaned_text


In [69]:
# Zero Shot Inferencing
# input_text = f"[INST] Generate keywords for the following text corpus. Return the keywords as a comma-separated list. Text: {dfInputData.loc[3]['cleaned_text']} [/INST]"
# keywords = generate_text(input_text)

In [70]:
# One shot - In context learning
input_text = f"""
[INST] Generate keywords for the following text corpus. Return them as a comma-separated list.
Example:
Text: Women's solid cycling shorts with cotton lycra.
Keywords: womens cycling shorts, cotton lycra, solid shorts

Text: {dfInputData.loc[3]['cleaned_text']} [/INST]
"""


In [71]:
keywords = generate_text(input_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [72]:
# One-shot In context learning seems to be working fine. Using it for the prompt
def generate_output(text):
  input_text = f"""
                [INST] Generate keywords for the following text corpus. Return them as a comma-separated list.
                Example:
                Text: Women's solid cycling shorts with cotton lycra.
                Keywords: womens cycling shorts, cotton lycra, solid shorts

                Text: {text} [/INST]
                """
  keywords = generate_text(input_text)
  output = remove_instructions(keywords)
  cleaned_output = output.replace("Keywords: ", "")
  cleaned_output = output.replace("Text: ", "")
  return cleaned_output


In [73]:
# Reducing the size of the data due to Colab GPU usage constraints
dfInputDataSample = dfInputData.head(100)

In [74]:
dfInputDataSample["keywords"] = dfInputDataSample["cleaned_text"].apply(generate_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

In [75]:
dfInputDataSample.head()

Unnamed: 0,pid,product_name,input_text,cleaned_text,keywords
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts,Alisha Solid Women's Cycling Shorts. Pack of 3...,alisha solid women's cycling shorts. pack of 3...,"1. women's cycling shorts,\n 2...."
1,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed. Installat...,fabhomedecor fabric double sofa bed. installat...,"Keywords: sofa bed, fabhomedecor, microfiber, ..."
2,SHOEH4GRSUBJGZXE,AW Bellies,"AW Bellies. Women, Casual, Red, Patent Leather...","aw bellies. women, casual, red, patent leather...","Women's shoes, casual shoes, red shoes, patent..."
3,SRTEH2F6HUZMQ6SJ,Alisha Solid Women's Cycling Shorts,Alisha Solid Women's Cycling Shorts. Pack of 2...,alisha solid women's cycling shorts. pack of 2...,"1. women's cycling shorts,\n 2...."
4,PSOEH3ZYDMSYARJ5,Sicons All Purpose Arnica Dog Shampoo,"Sicons All Purpose Arnica Dog Shampoo. Dog, Si...","sicons all purpose arnica dog shampoo. dog, si...","1. sicons,\n 2. all purpose,\n ..."


In [76]:
dfInputDataSample.to_csv("KeywordLLMSample.csv", index=False)

In [None]:
dfInputData["keywords"] = dfInputData["cleaned_text"].apply(generate_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio