## Get first 10000 rows modified

In [None]:
import json

formatted_strings = []

with open("train.jsonl", "r") as f:
    for j, line in enumerate(f):
        if j <= 10000:
            continue
        # Parse the JSON data from the line
        data = json.loads(line.strip())
        rating = data['Rating']
        title = data['Title']
        review = data['Review']

        # If "Title: " appears in the review, trim off everything after its first occurrence.
        title_marker_index = review.find("Title: ")
        if title_marker_index != -1:
            review = review[:title_marker_index]

        # Format the string as required
        formatted_string = (
            f'"System prompt : Given the Rating and Title, you are required to generate the review" | '
            f'"Rating": {rating} | "Title": {title} | "Review": '
        )

        # Add the formatted string to the list
        formatted_strings.append(formatted_string)

print(f"Processed {len(formatted_strings)} lines.")


KeyError: 'Title'

In [None]:
formatted_strings[0]

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# save_directory = "."

# # Load the model with half precision if supported
# model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
# tokenizer = AutoTokenizer.from_pretrained(save_directory)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# model.to(device)
# model.eval()


# sample_prompt = ("System prompt: Given the Rating and Title, you are required to generate the review, "
#                  "Rating: 5, Title: Would definitely buy again, Review:")

# inputs = tokenizer(sample_prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
# inputs = {key: value.to(device) for key, value in inputs.items()}

# with torch.no_grad():
#     generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)
# generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# print("Generated text:", generated_text)


## Load model with DP and perform 10000 inferences


In [None]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory where your model is saved
save_directory = "."

# Load the model with half precision if supported
model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Set up the device: use CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)
model.eval()


In [None]:
# import time
# import json  # Import json to handle JSON serialization

# # Read formatted prompts from file. Each line should contain one formatted prompt.
# formatted_prompts = formatted_strings

# # Output file to save generated sequences in JSONL format
# output_file = "generated_sequences.jsonl"

# # Set batch size to 10 and initialize timing and batch results
# batch_size = 10
# results_batch = []
# batch_start_time = time.time()

# for i, prompt in enumerate(formatted_prompts):
#     # Tokenize the prompt
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
#     inputs = {key: value.to(device) for key, value in inputs.items()}

#     # Generate text from the prompt
#     with torch.no_grad():
#         generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)
#     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

#     results_batch.append(generated_text)

#     # Save batch and compute time after every 10 iterations
#     if (i + 1) % batch_size == 0:
#         batch_end_time = time.time()
#         batch_time = batch_end_time - batch_start_time

#         # Append ge


In [None]:
import time

output_file = "generated_sequences.jsonl"
batch_size = 100
formatted_prompts = formatted_strings
# Process prompts in batches
num_prompts = len(formatted_prompts)
for batch_start in range(0, num_prompts, batch_size):
    batch_prompts = formatted_prompts[batch_start : batch_start + batch_size]

    batch_start_time = time.time()

    # Tokenize the entire batch
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text for the batch
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)

    # Decode the generated sequences for each prompt
    batch_generated_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

    batch_end_time = time.time()
    batch_time = batch_end_time - batch_start_time

    # Write the generated outputs in JSONL format
    with open(output_file, "a") as outfile:
        for text in batch_generated_texts:
            json_line = json.dumps({"generated_text": text})
            outfile.write(json_line + "\n")

    print(f"Processed batch {(batch_start // batch_size) + 1} (prompts {batch_start} to {batch_start+len(batch_prompts)-1}). Time taken: {batch_time:.2f} seconds.")


## Load model without dp and run 10000 inferences

In [None]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory where your model is saved
save_directory = "./finetuned_no_dp"

# Load the model with half precision if supported
model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Set up the device: use CUDA if available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)
model.eval()


In [None]:
import time

output_file = "generated_sequences_no_dp.jsonl"
batch_size = 200
formatted_prompts = formatted_strings
# Process prompts in batches
num_prompts = len(formatted_prompts)
for batch_start in range(0, num_prompts, batch_size):
    batch_prompts = formatted_prompts[batch_start : batch_start + batch_size]

    batch_start_time = time.time()

    # Tokenize the entire batch
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate text for the batch
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=128, do_sample=True, top_k=50)

    # Decode the generated sequences for each prompt
    batch_generated_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

    batch_end_time = time.time()
    batch_time = batch_end_time - batch_start_time

    # Write the generated outputs in JSONL format
    with open(output_file, "a") as outfile:
        for text in batch_generated_texts:
            json_line = json.dumps({"generated_text": text})
            outfile.write(json_line + "\n")

    print(f"Processed batch {(batch_start // batch_size) + 1} (prompts {batch_start} to {batch_start+len(batch_prompts)-1}). Time taken: {batch_time:.2f} seconds.")


## Shortening the Prodfuct titles for amazon dataset


In [None]:
import json

data = []
with open("data/train.jsonl", "r") as file:
    for line in file:
        # Each line is a JSON object; decode it with json.loads
        data.append(json.loads(line))

# Now 'data' is a list of dictionaries
print(f"Loaded {len(data)} records.")
print(data[0])  # print the first record to check its content


Loaded 100000 records.
{'System prompt': 'Given the Rating and Title, you are required to generate the review', 'Rating': 4, 'Review Title': 'No white background! It’s clear!', 'Review': 'I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok.', 'Product Title': 'VUIIMEEK Square Case for iPhone 12 Pro Max 6.7",Cute White Flowers Clear Print Design Slim Flexible Soft TPU High Impact Shockproof Case Reinforced Bumper Cool Protective Crystal Cover (Green Leaves)', 'Product Categories': 'Cell Phones & Accessories'}


In [None]:
import json
import pandas as pd

# Read each line and convert it to a dictionary
data = [json.loads(line) for line in open("data/train.jsonl", "r")]
df = pd.DataFrame(data)

print("DataFrame shape:", df.shape)


DataFrame shape: (100000, 6)


In [None]:
str(df["Product Title"][1])

'Fitian Fitbit Ionic Charging Cable, Replacement USB Charging Cord Cable Accessories Charger Cable Adapter for Fitbit Ionic Wristband Smart Watch (2 Pcs Fitbit Ionic Charger Cable) …'

In [None]:
import time
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assume df is already defined with a "Product Title" column.
# For example, if you previously created it from your JSONL files:
# df = pd.read_json("new-data/train.jsonl", lines=True)

# Define the multi-shot prompt with your examples.
prompt_prefix = (
'''Task: Summarize the following product title into a concise 2–5 word product name. Your answer must include only the essential words—no extra commentary, punctuation, or formatting.

Example 1:
Product Title: VUIIMEEK Square Case for iPhone 12 Pro Max 6.7", Cute White Flowers Clear Print Design Slim Flexible Soft TPU High Impact Shockproof Case Reinforced Bumper Cool Protective Crystal Cover (Green Leaves)
Minimal Product Name: iphone 12 pro max square case

Example 2:
Product Title: Fitian Fitbit Ionic Charging Cable, Replacement USB Charging Cord Cable Accessories Charger Cable Adapter for Fitbit Ionic Wristband Smart Watch (2 Pcs Fitbit Ionic Charger Cable)
Minimal Product Name: Fitian Fitbit Ionic Charging Cable

Now, produce the minimal product name for the following product title:
Product Title: '''

)

# Build a list of prompts using the product titles from the DataFrame.
formatted_prompts = [prompt_prefix + f"\"{title}\"" for title in df["Product Title"] + "Minimal Product name is:"]

# Directory where your finetuned 8B Llama model is saved.
save_directory = "meta-llama/Llama-3.1-8B"  # Update this to your model's directory

# Load the model and tokenizer with half precision if supported.
model = AutoModelForCausalLM.from_pretrained(save_directory, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Set up the device (CUDA if available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)
model.eval()



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using device: cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [None]:

# Process prompts in batches of 200.
batch_size = 20
num_prompts = 100
# num_prompts = len(formatted_prompts)
shortened_titles = []

tokenizer.pad_token = tokenizer.eos_token

for batch_start in range(0, num_prompts, batch_size):
    batch_prompts = formatted_prompts[batch_start : batch_start + batch_size]
    batch_start_time = time.time()


    # Tokenize the batch of prompts.
    inputs = tokenizer(
        batch_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,  # Adjust if necessary.
        # padding_side='left'
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate outputs for the batch.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            # max_length=128,   # Adjust based on expected output length.
            # do_sample=True,   # Enable sampling.
            # top_k=50          # Adjust sampling parameters if needed.
        )

    # Decode outputs.
    batch_generated_texts = [
        tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids
    ]

    batch_end_time = time.time()
    print(f"Processed batch {(batch_start // batch_size) + 1} (prompts {batch_start} to {batch_start + len(batch_prompts) - 1}) in {batch_end_time - batch_start_time:.2f} seconds.")

    # Remove the prompt text from the generated output.
    for prompt, full_output in zip(batch_prompts, batch_generated_texts):
        shortened = full_output[len(prompt):].strip()
        shortened_titles.append(shortened)

# # Add the shortened titles as a new column to your DataFrame.
# df["Shortened Product Title"] = shortened_titles

# # Print the final DataFrame with the original and shortened product titles.
# print(df[["Product Title", "Shortened Product Title"]])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 1 (prompts 0 to 19) in 4.69 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 2 (prompts 20 to 39) in 3.59 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 3 (prompts 40 to 59) in 3.83 seconds.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Processed batch 4 (prompts 60 to 79) in 3.55 seconds.
Processed batch 5 (prompts 80 to 99) in 3.56 seconds.


In [None]:
shortened_titles

['php the_title();?"',
 '://',
 '_REFIXER iPhone XR Screen Replacement Kit 6.1 Inch LCD for iPhone XR 3',
 '://www.amazon.com/TORRAS-COOLIFY-Zone-Midnight/dp',
 '://www.amazon.com/dp/B08P4G8XGK?th=1',
 'Magnetic Wireless Charging Station for Apple Series, 3-in-1 Standard 15W Fast Mag',
 '://',
 '://',
 '://iphone 5c case',
 '://:6 Pack 3M VHB Sticker Pads for Socket Mount Base Grip Stand',
 '://s3.amazonaws.com/answers/answers/2018-10-10-10-',
 '://www.amazon.com/dp/B07H3YRZVQ?th=1',
 '://www.amazon.com/dp/B07X6Y2XVJ?th=1',
 '://apple iphone 7 plus 8 plus case',
 '://iphone 7 plus case"\n\nYou can use the following regex to match the product name:',
 '://',
 '://www.amazon.com/Cell-Phone-Clip-Stand-Holder/dp/B07P',
 '://www.amazon.com/dp/B07H6GJF7Z?th=1',
 '://www.amazon.com/Bluetooth-Headset-Microphone-Qualcomm-Canceling-Tr',
 '://www.amazon.com/dp/B01M0QWJL4\n\nSolution: The',
 '://: TechMatte Car Mount, MagGrip Air Vent Magnetic Universal Car Mount Holder for',
 '://www.amazon.com/d

In [None]:
formatted_prompts[:100]

['Task: Summarize the following product title into a concise 2–5 word product name. Your answer must include only the essential words—no extra commentary, punctuation, or formatting.\n\nExample 1:\nProduct Title: VUIIMEEK Square Case for iPhone 12 Pro Max 6.7", Cute White Flowers Clear Print Design Slim Flexible Soft TPU High Impact Shockproof Case Reinforced Bumper Cool Protective Crystal Cover (Green Leaves)\nMinimal Product Name: iphone 12 pro max square case\n\nExample 2:\nProduct Title: Fitian Fitbit Ionic Charging Cable, Replacement USB Charging Cord Cable Accessories Charger Cable Adapter for Fitbit Ionic Wristband Smart Watch (2 Pcs Fitbit Ionic Charger Cable)\nMinimal Product Name: Fitian Fitbit Ionic Charging Cable\n\nNow, produce the minimal product name for the following product title:\nProduct Title: "VUIIMEEK Square Case for iPhone 12 Pro Max 6.7",Cute White Flowers Clear Print Design Slim Flexible Soft TPU High Impact Shockproof Case Reinforced Bumper Cool Protective Cry

In [None]:
!pip install --upgrade google-genai
!gcloud auth application-default login

Collecting google-genai
  Downloading google_genai-1.4.0-py3-none-any.whl.metadata (29 kB)
Downloading google_genai-1.4.0-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.0/141.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-genai
  Attempting uninstall: google-genai
    Found existing installation: google-genai 0.2.2
    Uninstalling google-genai-0.2.2:
      Successfully uninstalled google-genai-0.2.2
Successfully installed google-genai-1.4.0


Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=A90yj7eguFBAmzMqSvmKjXW53Ghd3E&prompt=consent&token_usage=remote&access_type=offline&code_challenge=30pDNLQhZSN8x7q-iDkn7lDD5_KBQr-EEu_LIWAsWKQ&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 4/0AQSTgQGR-nZ4NmfcNgmrI0DkdC4KnPRh6NdV8w49w0_I06DjHkZaZUqxB4wKAlAyBG-UpQ

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Ca

In [None]:
!gcloud auth application-default set-quota-project

[1;31mERROR:[0m (gcloud.auth.application-default.set-quota-project) argument QUOTA_PROJECT_ID: Must be specified.
Usage: gcloud auth application-default set-quota-project QUOTA_PROJECT_ID [optional flags]
  optional flags may be  --help

For detailed information on this command and its flags, run:
  gcloud auth application-default set-quota-project --help
