In [1]:
import pandas as pd
import numpy as np

In [2]:
import random

# Sample data to create a larger dataset
regions = ["Berlin", "Sydney", "New York", "Tokyo", "London"]
categories = [
    "Fruits, Snacks", "Meat, Bakery", "Vegetables, Dairy",
    "Seafood, Beverages", "Frozen Foods, Grains"
]
revenue_range = [f"${round(random.uniform(2, 10), 2)}M" for _ in range(100)]
growth_trends = [f"+{random.randint(5, 20)}% YoY" for _ in range(100)]
units_sold = [random.randint(10000, 100000) for _ in range(100)]
insights = [
    "Supply chain improvements boosted performance.",
    "Local sourcing initiatives appealed to customers.",
    "Seasonal promotions were highly effective.",
    "Demand for organic options increased sales.",
    "Targeted marketing campaigns showed results."
]
recommendations = [
    "Collaborate with local farmers for unique offerings.",
    "Introduce bundle deals to increase average order size.",
    "Enhance supply chain logistics to maintain freshness.",
    "Expand visibility through targeted digital marketing campaigns.",
    "Increase consumer awareness through promotions."
]

# Generate the dataset
data_points = []
for _ in range(1000):  # Generate 1000 records
    region = random.choice(regions)
    category = random.choice(categories)
    revenue = random.choice(revenue_range)
    growth = random.choice(growth_trends)
    units = random.choice(units_sold)
    insight = random.choice(insights)
    recommendation = random.choice(recommendations)

    input_text = f"Region: {region}; Product Categories: {category}; Total Revenue: {revenue}; Growth Trend: {growth}; Units Sold: {units}"
    output_text = f"Overview: Sales performance for {category.lower()} in {region} showed promising growth. Sales Figures: - Total Revenue: {revenue} ({growth}). - Units Sold: {units} (combined). Key Insights: {insight} Recommendations: {recommendation}"

    # Combine input and output in a single line
    data_points.append(f"Input: {input_text}, Output: {output_text}")

# Save to a .txt file
file_name = "large_combined_dataset.txt"
with open(file_name, "w") as file:
    file.write("\n".join(data_points))

print(f"Large dataset with 1000 records saved to {file_name}")


Large dataset with 1000 records saved to large_combined_dataset.txt


In [3]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [13]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load dataset
dataset = load_dataset("csv", data_files="food_service_dataset.csv", delimiter=",")  # Ensure correct path and delimiter

# Check if columns are correctly named or print dataset columns for reference
print(dataset)

# Load pre-trained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocess the data for T5
def preprocess_function(examples):
    inputs = [f"Generate a detailed report: {text}" for text in examples["input_text"]]
    outputs = examples["output_text"]

    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(outputs, truncation=True, padding="max_length", max_length=512)

    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split dataset into train and test sets (if not already split)
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)  # Adjust test size as needed
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5")

# Save the tokenizer
tokenizer.save_pretrained("./fine_tuned_t5")


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 1000
    })
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.3809,0.147193
2,0.1324,0.044198
3,0.0889,0.032754


('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json')

In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_t5")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_t5")

# Define your input text (this should match the format of your training data)
input_text = "Region: Berlin; Product Categories: Fruits, Snacks; Total Revenue: $4.92M; Growth Trend: +13% YoY; Units Sold: 59950; Breakdown: Fruits: 15754; Snacks: 11923"

# Preprocess the input (similar to the preprocessing done during training)
input_ids = tokenizer(f"Generate a detailed report: {input_text}", return_tensors="pt").input_ids

# Generate the output using the model
output_ids = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)

# Decode the generated output
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)


Overview: Sales performance for fruits, snacks in Berlin showed promising growth. Sales Figures: - Total Revenue: $4.92 million (up 13% YoY). - Units Sold: 59950 (combined). Key Insights: Seasonal promotions were highly effective. Recommendations: Increase visibility through targeted digital marketing campaigns.
