In [None]:
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split
from datasets import load_metric

In [3]:
# Step 1: Load the Dataset
# The CSV path is relative to the notebook's working directory.
data_path = 'Womens Clothing E-Commerce Reviews.csv'
data = pd.read_csv(
    data_path,
)

In [None]:
# Step 2: Preprocess Data
# Keep only the rows that actually have a review, then pull the review
# column out as a plain Python list for the summarizer.
data = data.loc[data['Review Text'].notna()]
reviews = data['Review Text'].to_list()

In [None]:
# Step 3: Split Data into Train and Test sets
# Hold out 10% of the reviews; the fixed random_state keeps the split repeatable.
train_texts, test_texts = train_test_split(
    reviews,
    test_size=0.1,
    random_state=42,
)

# Step 4: Initialize the T5 Model and Tokenizer
# The tokenizer is loaded first, then the model, both from the same checkpoint name.
model_name = "t5-large"  # Using the larger T5 model for better performance
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

In [None]:
# Step 5: Define a Function to Summarize Text
def summarize_text(text, max_length=50, min_length=20):
    """Summarize ``text`` with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The review text to summarize (a string).
        max_length: Upper bound on the generated length, forwarded to
            ``model.generate``.
        min_length: Lower bound on the generated length, forwarded to
            ``model.generate``. It is clamped to ``max_length`` so that a
            small ``max_length`` never yields infeasible length constraints.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    input_text = "summarize: " + text
    # Inputs are truncated to 512 tokens before generation.
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    # Keep the inputs on the same device as the model so generation also works
    # when the model has been moved off the CPU.
    input_ids = input_ids.to(model.device)

    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        # Never ask for a minimum that is longer than the allowed maximum.
        min_length=min(min_length, max_length),
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # Only the first returned sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Step 6: Summarize the Training Data
# Only the first 100 training reviews are summarized, for demonstration.
train_summaries = list(map(summarize_text, train_texts[:100]))

# Optional Step 7: Evaluate Summarization Performance using ROUGE
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package; confirm against the installed version.
rouge = load_metric("rouge")

def evaluate_summaries(references, predictions):
    """Score ``predictions`` against ``references`` with the notebook-level ``rouge`` metric.

    Args:
        references: The reference texts.
        predictions: The generated summaries to score.

    Returns:
        Whatever ``rouge.compute`` returns for the given inputs.
    """
    return rouge.compute(predictions=predictions, references=references)

# Prepare data for evaluation (only if ground truth summaries are available).
# The original review text stands in as the reference here, which isn't ideal:
# a proper evaluation needs ground-truth summaries.
predictions = train_summaries
references = train_texts[:100]  # the same 100 reviews that were summarized

# Evaluate
scores = evaluate_summaries(references=references, predictions=predictions)
print(scores)

In [None]:
# Step 8: Summarize the Test Data (This is the final output of the project)
# Only the first 10 held-out reviews are summarized.
test_summaries = list(map(summarize_text, test_texts[:10]))

# Output the Test Summaries, numbered from 1
for i, summary in enumerate(test_summaries):
    print(f"Review {i+1} Summary: {summary}\n")

In [None]:
# Step 9: Save the test summaries to CSV.
# `test_summaries` is a plain Python list and has no `to_csv` method, so it is
# wrapped in a DataFrame first. Each summary is stored next to the review it was
# generated from; the summaries were produced from the leading test reviews, in order.
summary_df = pd.DataFrame({
    'Review Text': test_texts[:len(test_summaries)],
    'Summary': test_summaries,
})
summary_df.to_csv('summarized_reviews.csv', index=False)
