In [None]:
import json
from transformers import BartTokenizer, BartForConditionalGeneration

# Summarize email threads with BART.
#
# Pipeline: load the pretrained tokenizer/model, read a JSON list of email
# records, group the email bodies by thread_id, generate one summary per
# thread, and write {thread_id: {subject, summary}} to a JSON file.

# Pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'facebook/bart-large-cnn'
# Input dataset: a JSON array of records with 'thread_id', 'subject', 'body'.
INPUT_PATH = 'test.json'
# Destination for the per-thread summaries.
OUTPUT_PATH = 'thread_summaries2.json'
# Token budget handed to the tokenizer; longer threads are truncated to this
# length (truncation=True), so text past it does not reach the model.
MAX_INPUT_TOKENS = 1024
# Upper bound on the generated summary length, in tokens.
MAX_SUMMARY_TOKENS = 300

# Load BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

# Load the dataset. The encoding is explicit so that non-ASCII characters in
# the emails are read the same way regardless of the platform's locale.
with open(INPUT_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Group emails by thread_id. The subject of the first email seen for a thread
# is kept as the thread's subject; bodies are collected in file order.
threads = {}
for email in data:
    thread_id = email['thread_id']
    if thread_id not in threads:
        threads[thread_id] = {'subject': email['subject'], 'emails': []}
    threads[thread_id]['emails'].append(email['body'])

# Summarize each thread
thread_summaries = {}
for thread_id, thread_data in threads.items():
    # Concatenate all email bodies within the thread
    thread_body = " ".join(thread_data['emails'])

    if thread_body.strip():
        # Tokenize and summarize the thread body
        inputs = tokenizer(
            thread_body,
            return_tensors='pt',
            max_length=MAX_INPUT_TOKENS,
            truncation=True,
        )
        # Pass the attention mask produced by the tokenizer along with the
        # input ids so generation does not have to guess which tokens are real.
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            num_beams=4,
            max_length=MAX_SUMMARY_TOKENS,
            early_stopping=True,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Turn non-breaking spaces into regular spaces. Deleting them outright
        # would glue the neighbouring words together.
        summary = summary.replace('\u00a0', ' ')
    else:
        # Nothing to summarize: record an empty summary instead of asking the
        # model to generate text from an empty input.
        summary = ''

    # Store the thread summary along with the subject
    thread_summaries[thread_id] = {'subject': thread_data['subject'], 'summary': summary}

# Save thread summaries to a file. ensure_ascii=False writes non-ASCII
# characters literally (as UTF-8) rather than as \uXXXX escape sequences.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(thread_summaries, f, indent=4, ensure_ascii=False)

print(f"Summaries saved to '{OUTPUT_PATH}'")


Summaries saved to 'thread_summaries2.json'
