# Title summary generation using Bedrock APIs


### Imports

In [2]:
import json
import os
import sys
import pandas as pd
from time import time
from tqdm import tqdm
import threading


In [3]:
# if 100k file 
# with open('data/110kclusters_eps0.05.json') as data_file:
with open('data/clusters_eps-0.15.json') as data_file:
    file_contents = data_file.read()
    parsed_json = json.loads(file_contents)

# Manually process streaming articles, then summarize

In [4]:
# get only the clusters from the output and sort the articles in chronical order
# if 100k file
# cluster_info = {cluster:sorted(parsed_json[cluster], key=lambda x:x['publication_date'], reverse=False) for cluster in parsed_json.keys() if 'cluster' in cluster}
cluster_info = {cluster:sorted(parsed_json[cluster], key=lambda x:x['publication_date'], reverse=False) for cluster in parsed_json.keys() if int(cluster)>=0}
# check the clusters article numbers
art_num = sorted([(key, len(cluster_info[key])) for key in cluster_info.keys()], key=lambda x:x[1], reverse=True)
print('Total number of cluster is %s, total number of articles is %s' % (len(cluster_info), sum([ii[1] for ii in art_num])))
print('The top 10 number of cluster has the following number of articles:', art_num[:10])

Total number of cluster is 330, total number of articles is 1382
The top 10 number of cluster has the following number of articles: [('0', 86), ('69', 45), ('24', 31), ('60', 30), ('79', 30), ('137', 26), ('14', 25), ('126', 19), ('18', 18), ('51', 17)]


In [5]:
# Step 2. remove duplicates in each of the clusters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def remove_dup_article(cluster, threshold):
    """
    # for each of the cluster, use tfidf for the headline and summary, 
    # return the index of the earlier articles. 
    input: cluster -> list of dictionary 
    return list of dictionary where the output are dedeplicated. 
    """
    summary_corps = [ids['headline'] + ids['summary'] for ids in cluster]
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
    
    try: 
        tfidf_matrix = vectorizer.fit_transform(summary_corps)
        similarity_score = tfidf_matrix * tfidf_matrix.T.A
        dup_idx = set([min(x, y) for x, y in zip(*np.where(similarity_score > threshold)) if x != y])
        if len(dup_idx) >= 5:
            print(f"there are {len(dup_idx)} duplicate articles removed from {cluster[0]['object_id']}")
        return [d for i, d in enumerate(cluster) if i not in dup_idx]
    except:
        return cluster

In [6]:
start_time = time()
cluster_dedup = {cluster: remove_dup_article(cluster_info[cluster], 0.9) for cluster in cluster_info.keys()}
print('total time cost for deduplicate is %3.2fs' % (time()-start_time))
dedup_num = sorted([(key, len(cluster_dedup[key])) for key in cluster_dedup.keys()], key=lambda x:x[1], reverse=True)
print('Total number of cluster is %s, total number of articles is %s' % (len(cluster_dedup), sum([ii[1] for ii in dedup_num])))
print('The top 10 number of cluster has the following number of articles:', dedup_num[:10])

total time cost for deduplicate is 0.99s
Total number of cluster is 330, total number of articles is 1262
The top 10 number of cluster has the following number of articles: [('0', 86), ('69', 45), ('24', 31), ('60', 30), ('79', 27), ('14', 25), ('137', 24), ('126', 19), ('51', 17), ('18', 16)]


In [61]:
# Genearte summarization 
import boto3
def generate_bedrock_claude(input_tokens,
                            bedrock_client = boto3.client('bedrock-runtime'),
                            modelId = 'anthropic.claude-instant-v1'):
    claude_body = {
        'modelId': modelId,
        'body': json.dumps({
            "prompt": f"\n\nHuman: {input_tokens}\n\nAssistant: ", 
            "max_tokens_to_sample": 500, # the higher this is the longer it takes
            "temperature": 0.1, # these parameters affect response diversity
            "top_p": 1, 
            "top_k": 100,
        })
    }
    bedrock_response = bedrock_client.invoke_model(
        **claude_body,
        accept="*/*",
        contentType="application/json",
    )
    body = bedrock_response.get("body")
    rd = body.read()
    try:
        response = eval(rd)['completion']
        output_token_cnt = int(
            bedrock_response["ResponseMetadata"]["HTTPHeaders"].get(
                "x-amzn-bedrock-output-token-count"
                )
            )
        input_token_cnt = int(
            bedrock_response["ResponseMetadata"]["HTTPHeaders"].get(
                "x-amzn-bedrock-input-token-count"
                    )
            )
    except:
        print(rd)
    return input_token_cnt, output_token_cnt, response

def parse_res(res):
    try:
        title = res.split('<title>')[-1].split('</title>')[0]
        summary = res.split('<summary>')[-1].split('</summary>')[0]
        return f"Title:{title} \nSummary:{summary}"
    except:
        return res
def generate_cluster_summary(cluster, num_sum=20):
    cluster_sum = ['']
    input_context = []
    inputs = '\n'.join([f"headline: {article['headline']}, summary: {article['summary']}" for article in cluster])
    input_token_num = 0
    output_token_num = 0
    try:
        for ii in range(0, len(cluster), num_sum):
            instructions = "You will be provided with multiple sets of headlines and summaries from different articles in <context> tag, and the current title and summary for a story in <story> tag. Compile, summarize and update the current title and summary for the story. The summary should be less than 100 words. Put the generated context inside <title> and <summary> tag. Do not hallucinate or make up content.\n\n"
            texts = '\n'.join([f"headline: {article['headline']}, summary: {article['summary']}" for article in cluster[ii:ii+num_sum]])
            prompt = f"{instructions} <story> \n{cluster_sum[-1]} </story> \n\n <context>\n{texts}\n</context>\n"
            input_context.append(prompt)
            output = generate_bedrock_claude(prompt[:120000])
            input_token_num += output[0]
            output_token_num += output[1]
            response = parse_res(output[2])
            cluster_sum.append(response)
        return '\n\n'.join([f'Iteration {ii} generated results: \n{text}' for ii, text in enumerate(cluster_sum[1:])]), input_token_num, output_token_num, inputs
    except:
        print(cluster[0]['est_cluster'])

import asyncio
async def async_generate_cluster_summary(cluster):
    return await asyncio.to_thread(generate_cluster_summary, cluster)

In [62]:

start_time = time()
results = await asyncio.gather(*(async_generate_cluster_summary(cluster_dedup[cluster]) for cluster in cluster_dedup.keys()))
print("total run time is %3.2fs" % (time()-start_time))


total run time is 135.39s


In [63]:
# results analysis part
outputs = []
for ii, key in enumerate(cluster_dedup.keys()):
    try:
        response, input_token_num, output_token_num, input_text = results[ii]
        # inputs = [ids['headline'] + ids['summary'] for ids in cluster_dedup[key]]
        outputs.append((key, input_text, input_token_num, len(cluster_dedup[key]), response))
    except:
        print(key)

In [65]:
df_outputs = pd.DataFrame(outputs, columns=['cluster_id', 'inputs', 'input_token', 'article_num', 'generate_text'])
df_outputs.head()

Unnamed: 0,cluster_id,inputs,input_token,article_num,generate_text
0,0,headline: Shooting in France shows US is not a...,8205,86,Iteration 0 generated results: \nTitle:\nRiots...
1,1,"headline: AP News in Brief at 11:04 pm EDT , s...",506,5,Iteration 0 generated results: \nTitle:Supreme...
2,2,headline: NSW: Thousands delayed as winds forc...,845,9,Iteration 0 generated results: \nTitle:\nSydne...
3,3,headline: Europeaposs Euclid space telescope s...,921,9,Iteration 0 generated results: \nTitle:\nEurop...
4,4,headline: Ugle-Hagan shines as Bulldogs disman...,521,4,Iteration 0 generated results: \nTitle:\nUgle-...


In [66]:
df_outputs.to_csv('new_data_summary_claude_instant_5.csv', index=False)

In [11]:
cluster_dedup['0'][-1]['publication_date']

'2023-07-03T19:40:00Z'

In [24]:
print(df_outputs.loc[0, 'inputs'][0])

You will be provided with multiple sets of headlines and summaries from different articles in <context> tag, and the current title and summary for a story in <story> tag. Compile, summarize and update the current title and summary for the story. The summary should be less than 100 words. Put the generated context inside <title> and <summary> tag. Do not hallucinate or make up content.

 <story> 
 </story> 

 <context>
headline: Shooting in France shows US is not alone in struggles with racism police brutality , summary: The events in France following the death of a 17- year-old shot by police in a Paris suburb are drawing parallels to the racial reckoning in the US spurred by the killings of George Floyd and other people of color at the hands of law enforcement Despite the differences between the two countriesapos cultures police forces and communities the shooting in France and
headline: Clashes continue for fourth night of riots after fatal police shooting in France , summary: While 

In [25]:
for ii in range(5):
    print(df_outputs.loc[0, 'outputs'][ii])

Title:
Riots in France force Macron to postpone state visit to Germany
 
Summary:
French President Emmanuel Macron postponed a state visit to Germany that was scheduled to begin on Sunday after four consecutive nights of nationwide riots in France following the police killing of a 17-year-old teenager. Over 1300 people were arrested during the fourth night of unrest as tensions remain high. The riots have exposed issues of police brutality and racial inequality in France. Macron cancelled the trip to focus on dealing with the crisis within his own country.

Title:
Riots in France force Macron to postpone state visit to Germany and meet mayors
 
Summary:
French President Emmanuel Macron postponed a state visit to Germany and cancelled a trip to meet German leaders to focus on dealing with riots in France after the police killing of a 17-year-old teenager. Over 1300 people have been arrested during the riots across major cities in France. Macron is set to meet several mayors to address t

In [71]:
outputs[173]

('173',
 'headline: Hajj 2023 - 13 Nigerian Pilgrims Die in Saudi Arabia - NAHCON , summary: Thirteen pilgrims from Nigeria have so far died in the course of the 2023 Hajj in Saudi Arabia the National Hajj Commission of Nigeria has said In his presentation Mr Galadima said seven pilgrims died before Mishaaposir while another six died during the Mishaaposir Osun and Kaduna states recorded two fatalities each while Plateau Kaduna Borno Yobe Lagos and Benue\nheadline: Nigerian Pilgrimsapos Death Toll Hits 13 As 41632 Fall Sick in Saudi Arabia , summary: The head of the Nigerian medical team for the pilgrimage Dr Usman Galadima stated this Sunday night in Makkah during post-Arafat review session He said his team made consultations for 25772 pilgrims during the Muna-Arafat period in addition to the 15680 treated in Madinah and Makkah during the pre-Arafat period He gave the breakdown of the casualties as:',
 330,
 2,
 'Iteration 0 generated results: \nTitle:Summary: Death toll rises among N

In [75]:
print(generate_cluster_summary(cluster_dedup['9'])[0])

Iteration 0 generated results: 
Title:
Fremantle star Nat Fyfe suffers another setback with foot injury
 
Summary:
Fremantle midfielder Nat Fyfe was substituted out of his team's loss to the Western Bulldogs on Saturday due to a flare up of his troublesome left foot injury. Scans later revealed the dual Brownlow medalist had suffered a stress fracture in the same foot, which has been plaguing him throughout the season. It is another setback for Fyfe and Fremantle, who still hope their champion can return before the end of the year to help their finals push, but will need to carefully manage the injury.



In [74]:
print(generate_cluster_summary(cluster_dedup['173'])[0])

Iteration 0 generated results: 
Title:Summary: Death toll rises among Nigerian Hajj pilgrims as over 41,000 fall sick in Saudi Arabia 
Summary:According to officials from the National Hajj Commission of Nigeria (NAHCON), the death toll among Nigerian pilgrims performing the Hajj this year has increased to 13 with over 41,632 falling sick so far in Saudi Arabia. The head of the Nigerian medical team reported that 7 pilgrims died before the Mishaaposir ritual while another 6 died during Mishaaposir, with Osun, Kaduna, Plateau, Borno, Yobe, Lagos and Benue states recording casualties. The medical team provided consultations for over 25,000 pilgrims during Arafat and treated over 15,000 in Madinah and Makkah during the pre-Arafat period.
