In [4]:
news_date = '2023-03-02'

In [125]:
# get article embeddings
# load libs
import tiktoken
from openai.embeddings_utils import get_embedding
import openai
import pandas as pd
import time
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#content = 'sample news'

# prompt design
prompt_gl = "\nAs a hypothetical expert news aggregator who focuses mainly on banking operations, generate news tags, named entities, sentiment and summary for the above news article.\nFollow these rules in your output:\n1. Be honest and output NA if any of the field does not exist or if no news article is given in the input.\n2. Do not omit named entites in the summary.\n3. Sentiment should be only within positive, negative or neutral\n4. Strictly use the following format:\nTags:\nOrganisation:\nNamed Person:\nCountry:\nSentiment:\nConcise Summary:"
print('prompt:\n' + prompt_gl)


# implement retrying library to skip rate limit error
import openai  # for OpenAI API calls
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
def completion_with_backoff(**kwargs):
    return openai.Completion.create(**kwargs)

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
def completion_with_backoff_chat(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

# news factory function with tenacity backoff implemented
def news_factoryv2(news_row):

    openai.api_key = "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"
    
    content = news_row.full_text
    response = completion_with_backoff(
    model="text-davini-003",
    prompt= content + prompt_gl,
    temperature=0, # to prevent entropy
    max_tokens=512,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
    )
    text = response['choices'][0]['text'] # get only the text part
    #time.sleep(1) # to prevent server error

    return text

# news factory function with tenacity backoff implemented
def news_factory_chat(news_row):

    openai.api_key = "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"
    
    content = news_row.full_text
    response = completion_with_backoff_chat(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "assistant", "content": content + prompt_gl},
        ]
    )
    text = response['choices'][0]['message']['content'] # get only the text part

    return text

# function to extract details from llm into pd.series 
def extract_detailsv2(row):
    details = {}
    for item in row.strip().split('\n'):
        split_item = item.strip().split(':')
        if len(split_item) >= 2:
            key, value = split_item[:2]
            details[key.strip()] = value.strip().strip("'")
        else:
            #print(split_item)
            pass
    return pd.Series(details)

# read headline news 1 feb
news_df = pd.read_csv(f'./datasets/ucrawler/{news_date}/{news_date}_rawnews.csv')
print('\ntotal number of news articles: ' + str(len(news_df)))
news_df.head(5)

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 3000  # the maximum for text-embedding-ada-002 is 8191

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
news_df["n_tokens"] = news_df.text.apply(lambda x: len(encoding.encode(x)))
news_df = news_df[news_df.n_tokens <= max_tokens]

##### set sample when doing test runs  #######
n = 5
print(f'number of news articles for current run {n}')
news_df = news_df.sample(n)

runs = len(news_df) // 30
extra = len(news_df) % 30
print(f'number of batches: {runs}')
print(f'number of extra rows after runs complete, for last batch')
print('Created a new dataframe called "pro_news" to store values')
pro_news = pd.DataFrame()
print('\nNext part is the CRITICAL PART - note that each run costs $$$, so try to minimise the number of redundant runs')
print('If you want to create sample runs, set sample within this block')

prompt:

As a hypothetical expert news aggregator who focuses mainly on banking operations, generate news tags, named entities, sentiment and summary for the above news article.
Follow these rules in your output:
1. Be honest and output NA if any of the field does not exist or if no news article is given in the input.
2. Do not omit named entites in the summary.
3. Sentiment should be only within positive, negative or neutral
4. Strictly use the following format:
Tags:
Organisation:
Named Person:
Country:
Sentiment:
Concise Summary:

total number of news articles: 166
number of news articles for current run 5
number of batches: 0
number of extra rows after runs complete, for last batch
Created a new dataframe called "pro_news" to store values

Next part is the CRITICAL PART - note that each run costs $$$, so try to minimise the number of redundant runs
If you want to create sample runs, set sample within this block


In [126]:
# in case of failure to run the entire loop because of open ai server side issue or meteorite crash,
# change for 'i in range(0, int(run))' to 'i in range(x, int(run))' where x = (number of articles completed/batch size)
# batch size is 30 by default so x = (n of articles completely run)/ 20

for i in range(0, int(runs)):
    
    if i == 0:
        word = 'first'
    else:
        word = 'next'
    
    print(f'running {word} batch.... ' + str(i*30) + ' to ' + str((i+1)*30) + '...')
    
    batch = news_df.iloc[(i*30):(i+1)*30]
    # Apply the factory function to each row in the batch
    batch['gpt_out'] = batch.apply(news_factory_chat, axis=1)

    pro_news = pd.concat([pro_news, batch])
    # Wait for 1 minute before sending the next batch
    time.sleep(60)

    print('success!')

#final ones
print('hang tight, one more batch to go...')
batch = news_df.iloc[(runs*30):len(news_df)]
# Apply the factory function to each row in the batch
batch['gpt_out'] = batch.apply(news_factory_chat, axis=1)
pro_news = pd.concat([pro_news, batch])

print('success!')

hang tight, one more batch to go...
success!


In [128]:
print('Number of news articles: ' + str(len(pro_news)))

# store back-up
back_news = pro_news

# extract details from column gpt_out and store into new columns
print('extracting generated fields...')
df_details = pro_news['gpt_out'].apply(extract_detailsv2)

df_key = df_details[['Tags', 'Organisation', 'Named Person', 'Country', 'Concise Summary']]
pro_news = pd.concat([pro_news, df_key], axis=1)

# change some column name
pro_news.rename(columns={'Concise Summary':'Summary'}, inplace=True)

# save dataset
print('saving dataset...')
path = f'./datasets/ucrawler/{news_date}/{news_date}_ucgpt.csv'
pro_news.to_csv(path, index=False)

Number of news articles: 5
extracting generated fields...
saving dataset...


In [129]:
pro_news

Unnamed: 0,title,text,pub_time,meta_images,source,url,full_text,n_tokens,gpt_out,Tags,Organisation,Named Person,Country,Summary
161,Malaysia’s Anwar says Asean cannot view Myanma...,· PM Anwar says the 10-nation grouping should ...,2023-03-02 00:04:18.114000+00:00,https://cdn.i-scmp.com/sites/default/files/sty...,SCMP,https://www.scmp.com/news/asia/southeast-asia/...,Malaysia’s Anwar says Asean cannot view Myanma...,63,\n\nTags: #Malaysia #Myanmar #ASEAN #SouthChin...,#Malaysia #Myanmar #ASEAN #SouthChinaSea #junt...,ASEAN,Anwar,"Malaysia, Myanmar, China",Malaysian PM Anwar has stated that ASEAN canno...
100,Chinese tourists won’t get true Japanese welco...,Tourists enjoy a rickshaw ride in Tokyo. Photo...,2023-03-02 02:13:31.339000+00:00,https://cdn.i-scmp.com/sites/default/files/sty...,SCMP,https://www.scmp.com/week-asia/economics/artic...,Chinese tourists won’t get true Japanese welco...,210,"\n\nTags: tourism, travel industry, labor shor...","tourism, travel industry, labor shortage, pand...",,Sayaka Hamano,"Japan, China",Japan's tourism industry is concerned about th...
65,Chee Hong Tat on criteria for Government socia...,The criteria for the Government's social suppo...,2023-03-02 03:34:19.499000+00:00,https://onecms-res.cloudinary.com/image/upload...,CNA,https://www.channelnewsasia.com/watch/chee-hon...,Chee Hong Tat on criteria for Government socia...,140,"\n\nTags: Government social support schemes, A...","Government social support schemes, Annual Valu...",Government of Singapore,Chee Hong Tat,Singapore,Senior Minister of State for Finance Chee Hong...
54,Vo Van Thuong elected as Vietnam's new president,"HANOI, March 2 (Xinhua) -- Vo Van Thuong, a me...",2023-03-02 03:43:40.836000+00:00,,Xinhua,https://english.news.cn/20230302/f9e7878b77bb4...,Vo Van Thuong elected as Vietnam's new preside...,153,"Tags: Vietnam, Vo Van Thuong, President, Commu...","Vietnam, Vo Van Thuong, President, Communist P...","Communist Party of Vietnam, National Assembly","Vo Van Thuong, Nguyen Xuan Phuc, Vo Thi Anh Xuan",Vietnam,Vo Van Thuong has been elected as Vietnam's ne...
16,Singapore launches $125 million coastal resear...,SINGAPORE - A $125 million research programme ...,2023-03-02 04:42:28.816000+00:00,https://static1.straitstimes.com.sg/s3fs-publi...,Straits Times,https://www.straitstimes.com/singapore/politic...,Singapore launches $125 million coastal resear...,542,"Tags: coastal protection, flood management, re...","coastal protection, flood management, research...","National water agency PUB, National University...",,Singapore,Singapore is set to launch a $125m research pr...


In [132]:
import collections
import re

def thing_counter(thing):

    pro_news[thing].fillna("", inplace=True)

    pattern = r'[,#]'
    country_sr = pro_news[thing].apply(lambda x: re.split(pattern, x))

    # Convert the 'Country' column into a list
    country_list = country_sr.tolist()
    #print(country_list)
    #print(len(country_list))
    # Flatten the list of lists into a single list using a list comprehension
    flat_list = [item for sublist in country_list for item in sublist]
    
    #print(flat_list)
    if ' Ltd.' in flat_list:
        flat_list.remove(' Ltd.')

    if 'Ltd.' in flat_list:
        flat_list.remove('Ltd.')

    if 'N/A' in flat_list:
        flat_list.remove('N/A')

# follow this script to standardise multiple spellings of the same word
    flat_list = ['United States' if x == 'US' else x for x in flat_list]
    flat_list = ['United States' if x == ' US' else x for x in flat_list]
    flat_list = ['COVID-19' if x == 'Covid-19' else x for x in flat_list]
    flat_list = ['COVID-19' if x == ' Covid-19' else x for x in flat_list]

    if 'NA' in flat_list:
        flat_list.remove('NA')


    # Remove the surrounding white spaces from the list
    stripped_countries = [country.strip() for country in flat_list]
    #print(stripped_countries)
    # Create a frequency count of the countries
    country_counts = dict(collections.Counter(stripped_countries))

    # Create a new dataframe with the country and its count
    new_df = pd.DataFrame({thing: list(country_counts.keys()), 'Count': list(country_counts.values())})

    new_df = new_df
    new_df = new_df[new_df[thing] != '']
    new_df = new_df[new_df[thing] != 'NA']
    new_df = new_df.sort_values(by='Count', ascending=False).reset_index(drop=True)
    new_df.Count = new_df['Count'].apply(lambda x: str(x))
    new_df[thing + '_count'] = new_df[thing] + ' (' + new_df['Count'] + ')'

    return new_df

In [133]:
# count countries
country_df = thing_counter('Country')
country_df = country_df.reset_index(drop=True)
country_sm = country_df[0:10]

# count organisations
org_df = thing_counter('Organisation')
org_df = org_df.reset_index(drop=True)
org_sm = org_df[0:10]

# count keywords
keywords_df = thing_counter('Tags')

# remove keywords that have already been captured in country and organisation
keywords_torem = country_df['Country'].to_list() + org_df['Organisation'].to_list()
# Use the isin function to remove the rows in keywords that contain keywords in the list
keywords_un = keywords_df[~keywords_df['Tags'].isin(keywords_torem)]
# other adhoc keyword removal
keywords_un = keywords_un[keywords_un.Tags != 'involuntary manslaughter']
keywords_un.reset_index(drop=True, inplace=True)
keywords_sm = keywords_un[0:10]

# concatenate all dfs
tags_df = pd.concat([country_sm, org_sm, keywords_sm], axis=1)

# add 'All' column to have it as a parameter field in tableau
list_row = ['All', 'NA', 'NA', 'All', 'NA', 'NA', 'All', 'NA', 'NA']
tags_df.loc[len(tags_df)] = list_row

# add date and file name
tags_df['date'] = news_date
tags_df['file'] = 'tags'

tags_df.date = pd.to_datetime(tags_df.date)

tags_df.to_excel(f'./datasets/ucrawler/{news_date}/{news_date}_newstags.xlsx', index=False)
tags_df

Unnamed: 0,Country,Count,Country_count,Organisation,Count.1,Organisation_count,Tags,Count.2,Tags_count,date,file
0,China,2.0,China (2),ASEAN,1.0,ASEAN (1),climate change,1.0,climate change (1),2023-03-02,tags
1,Singapore,2.0,Singapore (2),Government of Singapore,1.0,Government of Singapore (1),research programme,1.0,research programme (1),2023-03-02,tags
2,Malaysia,1.0,Malaysia (1),Communist Party of Vietnam,1.0,Communist Party of Vietnam (1),flood management,1.0,flood management (1),2023-03-02,tags
3,Myanmar,1.0,Myanmar (1),National Assembly,1.0,National Assembly (1),coastal protection,1.0,coastal protection (1),2023-03-02,tags
4,Japan,1.0,Japan (1),National water agency PUB,1.0,National water agency PUB (1),President,1.0,President (1),2023-03-02,tags
5,Vietnam,1.0,Vietnam (1),National University of Singapore,1.0,National University of Singapore (1),Vo Van Thuong,1.0,Vo Van Thuong (1),2023-03-02,tags
6,,,,Nanyang Technological University,1.0,Nanyang Technological University (1),Silver Generation Office,1.0,Silver Generation Office (1),2023-03-02,tags
7,,,,Singapore University of Technology and Design,1.0,Singapore University of Technology and Design (1),eligibility,1.0,eligibility (1),2023-03-02,tags
8,,,,Singapore Institute of Technology,1.0,Singapore Institute of Technology (1),Annual Value,1.0,Annual Value (1),2023-03-02,tags
9,,,,Agency for Science,1.0,Agency for Science (1),Government social support schemes,1.0,Government social support schemes (1),2023-03-02,tags
