In [None]:
# Date (YYYY-MM-DD) of the crawled news batch to process; it is interpolated
# into the dataset folder and file names that the notebook reads and writes.
news_date = '2023-02-28'

In [None]:
# get article embeddings
# load libs
import os
import time
import warnings

import numpy as np
import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
warnings.filterwarnings('ignore')

# prompt design
# Instruction block that is concatenated AFTER the article text when the
# completion request is built (prompt = article + prompt_gl). It asks the model
# for one "Label: value" line per field, which is the shape the output parser
# in this notebook splits on.
# NOTE(review): later cells select columns named 'Sentiment' and
# 'Concise Summary', i.e. they assume the model answers with the short labels
# rather than echoing the parenthetical hints below - confirm on real output.
prompt_gl = "\nAs a hypothetical expert news aggregator who focuses mainly on banking operations, generate news tags, named entities, sentiment and summary for the following news article. Be honest and output NA if any of the field does not exist or if no news article is given in the input. Use the following format:\nTags:\nOrganisation:\nNamed Person:\nCountry:\nSentiment (positive/negative/neutral):\nConcise Summary (without omitting named entities):"
# Echo the prompt so the exact wording used for this run is visible in the cell output.
print('prompt:\n' + prompt_gl)


# implement retrying library to skip rate limit error
import openai  # for OpenAI API calls
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(10),
)
def completion_with_backoff(**kwargs):
    """Call ``openai.Completion.create`` under a tenacity retry policy.

    The decorator retries a failing call with a randomised exponential wait
    (bounds ``min=1``, ``max=60``) and gives up after 10 attempts.

    Args:
        **kwargs: forwarded unchanged to ``openai.Completion.create``.

    Returns:
        Whatever ``openai.Completion.create`` returns for the request.
    """
    response = openai.Completion.create(**kwargs)
    return response

# news factory function with tenacity backoff implemented
def news_factoryv2(news_row):
    """Send one article to the completion model and return the generated text.

    Args:
        news_row: a row object exposing the article body as ``full_text``
            (this is what ``DataFrame.apply(..., axis=1)`` passes in).

    Returns:
        The ``text`` of the first choice in the completion response.

    Raises:
        RuntimeError: if no API key is configured (neither the
            ``OPENAI_API_KEY`` environment variable nor ``openai.api_key``).
        Any exception raised by ``completion_with_backoff`` propagates.
    """
    # SECURITY: the API key used to be hard-coded on this line. A key that has
    # been committed to source must be treated as leaked - revoke it and supply
    # a fresh one through the OPENAI_API_KEY environment variable instead.
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        openai.api_key = api_key
    elif not getattr(openai, "api_key", None):
        raise RuntimeError(
            "No OpenAI API key configured: set the OPENAI_API_KEY environment variable."
        )

    content = news_row.full_text
    # A missing article body arrives as NaN/None; send an empty article so the
    # prompt's own "output NA if no news article is given" rule applies instead
    # of crashing the whole batch on str + float.
    if not isinstance(content, str):
        content = ""

    response = completion_with_backoff(
        model="text-davinci-003",
        prompt=content + prompt_gl,  # article first, instructions after
        temperature=0,  # to prevent entropy
        max_tokens=512,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    # keep only the generated text of the first choice
    return response['choices'][0]['text']

# function to extract details from llm into pd.series 
def extract_detailsv2(row):
    """Parse the model's ``Label: value`` lines into a ``pd.Series``.

    Each non-blank line is split on its FIRST colon only, so values that
    themselves contain colons (quotes, ratios, times) are kept whole. The label
    becomes the Series index entry; the value is stripped of surrounding
    whitespace and single quotes. If a label repeats, the last one wins.

    Args:
        row: the raw generated text for one article. Non-string input
            (e.g. NaN/None for a missing generation) yields an empty Series.

    Returns:
        pd.Series mapping label -> value for every parseable line.
    """
    details = {}
    if not isinstance(row, str):
        return pd.Series(details)

    for item in row.strip().split('\n'):
        item = item.strip()
        if not item:
            # blank separator lines carry no information
            continue
        key, sep, value = item.partition(':')
        if sep:
            details[key.strip()] = value.strip().strip("'")
        else:
            # surface lines that do not follow the "Label: value" shape
            print([item])
    return pd.Series(details)

# read the raw crawled news for the configured news_date
news_df = pd.read_csv(f'./datasets/ucrawler/{news_date}/rawnews_{news_date}.csv')
print('\ntotal number of news articles: ' + str(len(news_df)))

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 3000  # the maximum for text-embedding-ada-002 is 8191

encoding = tiktoken.get_encoding(embedding_encoding)

# omit articles that are too long
# NOTE(review): the token count is taken over the `text` column, while the
# completion prompt is built from `full_text` - confirm both columns exist and
# that filtering on `text` is really the intended length guard.
news_df["n_tokens"] = news_df.text.apply(lambda x: len(encoding.encode(x)))
news_df = news_df[news_df.n_tokens <= max_tokens]

##### set sample when doing test runs  #######
n = 1
# DataFrame.sample raises if asked for more rows than remain after filtering
n = min(n, len(news_df))
print(f'number of news articles for current run {n}')
news_df = news_df.sample(n)

# plan the batches: `runs` full batches of 30 rows plus `extra` leftover rows
runs = len(news_df) // 30
extra = len(news_df) % 30
print(f'number of batches: {runs}')
print(f'number of extra rows after runs complete, for last batch: {extra}')
print('Created a new dataframe called "pro_news" to store values')
pro_news = pd.DataFrame()
print('\nNext part is the CRITICAL PART - note that each run costs $$$, so try to minimise the number of redundant runs')
print('If you want to create sample runs, set sample within this block')

In [None]:
# Send the articles to the model in batches, pausing between batches, and
# accumulate the results in `pro_news` (created empty by the previous cell).
# Results are appended batch by batch so that completed (paid-for) batches are
# kept even if a later batch fails. Re-running this cell appends again - re-run
# the previous cell first to start from an empty `pro_news`.
batch_size = 30     # rows per batch
pause_seconds = 60  # wait between batches

batch_starts = list(range(0, len(news_df), batch_size))
if not batch_starts:
    print('no articles to process')

for i, start in enumerate(batch_starts):
    stop = min(start + batch_size, len(news_df))
    is_last = i == len(batch_starts) - 1

    if is_last:
        print('hang tight, one more batch to go...')
    else:
        word = 'first' if i == 0 else 'next'
        print(f'running {word} batch.... {start} to {stop}...')

    # .copy(): write the new column to an independent frame, not to a slice of news_df
    batch = news_df.iloc[start:stop].copy()
    # Apply the factory function to each row in the batch
    batch['gpt_out'] = batch.apply(news_factoryv2, axis=1)
    pro_news = pd.concat([pro_news, batch])
    print('success!')

    # Wait before sending the next batch; nothing to wait for after the last one
    if not is_last:
        time.sleep(pause_seconds)

In [None]:
print('Number of news articles: ' + str(len(pro_news)))

# store back-up as an independent copy, so later in-place edits to pro_news
# cannot reach it
back_news = pro_news.copy()

# extract details from column gpt_out and store into new columns
print('extracting generated fields...')
df_details = pro_news['gpt_out'].apply(extract_detailsv2)

key_fields = ['Tags', 'Organisation', 'Named Person', 'Country', 'Sentiment', 'Concise Summary']
# A field the model never produced has no column in df_details; reindex adds it
# as all-NaN instead of failing with a KeyError after the API calls were paid for.
missing_fields = [field for field in key_fields if field not in df_details.columns]
if missing_fields:
    print(f'warning: fields missing from every model output: {missing_fields}')
df_key = df_details.reindex(columns=key_fields)

# drop any field columns left over from a previous run of this cell so that
# re-running does not create duplicate column names
pro_news = pd.concat([pro_news.drop(columns=key_fields, errors='ignore'), df_key], axis=1)

# save dataset
print('saving dataset...')
path = f'./datasets/ucrawler/{news_date}/{news_date}_ucgpt.csv'
pro_news.to_csv(path, index=False)

In [None]:
import collections

def thing_counter(thing, df=None):
    """Count how often each comma-separated value occurs in one column.

    Every cell of ``df[thing]`` is split on commas and each piece is stripped.
    Placeholder pieces ('', 'NA', 'N/A', 'Ltd.') are dropped - all occurrences,
    not just the first - and known spelling variants are merged
    ('US' -> 'United States', 'Covid-19' -> 'COVID-19') before counting.
    The input frame is not modified.

    Args:
        thing: name of the column to count (e.g. 'Country', 'Tags').
        df: frame to read from; defaults to the notebook-level ``pro_news``.

    Returns:
        DataFrame sorted by descending count (ties keep first-seen order) with
        columns ``thing``, ``'Count'`` (as strings) and ``thing + '_count'``
        formatted as ``"value (count)"``.
    """
    frame = pro_news if df is None else df

    # pieces that are placeholders / split artefacts rather than real values
    noise = {'', 'NA', 'N/A', 'Ltd.'}
    # follow this mapping to standardise multiple spellings of the same word
    aliases = {'US': 'United States', 'Covid-19': 'COVID-19'}

    # fillna returns a new Series, so the caller's frame is left untouched
    cells = frame[thing].fillna("").astype(str)
    pieces = (piece.strip() for cell in cells for piece in cell.split(','))
    cleaned = [aliases.get(piece, piece) for piece in pieces if piece not in noise]

    # most_common() orders by count, highest first, keeping insertion order on ties
    ranked = collections.Counter(cleaned).most_common()

    new_df = pd.DataFrame(ranked, columns=[thing, 'Count'])
    new_df['Count'] = new_df['Count'].astype(str)
    new_df[thing + '_count'] = new_df[thing] + ' (' + new_df['Count'] + ')'

    return new_df

In [None]:
# Top-10 countries by frequency
country_df = thing_counter('Country').reset_index(drop=True)
country_sm = country_df.head(10)

# Top-10 organisations by frequency
org_df = thing_counter('Organisation').reset_index(drop=True)
org_sm = org_df.head(10)

# Keyword frequencies
keywords_df = thing_counter('Tags')

# A tag that merely repeats a counted country or organisation adds nothing new,
# so collect those names for exclusion
keywords_torem = [*country_df['Country'], *org_df['Organisation']]
not_already_counted = ~keywords_df['Tags'].isin(keywords_torem)
# ad hoc exclusion of one specific keyword
not_adhoc_excluded = keywords_df['Tags'] != 'involuntary manslaughter'
keywords_un = keywords_df[not_already_counted & not_adhoc_excluded].reset_index(drop=True)
keywords_sm = keywords_un.head(10)

# Place the three top-10 tables side by side
tags_df = pd.concat([country_sm, org_sm, keywords_sm], axis=1)

# Extra 'All' row, one (value, count, label) triple per table, so Tableau can
# use it as a parameter field
list_row = ['All', 'NA', 'NA'] * 3
tags_df.loc[len(tags_df)] = list_row

# Stamp the run date and a file label on every row
tags_df['date'] = news_date
tags_df['file'] = 'tags'
tags_df['date'] = pd.to_datetime(tags_df['date'])

tags_df.to_excel(f'./datasets/ucrawler/{news_date}/{news_date}_newstags.xlsx', index=False)
tags_df