In [None]:
def _cosine_similarity_matrix(row_vectors, col_vectors):
    """Return pairwise cosine similarities between two sets of vectors.

    Args:
        row_vectors: array-like of shape (n_rows, dim), or a single vector.
        col_vectors: array-like of shape (n_cols, dim), or a single vector.

    Returns:
        numpy.ndarray of shape (n_rows, n_cols); entry [i, j] is
        dot(row_i, col_j) / (norm(row_i) * norm(col_j)). A zero-length
        vector yields NaN for its entries.
    """
    import numpy as np

    rows = np.atleast_2d(np.asarray(row_vectors))
    cols = np.atleast_2d(np.asarray(col_vectors))
    dots = rows @ cols.T
    norms = np.linalg.norm(rows, axis=1)[:, None] * np.linalg.norm(cols, axis=1)[None, :]
    # A zero norm produces NaN rather than an exception.
    with np.errstate(divide='ignore', invalid='ignore'):
        return dots / norms


def calculate_relevance(news_date, use_current_date = False):
    """Score every news summary against a fixed list of Areas Of Interest (AOI).

    Reads './datasets/ucrawler/<date>/<date>_ucgpt.csv', drops rows whose
    'Summary' is NA, embeds the summaries and the AOI labels with the
    'multi-qa-distilbert-cos-v1' SentenceTransformer model, and stores the
    cosine similarity of each (summary, AOI) pair.

    Two files are written next to the input:
      * '<date>_cosim.csv'      - wide format, one column per AOI plus a
                                  1-based 'index' column placed first.
      * '<date>_longcosim.xlsx' - long format with columns 'tag' (the AOI),
                                  'cosim' (the score) and 'file' (always
                                  'headlines').

    Args:
        news_date: date string used to build the dataset paths.
        use_current_date: when True, today's date (str(datetime.now().date()))
            is used instead of news_date for every path.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Imports are kept local so the cell stays self-contained.
    from datetime import datetime
    import warnings

    import numpy as np
    import pandas as pd
    from sentence_transformers import SentenceTransformer

    # ignore warnings (note: this changes the process-wide warning filter)
    warnings.filterwarnings('ignore')

    # The effective date drives every path below, so the override really applies.
    run_date = str(datetime.now().date()) if use_current_date else news_date
    data_dir = f'./datasets/ucrawler/{run_date}'

    # news data with GPT generated summaries and entities
    raw_df = pd.read_csv(f'{data_dir}/{run_date}_ucgpt.csv')
    print('total articles in loaded news: ' + str(len(raw_df)))

    print('removing rows with NA values in Summary')
    # Work on a fresh, re-indexed frame instead of mutating the loaded one.
    unews_df = raw_df.dropna(subset=['Summary']).reset_index(drop=True)

    # your Areas Of Interests (AOI) go here
    colnames = ['savings accounts', 'credit cards', 'housing loans', 'wealth', 'SME banking', 'international trade',
                'supply chain', 'stock market', 'debt market', 'cryptocurrencies',
                'artificial intelligence', 'natural disaster', 'service outage',
                'cyber security', 'consumer banking', 'banking regulations', 'fraud and scams']

    # columns to keep as identifiers when melting (everything that is not an AOI score)
    cols_keep = list(unews_df.columns)

    print('Running BERT model to calculate news relevance to Areas Of Interests (this might take a few mins, hang tight)')
    model = SentenceTransformer('multi-qa-distilbert-cos-v1')  # different models could be experimented here

    summaries = unews_df['Summary'].astype(str).tolist()
    if summaries:
        # Encode each summary and each AOI label exactly once, then compare all pairs.
        scores = _cosine_similarity_matrix(model.encode(summaries), model.encode(colnames))
    else:
        # No usable rows: still emit the AOI columns so the outputs keep their schema.
        scores = np.empty((0, len(colnames)))

    for position, aoi in enumerate(colnames):
        unews_df[aoi] = scores[:, position]

    # 1-based running index as the first column
    unews_df.insert(0, 'index', np.arange(1, len(unews_df) + 1))

    # store wide format df in case
    unews_df.to_csv(f'{data_dir}/{run_date}_cosim.csv', index=False)

    # convert to long: one row per (article, AOI) pair
    news_dflong = pd.melt(
        unews_df,
        id_vars=cols_keep + ['index'],
        value_vars=colnames,
        var_name='tag',
        value_name='cosim',
    )
    # add file column to df
    news_dflong['file'] = 'headlines'

    print('saving dataset...')
    # store results
    news_dflong.to_excel(f'{data_dir}/{run_date}_longcosim.xlsx', index=False)
    print('success!')

In [None]:
# Score the news file dated 2023-02-28 against the areas of interest.
calculate_relevance(news_date='2023-02-28')