# Daily Retreat: Using Sentiment Analysis to<br>Find, Personalize, and Share Positive News from Popular Online Sources
__Aaron Carr, Azucena Faus, and Dave Friesen - ADS-599-01-SU23__

In [1]:
# Notebook-level metadata, using the module dunder convention
__author__ = 'Aaron Carr, Azucena Faus, Dave Friesen'
__email__ = 'acarr@sandiego.edu, afaus@sandiego.edu, dfriesen@sandiego.edu'  # listed in the same order as __author__
__version__ = '1.0'
__date__ = 'July/August 2023'

## Setup

In [2]:
# Are we running in Google Colab, or not?
try:
    from google.colab import drive
    COLAB = True
except:
    COLAB = False

if COLAB:
    # Establish Google Drive connection
    drive.mount('/content/drive', force_remount = True)

    # Establish working directory
    %cd '/content/drive/My Drive/599_team_project/deliverables'

In [3]:
if COLAB:
    # Install known missing libraries in Colab
    !pip install transformers
    !pip install openai
    !pip install langchain
    !pip install pinecone-client

# Import basic and data access libraries
import numpy as np
import pandas as pd

# Import base model and evaluation libraries
from sklearn.model_selection import train_test_split

# Import transformers, LLM libraries and support
import transformers
import torch

from huggingface_hub import login
import openai
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from langchain.llms import OpenAI

# Import vector database (semantic search) libraries
import tqdm; import warnings; warnings.filterwarnings("ignore", category=tqdm.TqdmExperimentalWarning)
import pinecone
from pinecone import ApiException

# Import visualization libraries
from matplotlib import pyplot as plt
%matplotlib inline

# Import utility libraries
import os
from tqdm import tqdm
import time

In [4]:
# Set basic np, pd, and plt output defaults (keeping this code 'clean')
# The -i flag runs defaults.py in this notebook's namespace rather than an empty one,
# so the script can use the names imported above
%run -i 'defaults.py'

In [5]:
# Pick the compute device: Apple Metal (mps) first, then CUDA, otherwise fall back to CPU
if torch.backends.mps.is_available():
    device_name = 'mps'
elif torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'

device = torch.device(device_name)
print(device)

mps


In [6]:
# Silence UserWarning messages attributed to the bitsandbytes module
warnings.filterwarnings('ignore', category=UserWarning, module='bitsandbytes')

## Data Load and Validation

In [7]:
# Instantiate and confirm master dataframe
news_00_df = pd.read_csv('../data/data_preprocessed_w_sw_2023-07-20_13-02-01408354.csv')

# Load pre-processed sentiment and join it onto the master records by 'text_id'
news_05_df = pd.read_csv('../data/news-05.csv')
news_05_df = pd.merge(news_00_df, news_05_df, on='text_id')

# Load pre-processed embeddings as numpy array
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can execute
# code stored in the file -- only load embeddings.npy from a trusted source.
embeddings = np.load('../data/embeddings.npy', allow_pickle=True)

# Embeddings are attached by position, so there must be exactly one per merged row.
# NOTE(review): this also assumes embeddings.npy is in the same row order as the merged frame -- confirm.
if len(embeddings) != len(news_05_df):
    raise ValueError(
        f'Embedding count ({len(embeddings)}) does not match merged row count ({len(news_05_df)})'
    )
news_05_df['embeddings'] = list(embeddings)

# info() prints its report itself and returns None, so wrapping it in print() adds a stray "None"
news_05_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36405 entries, 0 to 36404
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   text_id                   36405 non-null  int64  
 1   source_name               36405 non-null  object 
 2   author                    36196 non-null  object 
 3   title                     36405 non-null  object 
 4   url                       36405 non-null  object 
 5   publish_date              36405 non-null  object 
 6   article_text              36405 non-null  object 
 7   content                   36405 non-null  object 
 8   processed_text            36403 non-null  object 
 9   processed_text_split      36405 non-null  object 
 10  num_tokens                36405 non-null  int64  
 11  sentiment_vader           36405 non-null  object 
 12  sentiment_vader_compound  36405 non-null  float64
 13  sentiment_vader_cat       36405 non-null  object 
 14  sentim

## Semantic Similarity (experimental)

In [8]:
V_INDEX = 'news'

# Open the Pinecone session; the API key is read from the environment
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment='us-west1-gcp-free',
)

# Report whether the target index is among the listed indexes
indexes = pinecone.list_indexes()
index_found = V_INDEX in indexes
print(f"The index '{V_INDEX}' exists." if index_found
      else f"The index '{V_INDEX}' does not exist.")

The index 'news' exists.


In [9]:
# Open a handle to the V_INDEX index and display its current statistics
# (the return value of describe_index_stats() is the cell's output)
index = pinecone.Index(V_INDEX)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 36405}},
 'total_vector_count': 36405}

In [10]:
# Query function
def query_vector(index, df, idx, top_k):
    """Query a vector index for the nearest neighbours of one row's embedding.

    Parameters
    ----------
    index : object
        Anything exposing ``query(vector=..., top_k=..., include_values=...)``;
        in this notebook, the Pinecone index handle.
    df : pandas.DataFrame
        Frame with an 'embeddings' column whose cells support ``.tolist()``.
    idx : label
        Index label (used with ``df.loc``) of the row whose embedding is the query.
    top_k : int
        Number of matches to request.

    Returns
    -------
    The value returned by ``index.query``, unchanged.
    """
    # Use the frame passed by the caller (not a notebook global) so the function
    # works on any frame and does not depend on hidden kernel state
    vector = df.loc[idx, 'embeddings'].tolist()  # convert numpy array to list
    return index.query(vector=vector, top_k=top_k, include_values=True)

# Usage example
results = query_vector(index, news_05_df, 11, 3)

# Extract ids from results and convert to integers
ids = [int(match['id']) for match in results['matches']]

# Get corresponding records from 'news_05_df'
# NOTE(review): .loc treats the ids as index labels, so this assumes the vector ids
# equal the frame's index labels -- confirm against how the index was populated
records = news_05_df.loc[ids]

# Display 'title' for each record
# (loop variable is not named `id`, which would shadow the builtin for later cells)
for record_id, row in records.iterrows():
    print(f'ID: {record_id}, Title: {row["title"]}')

ID: 11, Title: Look: Stetson Bennett caught a massive fish in Montauk
ID: 34886, Title: North Carolina anglers go swordfishing, instead catch rare species: 'Prehistoric, almost'
ID: 12473, Title: "Prehistoric, almost': Friends set out swordfishing, catch rare, record-setting pomfret


## Summarization (experimental)

In [11]:
# Authenticate with the Hugging Face Hub using the token from the environment
hf_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
login(token=hf_token)

model_id = 'meta-llama/Llama-2-7b-chat-hf'

# The same token is passed to every from_pretrained call below
hub_auth = {'token': hf_token}

# Load the configuration, weights, and tokenizer for the pre-trained model
model_config = transformers.AutoConfig.from_pretrained(model_id, **hub_auth)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    **hub_auth,
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, **hub_auth)

# Create a text generation pipeline
# NOTE(review): the model object is built before torch_dtype is given here, and `device`
# from the setup section is not passed -- confirm the intended dtype/device placement
gen_pipe = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    max_length=1024,
)

# Wrap the pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline=gen_pipe, model_kwargs={'temperature': 0})

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/davidfriesen/.cache/huggingface/token
Login successful
'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
start_time = time.time()

# The instruction names triple backquotes as the delimiter, so the article text
# is wrapped in ``` to match what the prompt describes
template = """
Write a one-sentence summary of the following text delimited by triple backquotes.
```{text}```
ONE-SENTENCE-ONLY SUMMARY:
"""

# Single-variable prompt: '{text}' is filled with the (truncated) article text
prompt = PromptTemplate(template=template, input_variables=['text'])

llm_chain = LLMChain(prompt=prompt, llm=llm)

def get_first_n_chars(text, n):
    """Return at most the first `n` characters of `text`, cut back to a word boundary.

    Parameters
    ----------
    text : str
        Text to shorten. Non-string input (e.g. None or a NaN from a missing
        dataframe cell) is treated as empty text.
    n : int
        Maximum number of characters to keep.

    Returns
    -------
    str
        `text` unchanged when it has at most `n` characters; otherwise the first
        `n` characters with everything after the last space removed. '' for
        non-string input.
    """
    # Missing values arrive as None/NaN, which have no len(); return empty text instead of raising
    if not isinstance(text, str):
        return ''
    if len(text) <= n:
        return text
    # Drop the trailing fragment after the last space so no word is cut mid-way;
    # when the prefix contains no space it is returned whole
    return text[:n].rsplit(' ', 1)[0]

# Work on a copy of the first rows (labels 0 through 1) so the master frame is not modified
selected_rows = news_05_df.loc[0:1].copy()

# iterrows() is a generator with no length, so pass total= to give tqdm a real progress bar
summaries = []
for _, row in tqdm(selected_rows.iterrows(), total=len(selected_rows)):
    text = get_first_n_chars(row['processed_text'], 200)
    input_dict = {'text': text}
    summary = llm_chain.run(input_dict)
    summaries.append(summary)

selected_rows['summary'] = summaries

# Report wall-clock timing (in minutes) measured from start_time
stop_time = time.time()
elapsed_time = (stop_time - start_time) / 60.0
print(f'Start time: {time.ctime(start_time)}')
print(f'Stop time: {time.ctime(stop_time)}')
print(f'Elapsed time: {elapsed_time:.2f} minutes')

selected_rows.head(10)

2it [03:05, 92.63s/it]


Start time: Sat Jul 29 10:08:17 2023
Stop time: Sat Jul 29 10:11:22 2023
Elapsed time: 3.09 minutes


Unnamed: 0,text_id,source_name,author,title,url,publish_date,article_text,content,processed_text,processed_text_split,num_tokens,sentiment_vader,sentiment_vader_compound,sentiment_vader_cat,sentiment_bert_prob,sentiment_bert_cat,sentiment_bert,sentiment_roberta_prob,sentiment_roberta_cat,sentiment_roberta,embeddings,summary
0,2,USA Today,"USA TODAY, Emily DeLetter, USA TODAY","Tito's launches 'Tito's in a Big Can,' an empty cocktail keg listed at $200",https://www.usatoday.com/story/money/food/2023/06/21/titos-vodka-keg-price/70342132007/,2023-06-21T17:37:40Z,"Have you ever wanted to own your very own keg of Tito's Handmade Vodka? Now, that dream can beco...","Have you ever wanted to own your very own keg of Tito's Handmade Vodka? Now, that dream can beco...",have you ever wanted to own your very own keg of titos handmade vodka. now that dream can become...,"['have', 'you', 'ever', 'wanted', 'to', 'own', 'your', 'very', 'own', 'keg', 'of', 'titos', 'han...",210,"{'neg': 0.041, 'neu': 0.888, 'pos': 0.072, 'compound': 0.8658}",0.87,positive,0.01,negative,-0.98,1.0,positive,1.0,"[-0.07555211, 0.005056652, -0.047664534, 0.013539692, -0.010404922, 0.025438594, 0.053852815, 0....",Owning your own keg of Tito's Handmade Vodka can now become a reality with the brand's launch of...
1,3,USA Today,"USA TODAY, Joy Ashford, USA TODAY",Search for missing actor Julian Sands continues in 'limited capacity',https://www.usatoday.com/story/entertainment/celebrities/2023/06/20/julian-sands-missing-search-...,2023-06-20T17:36:09Z,"Over five months after Julian Sands went missing during a solo hike in California, local police ...","Over five months after Julian Sandswent missing during a solo hike in California, local police h...",over five months after julian sands went missing during a solo hike in california local police h...,"['over', 'five', 'months', 'after', 'julian', 'sands', 'went', 'missing', 'during', 'a', 'solo',...",349,"{'neg': 0.043, 'neu': 0.85, 'pos': 0.107, 'compound': 0.9769}",0.98,positive,0.98,positive,0.96,1.0,positive,0.99,"[0.04023131, -0.012046934, 0.07287487, 0.033149727, 0.072366804, -0.009328416, -0.048083205, -0....","The ongoing search for actor Julian Sands, who went missing during a solo hike in California ove..."


## Other (experimentation)

In [13]:
# Establish OpenAI connection
#openai_organization = os.getenv('OPENAI_ORG')
#openai_api_key = os.getenv('OPENAI_API_KEY')

# Experiment with a simple, related question:-)
# NOTE(review): this rebinds `llm`, the name used for the Hugging Face pipeline LLM
# in the summarization section -- confirm that is intended
llm = OpenAI(
    model_name='text-davinci-003',
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)
question = 'What is news sentiment analysis?'
print(llm(question))



News sentiment analysis is the process of using natural language processing and text analytics to identify sentiment within news articles. This can then be used to make predictions related to market sentiment or to provide insights into consumer sentiment. News sentiment analysis can help to determine how people feel about a certain topic or event, as well as how that sentiment may affect the stock market or consumer behavior.


## Data Partitioning (retained but not used here)

In [15]:
# Target proportions for the train / validation / test partitions
train_ratio = 0.7
val_ratio = 0.20
test_ratio = 0.10

# Stage 1: separate the training rows; the remainder is temporarily held in test_df
train_df, test_df = train_test_split(
    news_05_df,
    test_size=1-train_ratio,
    random_state=42,
)

# Stage 2: divide that remainder into validation and test, using the test share of the remainder
test_share_of_remainder = test_ratio/(test_ratio+val_ratio)
val_df, test_df = train_test_split(
    test_df,
    test_size=test_share_of_remainder,
    random_state=42,
)