# Daily Retreat: Using Sentiment Analysis to<br>Find, Personalize, and Share Positive News from Popular Online Sources
__Aaron Carr, Azucena Faus, and Dave Friesen - ADS-599-01-SU23__

In [1]:
__author__ = 'Aaron Carr, Azucena Faus, Dave Friesen'
__email__ = 'acarr@sandiego.edu, afaus@sandiego.edu, dfriesen@sandiego.edu'
__version__ = '1.0'
__date__ = 'July/August 2023'

## Setup

In [2]:
# Are we running in Google Colab, or not?
try:
    from google.colab import drive
    COLAB = True
except:
    COLAB = False

if COLAB:
    # Establish Google Drive connection
    drive.mount('/content/drive', force_remount = True)

    # Establish working directory
    %cd '/content/drive/My Drive/599_team_project/deliverables'

Mounted at /content/drive
/content/drive/My Drive/599_team_project/deliverables


In [3]:
if COLAB:
    # Install known missing libraries in Colab
    !pip install transformers
    !pip install openai
    !pip install langchain
    !pip install pinecone-client

# Import basic and data access libraries
import numpy as np
import pandas as pd

# Import base model and evaluation libraries
from sklearn.model_selection import train_test_split

# Import transformeres, LLM libraries and support
import transformers
import torch

from huggingface_hub import login
import openai
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from langchain.llms import OpenAI

# Import vector database (senmantic search) libraries
import tqdm; import warnings; warnings.filterwarnings("ignore", category=tqdm.TqdmExperimentalWarning)
import pinecone
from pinecone import ApiException

# Import visualization libraries
from matplotlib import pyplot as plt
%matplotlib inline

# Import utility libraries
import os
import getpass
from tqdm import tqdm
import time
import textwrap



In [4]:
# Set basic np, pd, and plt output defaults (keeping this code 'clean')
%run -i 'defaults.py'

In [5]:
# Establish GPU support "if" exists
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)
print(device)

cuda


In [6]:
warnings.filterwarnings(action='ignore', category=UserWarning, module='bitsandbytes')

## Data Load and Validation

In [7]:
# Instantiate and confirm master dataframe
news_00_df = pd.read_csv('../data/data_preprocessed_w_sw_2023-07-20_13-02-01408354.csv')

# Load pre-processed sentiment
news_05_df = pd.read_csv('../data/news-05.csv')
news_05_df = pd.merge(news_00_df, news_05_df, on='text_id')

# Load pre-processed embeddings as numpy array
embeddings = np.load('../data/embeddings.npy', allow_pickle=True)
news_05_df['embeddings'] = list(embeddings)

print(news_05_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36405 entries, 0 to 36404
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   text_id                    36405 non-null  int64  
 1   source_name                36405 non-null  object 
 2   author                     36196 non-null  object 
 3   title                      36405 non-null  object 
 4   url                        36405 non-null  object 
 5   publish_date               36405 non-null  object 
 6   article_text               36405 non-null  object 
 7   content                    36405 non-null  object 
 8   processed_text             36403 non-null  object 
 9   processed_text_split       36405 non-null  object 
 10  num_tokens                 36405 non-null  int64  
 11  sentiment_vader            36405 non-null  object 
 12  sentiment_vader_compound   36405 non-null  float64
 13  sentiment_vader_cat        36405 non-null  obj

## Semantic Similarity (experimental)

In [8]:
if COLAB:
    PINECONE_API_KEY = getpass.getpass('Enter Pinecone key')
else:
    PINECONE_API_KEY = os.environ['PINECONE_API_KEY']

V_INDEX = 'news'

# Establish connection to Pinecone
pinecone.init(api_key=PINECONE_API_KEY, environment='us-west1-gcp-free')

indexes = pinecone.list_indexes()
if V_INDEX in indexes:
    print(f"The index '{V_INDEX}' exists.")
else:
    print(f"The index '{V_INDEX}' does not exist.")

Enter Pinecone key··········
The index 'news' exists.


In [9]:
# Set index and refresh
index = pinecone.Index(V_INDEX)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 36405}},
 'total_vector_count': 36405}

In [10]:
# Query function
def query_vector(index, df, idx, top_k):
    vector = news_05_df.loc[idx, 'embeddings'].tolist()  # convert numpy array to list
    results = index.query(vector=vector, top_k=top_k, include_values=True)
    return results

# Usage example
results = query_vector(index, news_05_df, 11, 3)

# Extract ids from results and convert to integers
ids = [int(match['id']) for match in results['matches']]

# Get corresponding records from 'news_00_df'
records = news_05_df.loc[ids]

# Display 'article_title' for each record
for id, row in records.iterrows():
    print(f'ID: {id}, Title: {row["title"]}')

ID: 11, Title: Look: Stetson Bennett caught a massive fish in Montauk
ID: 34886, Title: North Carolina anglers go swordfishing, instead catch rare species: 'Prehistoric, almost'
ID: 12473, Title: "Prehistoric, almost': Friends set out swordfishing, catch rare, record-setting pomfret


## Summarization (experimental)

In [11]:
if COLAB:
    HUGGINGFACEHUB_API_TOKEN = getpass.getpass('Enter Hugging Face token')
else:
    HUGGINGFACEHUB_API_TOKEN = os.environ['HUGGINGFACEHUB_API_TOKEN']

# Establish pre-trained model and pipeline
hf_token = HUGGINGFACEHUB_API_TOKEN
login(token=hf_token)

model_id = 'meta-llama/Llama-2-7b-chat-hf'

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_token
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    token=hf_token
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=hf_token
)

# Create a text generation pipeline
gen_pipe = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    max_length=1024
)

llm = HuggingFacePipeline(pipeline=gen_pipe, model_kwargs={'temperature': 0})

Enter Hugging Face token··········
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [12]:
start_time = time.time()

template = """
Write a one-sentence summary of the following text delimited by triple backquotes.
'''{text}'''
ONE-SENTENCE-ONLY SUMMARY:
"""

prompt = PromptTemplate(template=template, input_variables=['text'])

llm_chain = LLMChain(prompt=prompt, llm=llm)

def get_first_n_chars(text, n):
    if len(text) <= n:
        return text
    else:
        short_text = text[:n]
        return short_text.rsplit(' ', 1)[0]

selected_rows = news_05_df.loc[0:19].copy()

summaries = []
for _, row in tqdm(selected_rows.iterrows()):
    text = get_first_n_chars(row['processed_text'], 200)
    input_dict = {'text': text}
    summary = llm_chain.run(input_dict)
    summaries.append(summary)

selected_rows['summary'] = summaries

stop_time = time.time()
elapsed_time = (stop_time - start_time) / 60.0
print(f'Start time: {time.ctime(start_time)}')
print(f'Stop time: {time.ctime(stop_time)}')
print(f"Elapsed time: {format(elapsed_time, '.2f')} minutes")

20it [04:58, 14.91s/it]

Start time: Sun Jul 30 05:04:03 2023
Stop time: Sun Jul 30 05:09:02 2023
Elapsed time: 4.97 minutes





In [14]:
# Show sample summarization
lwidth = 80

wrapper = textwrap.TextWrapper(width=lwidth)

for idx, row in selected_rows.iterrows():
    title = wrapper.fill(text=row['title'])
    summary = wrapper.fill(text=row['summary'])
    print(f"\nTitle: {title}\nSummary: {summary}\n{'-'*lwidth}")


Title: Tito's launches 'Tito's in a Big Can,' an empty cocktail keg listed at $200
Summary: Owning your own keg of Tito's Handmade Vodka can now become a reality with the
brand's launch of a 1-ounce stainless steel mini cocktail keg.
--------------------------------------------------------------------------------

Title: Search for missing actor Julian Sands continues in 'limited capacity'
Summary: The ongoing search for actor Julian Sands, who went missing during a solo hike
in California over five months ago, will continue in a limited capacity, with
the San Bernardino authorities involved in the investigation.
--------------------------------------------------------------------------------

Title: Four star running back picks Michigan State over UNC
Summary: Four-star running back picks Michigan State over UNC.
--------------------------------------------------------------------------------

Title: Alabama center Charles Bediako signs one-year deal with San Antonio Spurs
Summary: A

## Other (experimentation)

In [15]:
if COLAB:
    OPENAI_API_KEY = getpass.getpass('Enter OpenAI key')
else:
    OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# Establish OpenAI connection
#openai_organization = os.getenv('OPENAI_ORG')
#openai_api_key = os.getenv('OPENAI_API_KEY')

# Experiment with a simple, related question:-)
llm = OpenAI(openai_api_key = OPENAI_API_KEY,
             model_name='text-davinci-003')

answer = wrapper.fill(text=llm('What is news sentiment analysis?').lstrip())
print(f"\nWhat is news sentiment analysis?\n\n{answer}\n{'-'*lwidth}")

Enter OpenAI key··········

What is news sentiment analysis?

News sentiment analysis is a type of natural language processing which uses
techniques from text mining, natural language understanding, and computational
linguistics to detect and analyze the sentiment of news articles and other types
of text. This type of analysis enables businesses to gain insight into consumer
opinion and sentiment surrounding a particular topic or trend. By analyzing the
sentiment of news articles, marketers can better understand how their brand is
being perceived by the public, and make decisions about how to adjust their
strategies accordingly.
--------------------------------------------------------------------------------


## Data Partitioning (retained but not used here)

In [16]:
# Set splits
train_ratio = 0.7; val_ratio = 0.20; test_ratio = 0.10

# Split and profile
train_df, test_df = train_test_split(news_05_df, test_size=1-train_ratio,
                                     random_state=42)
val_df, test_df = train_test_split(test_df, test_size=test_ratio/(test_ratio+val_ratio),
                                   random_state=42)