# Daily Retreat: Using Sentiment Analysis to<br>Find, Personalize, and Share Positive News from Popular Online Sources
__Aaron Carr, Azucena Faus, and Dave Friesen - ADS-599-01-SU23__

In [1]:
__author__ = 'Aaron Carr, Azucena Faus, Dave Friesen'
__email__ = 'acarr@sandiego.edu, afaus@sandiego.edu, dfriesen@sandiego.edu'
__version__ = '1.0'
__date__ = 'July/August 2023'

In [2]:
# Establish Google Drive connection
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

# Establish working directory
%cd '/content/drive/My Drive/ADS-599-01-SU23/src'

Mounted at /content/drive
/content/drive/My Drive/ADS-599-01-SU23/src


## Setup

In [3]:
# Install known missing libraries in Colab
!pip install emoji
!pip install transformers
!pip install sentence-transformers
!pip install pinecone-client

# Import basic and data access libraries
import numpy as np
import pandas as pd
from profiler import profile, profile_cat

# Import lexicon and pre-processing functions
import nltk
from text_processing import remove_stop, join_tokens, tokenize, convert_emojis, contains_emoji, remove_punct, prepare

# Import lexicon and transformer-based sentiment libraries
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon', quiet=True)

from transformers import pipeline as tpipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax

# Import embedding libraries
from sentence_transformers import SentenceTransformer

# Import vector database (senmantic search) libraries
import pinecone
from pinecone import ApiException

# Import visualization libraries
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Import utility libraries
from tqdm import tqdm; tqdm.pandas()
import time



In [4]:
# Set basic np, pd, and plt output defaults (keeping this code 'clean')
%run -i 'defaults.py'

## Data Load and Validation

In [5]:
# Instantiate and confirm master dataframe
news_00_df = pd.read_csv('../data/data_preprocessed_w_sw_2023-07-20_13-02-01408354.csv')
print(news_00_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36405 entries, 0 to 36404
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text_id               36405 non-null  int64 
 1   source_name           36405 non-null  object
 2   author                36196 non-null  object
 3   title                 36405 non-null  object
 4   url                   36405 non-null  object
 5   publish_date          36405 non-null  object
 6   article_text          36405 non-null  object
 7   content               36405 non-null  object
 8   processed_text        36403 non-null  object
 9   processed_text_split  36405 non-null  object
 10  num_tokens            36405 non-null  int64 
dtypes: int64(2), object(9)
memory usage: 3.1+ MB
None


## Data Cleaning, Normalization, and Tokenization

In [6]:
# Set pipeline
pipeline = [str.lower, remove_punct, convert_emojis, tokenize, remove_stop]

# NOTE: Excluding the following in favor of Aaron's pre-processed data:
# Clean and tokenize master dataframe
#news_00_df['article_text'] = news_00_df['article_text'].astype(str)
#news_00_df['article_tokens'] = news_00_df['article_text'].progress_apply(lambda x: prepare(x, pipeline))
#news_00_df['article_text_tokenized'] = news_00_df['article_tokens'].progress_apply(lambda x: join_tokens(x))

## Data Profiling

In [7]:
# Descriptive stats function
def descriptive_stats(tokens, num_tokens=5, verbose=False):
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))  # set() creates unordered set of unique elements
    num_characters = sum(len(token) for token in tokens)  # Finds characters sans spaces
    lexical_diversity = num_unique_tokens / num_tokens

    if verbose:
        print(f'There are {num_tokens} tokens in the data.')
        print(f'There are {num_unique_tokens} unique tokens in the data.')
        print(f'There are {num_characters} characters in the data.')
        print(f'The lexical diversity is {lexical_diversity:.3f} in the data.')

    return([num_tokens, num_unique_tokens, lexical_diversity, num_characters])

In [8]:
# Descriptive stats across all sources
descriptive_stats([token for sublist in news_00_df['processed_text_split'] for token in sublist])

[244312828, 992, 4.060368045839983e-06, 244312828]

In [9]:
# Standard dataframe profile for confirmation
profile(news_00_df)

100%|██████████| 1/1 [00:00<00:00, 1002.70it/s]
100%|██████████| 11/11 [00:00<00:00, 18447.56it/s]


Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
text_id,int64,36405.0,36405.0,,,22393.0,16159.1,2.0,71592.0,,,,9081__6911__23786__1
source_name,object,36405.0,17.0,,,,,,,,,,New York Post__New Y
author,object,36196.0,7290.0,209.0,0.6,,,,,,,,"Jack Morphet, Amanda"
title,object,36405.0,34208.0,,,,,,,,,,Fire breaks out bene
url,object,36405.0,36405.0,,,,,,,,,,https://nypost.com/2
publish_date,object,36405.0,32829.0,,,,,,,,,,2023-06-29T17:00:13Z
article_text,object,36405.0,34247.0,,,,,,,,,,\n\n\n\n\nMore On:\n\n\t\t\t\t\t
content,object,36405.0,34544.0,,,,,,,,,,An underground trans
processed_text,object,36403.0,34365.0,2.0,,,,,,,,,more on fires firefi
processed_text_split,object,36405.0,34366.0,,,,,,,,,,"['more', 'on', 'fire"


In [10]:
# Descriptive stats aggregating function
def aggregate_and_describe(group):
    aggregate_tokens = [token for sublist in group['processed_text_split'].tolist() for token in sublist]
    return descriptive_stats(aggregate_tokens)

# Aggregate descriptive stats by source; convert to dataframe; sort and output
grouped_stats = news_00_df.groupby('source_name').apply(aggregate_and_describe)
grouped_stats_df = pd.DataFrame(grouped_stats.tolist(), index=grouped_stats.index,
                                columns=['num_tokens', 'num_unique_tokens', 'lexical_diversity', 'num_characters'])
grouped_stats_df = grouped_stats_df.sort_index(ascending=False)
print(grouped_stats_df)

                     num_tokens  num_unique_tokens  lexical_diversity  num_characters
source_name                                                                          
Wired                  1829526           35               0.00             1829526   
Vox                    4223299           53               0.00             4223299   
USA Today             30727737          582               0.00            30727737   
The Washington Post     316094           31               0.00              316094   
Reuters                 531936           31               0.00              531936   
PEOPLE                 1435411          100               0.00             1435411   
New York Post         40085294          411               0.00            40085294   
NBC News               7242018           38               0.00             7242018   
MSNBC                  1248633           34               0.00             1248633   
Fox News               6850380           40           

## Data Preparation: Sentiment Analysis

### Lexicon Approach

In [11]:
news_01_df = news_00_df.copy()

# Replace NaN values with empty string and otherwise confirm string types
news_01_df['processed_text'] = news_01_df['processed_text'].fillna("")
news_01_df['processed_text'] = news_01_df['processed_text'].astype(str)

# Create class instance analyzer for VADER sentiment
sid = SentimentIntensityAnalyzer()

# Classification category function
def classify_sentiment(score):
    if score < -0.05:
        return 'negative'
    elif score > 0.05:
        return 'positive'
    else:
        return 'neutral'

# Apply VADER sentiment
news_01_df['sentiment_vader'] =\
    news_01_df['processed_text'].progress_apply(lambda text: sid.polarity_scores(text))
news_01_df['sentiment_vader_compound'] =\
    news_01_df['sentiment_vader'].progress_apply(lambda d: d['compound'])
news_01_df['sentiment_vader_cat'] =\
    news_01_df['sentiment_vader_compound'].progress_apply(classify_sentiment)

# Persist dataframe with sentiments (so don't need to re-run)
print(f'Writing={len(news_01_df)}')
subset_df = news_01_df[['text_id',
                        'sentiment_vader', 'sentiment_vader_compound', 'sentiment_vader_cat']]
subset_df.to_csv('../data/news-01.csv', index=False)
print(f'Written={len(subset_df)}')

# Summarize sentiment by 'source_name' for analysis by source
grouped_sentiment = news_01_df.groupby('source_name')['sentiment_vader_cat'].value_counts()
print(grouped_sentiment.to_string(max_rows=None))

100%|██████████| 36405/36405 [05:17<00:00, 114.62it/s]
100%|██████████| 36405/36405 [00:00<00:00, 783787.89it/s]
100%|██████████| 36405/36405 [00:00<00:00, 1085111.52it/s]


Writing=36405
Written=36405
source_name          sentiment_vader_cat
ABC News             negative               2541
                     positive               2240
                     neutral                  77
Breitbart News       positive                513
                     negative                511
                     neutral                  14
Business Insider     positive               1036
                     negative                518
                     neutral                   9
Buzzfeed             positive                700
                     negative                 50
CNBC                 positive               1043
                     negative                289
                     neutral                   6
CNN                  positive                904
                     negative                768
                     neutral                  14
Forbes               positive               6758
                     negative               1007


### Transformer Approach

In [12]:
news_02_df = news_01_df.copy()

# Load pre-trained model
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Attempt GPU usage (works on Colab; I don't have a dedicated local GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Get sentiment (probability and label) function
def analyze_sentiment(text):
    # Encode text; truncate if necessary (note, not chunking full article, just using first 512)
    inputs = tokenizer.encode_plus(text, truncation=True, max_length=512,
                                   padding='max_length', return_tensors='pt')

    # Move tensors to appropriate device (GPU if available)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    outputs = model(input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'])  # could also unpack this wth simple model(**inputs)
    logits = outputs.logits
    predicted_class = logits.argmax().item()

    # Apply softmax to logits to get class probabilities
    probabilities = torch.softmax(logits, dim=1).detach().cpu().numpy()
    sentiment_score = probabilities[0][1]

    # Map predicted class to sentiment label
    sentiment_label = "positive" if predicted_class == 1 else "negative"

    return pd.Series([sentiment_score, sentiment_label])

# Apply sentiment analysis to 'processed_text' column
news_02_df[['sentiment_bert_prob', 'sentiment_bert_cat']] =\
    news_02_df['processed_text'].progress_apply(analyze_sentiment)

# Standardize BERT probability back to a sentiment score, for comparison
news_02_df['sentiment_bert'] = 2 * (news_02_df['sentiment_bert_prob'] - 0.5)

# Persist dataframe with sentiments (so don't need to re-run)
print(f'Writing={len(news_02_df)}')
subset_df = news_02_df[['text_id',
                        'sentiment_vader', 'sentiment_vader_compound', 'sentiment_vader_cat',
                        'sentiment_bert_prob', 'sentiment_bert_cat', 'sentiment_bert']]
subset_df.to_csv('../data/news-02.csv', index=False)
print(f'Written={len(subset_df)}')

# Summarize sentiment by 'source_name' for analysis by source
grouped_sentiment = news_02_df.groupby('source_name')['sentiment_bert_cat'].value_counts()
print(grouped_sentiment.to_string(max_rows=None))

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 36405/36405 [06:55<00:00, 87.54it/s]


Writing=36405
Written=36405
source_name          sentiment_bert_cat
ABC News             negative              4126
                     positive               732
Breitbart News       negative               905
                     positive               133
Business Insider     negative              1336
                     positive               227
Buzzfeed             positive               396
                     negative               354
CNBC                 negative              1190
                     positive               148
CNN                  negative              1338
                     positive               348
Forbes               negative              4882
                     positive              2904
Fox News             negative              1099
                     positive               280
MSNBC                negative               207
                     positive                18
NBC News             negative               954
                    

In [13]:
news_02_df[:10][['title', 'sentiment_vader_cat', 'sentiment_bert_cat']]

Unnamed: 0,title,sentiment_vader_cat,sentiment_bert_cat
0,"Tito's launches 'Tito's in a Big Can,' an empty cocktail keg listed at $200",positive,negative
1,Search for missing actor Julian Sands continues in 'limited capacity',positive,positive
2,Four star running back picks Michigan State over UNC,positive,negative
3,Alabama center Charles Bediako signs one-year deal with San Antonio Spurs,positive,positive
4,Ralph Sampson breaks down iconic Boston Celtics-Houston Rockets fight in 1986 playoffs,positive,positive
5,Cowherd: Lakers are delusional for wanting to pay Austin Reaves big bucks,positive,negative
6,Recruiting Roundup: Latest Alabama Football recruiting news,positive,negative
7,Eagles’ 2023 training camp preview: Jalen Hurts joins an elite tier of NFL quarterbacks,positive,negative
8,Lincoln Riley’s late-night emoji tweets equal four USC commits on Tuesday,positive,negative
9,Washington passes California as most expensive state for gas: AAA,positive,negative


## Data Preparation: Zero-Shot Classification

In [14]:
# NOTE: Excluding this; it was a "shot" at attempting to "blindly" categorize
#   articles but I quickly found it to be too computationally-intensive, even
#   using a Colab-based GPU; left code here for now

news_03_df = news_02_df.copy()

# Load pre-trained classifier
#classifier = tpipeline('zero-shot-classification', model='facebook/bart-large-mnli')

# Set generic (global) categories
categories = ['news', 'business', 'technology', 'entertainment', 'sports']

# Classification function - finding label with highest score
def classify_text(text):
    # Truncate text to first 512 characters for performance
    text = text[:512]

    result = classifier(text, categories)
    return result['labels'][np.argmax(result['scores'])]

# Classify
#news_03_df['category'] = news_03_df['article_text'].progress_apply(classify_text)

# Summarize categorization
#grouped_cat = news_03_df.groupby('category')[['category', 'title']].value_counts()
#grouped_cat

## Data Preparation: Embeddings

In [15]:
# NOTE: This section is creating sentence embeddings - as part of feature engineering -
#   to set up the possibility for semantic search in modeling
news_04_df = news_03_df.copy()

# Initialize transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
E_DIMENSIONS = 384  # default for all-MiniLM-L6-v2

# Check if GPU is available; if so, move the model to GPU
if torch.cuda.is_available():
    model = model.to('cuda')

# Create embeddings for each article (use GPU "if" available)
news_04_df['embeddings'] = news_04_df['processed_text'].progress_apply(
    lambda x: model.encode(x, convert_to_tensor=True, device=str(device)))

# Persist dataframe so don't need to re-run every time
print(f'Writing={len(news_04_df)}')
subset_df = news_04_df[['text_id',
                        'sentiment_vader', 'sentiment_vader_compound', 'sentiment_vader_cat',
                        'sentiment_bert_prob', 'sentiment_bert_cat', 'sentiment_bert',
                        'embeddings']]
subset_df.to_csv('../data/news-04.csv', index=False)
print(f'Written={len(subset_df)}')
# . . . and save embeddings as numpy arrays
embeddings = np.vstack(news_04_df['embeddings'].apply(lambda x: x.cpu().numpy()).values)
np.save('../data/embeddings.npy', embeddings)
print(f'Embeddings written={len(embeddings)}')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

100%|██████████| 36405/36405 [07:25<00:00, 81.69it/s]


Writing=36405
Written=36405
Embeddings written=36405


## Data Preparation: Embedding Persistence for Semantic Similarity+

In [16]:
# NOTE: This section is using popular open-source Pinecone to persist embeddings in a
#   very efficient way for potential semantic search in modeling

V_INDEX = 'news'

# Establish connection to Pinecone
pinecone.init(api_key='[obfuscated]', environment='[obfuscated]')

# Completely delete exiting index "if" needed
#try:
#    pinecone.delete_index('quickstart')
#    print("Index 'quickstart' has been deleted.")
#except Exception as e:
#    print('An error occurred while trying to delete the index:')

# Create index "if" needed
#pinecone.create_index(V_INDEX, dimension=E_DIMENSIONS, metric='cosine')

# Confirm index
pinecone.list_indexes()

['news']

In [17]:
# Set index and refresh
index = pinecone.Index(V_INDEX)

# Refresh index
index.delete(deleteAll=True)

# Prepare a list of tuples for upsert
data_to_insert = [(str(i), embeddings[i].tolist()) for i in range(news_04_df.shape[0])]

# Chunking function
CHUNK_SIZE = 500
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Insert data into index in chunks
i = 0
for chunk in chunks(data_to_insert, CHUNK_SIZE):
    try:
        print(f'Upserting chunk {i+1}... ', end='')
        index.upsert(chunk)
        print(f'Successfully upserted chunk {i+1}.')
    except ApiException as e:
        print(f'Failed to upsert chunk {i+1}. Exception: {str(e)}')
        time.sleep(1)  # delay to avoid hitting API rate limit
        continue
    i += 1

# Confirm
index.describe_index_stats()

Upserting chunk 1... Successfully upserted chunk 1.
Upserting chunk 2... Successfully upserted chunk 2.
Upserting chunk 3... Successfully upserted chunk 3.
Upserting chunk 4... Successfully upserted chunk 4.
Upserting chunk 5... Successfully upserted chunk 5.
Upserting chunk 6... Successfully upserted chunk 6.
Upserting chunk 7... Successfully upserted chunk 7.
Upserting chunk 8... Successfully upserted chunk 8.
Upserting chunk 9... Successfully upserted chunk 9.
Upserting chunk 10... Successfully upserted chunk 10.
Upserting chunk 11... Successfully upserted chunk 11.
Upserting chunk 12... Successfully upserted chunk 12.
Upserting chunk 13... Successfully upserted chunk 13.
Upserting chunk 14... Successfully upserted chunk 14.
Upserting chunk 15... Successfully upserted chunk 15.
Upserting chunk 16... Successfully upserted chunk 16.
Upserting chunk 17... Successfully upserted chunk 17.
Upserting chunk 18... Successfully upserted chunk 18.
Upserting chunk 19... Successfully upserted ch

{'dimension': 384,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 36405}},
 'total_vector_count': 36405}