In [34]:
# Import libraries

import json
import os

import nltk
import numpy as np
import pandas as pd
import requests
import torch
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline

In [37]:
# Collect and parse data from the NYT Top Stories API.
# Produces `article_texts`: a list of "title: abstract" strings, one per story
# that has both fields. The list is empty when the request or parsing fails.

load_dotenv()

api_key = os.getenv("NYT_API_KEY")
section = "world"

base_url = "https://api.nytimes.com/svc/topstories/v2"
endpoint = f"/{section}.json"
api_url = base_url + endpoint

params = {"api-key": api_key}

# Defined before the request so later cells always find a list, even when the
# request fails and the except branches below only print a message.
article_texts = []

try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()

    data = response.json()

    if isinstance(data, dict) and data.get('status') == 'OK' and data.get('results'):
        articles = data['results']
        for article in articles:
            title = article.get('title')
            abstract = article.get('abstract')
            # Skip stories missing either field
            if title and abstract:
                article_texts.append(f"{title}: {abstract}")

    else:
        print("Failed to retrieve top stories data.")
        # Only dict payloads can carry a 'fault' entry; guard against None/list bodies
        fault = data.get('fault') if isinstance(data, dict) else None
        if fault:
            print(f"Error Message: {fault['faultstring']}")

# The JSON handler comes first: the decode error raised by recent `requests`
# releases is also a RequestException, so it would otherwise be reported as a
# fetch error.
except json.JSONDecodeError:
    print("Error decoding the JSON response.")
except requests.exceptions.RequestException as e:
    print(f"Error fetching the API: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [38]:
# Call model and predict sentiment
# Load the tokenizer and the sequence-classification model for the checkpoint
# named below. Both objects are read as globals by `predict_sentiment`.

model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_sentiment(texts, batch_size=32):
    """Classify each text into one of five sentiment labels.

    Uses the module-level `tokenizer` and `model`.

    Args:
        texts: A single string or a sequence of strings.
        batch_size: Number of texts sent through the model per forward pass.
            Must be a positive integer.

    Returns:
        A list of label strings ("Very Negative" ... "Very Positive"), one per
        input text and in input order. Empty input returns an empty list.
    """
    # A bare string is treated as one document, not as a sequence of characters
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # The tokenizer cannot encode an empty batch, so short-circuit here
        return []

    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}
    labels = []
    # Chunked inference bounds memory use when many articles are passed at once
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Keep inputs on the same device as the model (it may have been moved, e.g. by training)
        inputs = inputs.to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # argmax over logits equals argmax over softmax probabilities
        labels.extend(sentiment_map[idx] for idx in logits.argmax(dim=-1).tolist())
    return labels

# Score all articles, then print only those classified as good news.
# The guard skips the model call when nothing was fetched, since an empty batch
# cannot be tokenized.
for text, sentiment in zip(article_texts, predict_sentiment(article_texts) if article_texts else []):
    if sentiment in ("Positive","Very Positive"):
        print(f"Text: {text}\nSentiment: {sentiment}\n")

Text: 100 Days of Solitude: Trump and the Retreat of America: President Trump’s approach to foreign policy in his second term has been transactional, unpredictable and exploitative. Allies and enemies alike are beginning to adapt.
Sentiment: Very Positive

Text: A Contender for the Papacy Known for Promoting Dialogue With Other Faiths: Cardinal Jean-Marc Aveline comes from the diverse port city of Marseille in France.
Sentiment: Very Positive

Text: Brazilian Nun Who Was World’s Oldest Person Dies at 116: Sister Inah Canabarro Lucas received a blessing from Pope Francis when she turned 110. She would have turned 117 in a few weeks.
Sentiment: Very Positive

Text: Marco Rubio, Secretary of Everything: The former senator from Florida is now the head of four government bodies. He has outdone Henry Kissinger and even Xi Jinping, China’s leader, who has only three main titles.
Sentiment: Very Positive

Text: Friday Briefing: A White House Shake-Up: Plus, the actors up for a Tony Award.
Sent

In progress below: figuring out the code for sending the automated email messages.

In [None]:
import smtplib
from email.mime.text import MIMEText

# Send a plain-text test email through the Mailtrap sandbox SMTP server.

SMTP_SERVER = "sandbox.smtp.mailtrap.io"
# NOTE(review): 465 is conventionally the implicit-TLS port, yet the code below
# upgrades the connection with STARTTLS. Presumably the Mailtrap sandbox accepts
# STARTTLS on this port — confirm, otherwise use 587 or smtplib.SMTP_SSL.
SMTP_PORT = 465

# NOTE(review): "USERNAME" may already be set by the operating system or shell;
# confirm the value read here is the SMTP username and not the OS account name.
USERNAME = os.getenv("USERNAME")
PASSWORD = os.getenv("PASSWORD")

# The same address is used as sender and recipient for this test message
sender_email = os.getenv("EMAIL")
receiver_email = os.getenv("EMAIL")
subject = "The Good News Project"
body = "Hello, this is a test email!"

# Fail early with a clear message instead of an opaque error after connecting
missing = [
    name
    for name, value in (("USERNAME", USERNAME), ("PASSWORD", PASSWORD), ("EMAIL", sender_email))
    if not value
]
if missing:
    raise RuntimeError(f"Missing environment variable(s): {', '.join(missing)}")

message = MIMEText(body, "plain")
message["Subject"] = subject
message["From"] = sender_email
message["To"] = receiver_email

# Open the connection, upgrade it to TLS, authenticate and send the email.
# The timeout prevents the cell from blocking indefinitely on a dead connection.
with smtplib.SMTP(SMTP_SERVER, SMTP_PORT, timeout=30) as server:
    server.starttls()
    server.login(USERNAME, PASSWORD)
    server.sendmail(sender_email, receiver_email, message.as_string())

print("Email sent successfully!")

In progress below: prepping data to try transfer learning on the pre-trained model used above.

In [107]:
# Load the BBC news CSV into `data`.
# Note: this rebinds `data`, which the NYT cell earlier used for the parsed API response.
data = pd.read_csv("data/bbc_news.csv")
# Data from: https://www.kaggle.com/datasets/gpreda/bbc-news

In [108]:
# Choose subset of data from end of 2024 (rows 41500-41999 by position).
# `.copy()` makes the subset an independent frame, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning.
# Note: the cell rebinds `data`, so re-running it without reloading the CSV
# slices the already-sliced frame.
data = data.iloc[41500:42000].copy()

In [109]:
# Build the "News" text as "<title>: <description>" for every row
data["News"] = data["title"].add(": ").add(data["description"])

In [110]:
# Label placeholder: every row starts at 0 until a label is assigned by hand
# in a later cell.
data["Label"] = 0

In [111]:
# Keep only the text and label columns. `.copy()` detaches the result from the
# original frame so the manual `.at[...]` label writes below modify an
# independent frame without a SettingWithCopyWarning.
data = data[["News","Label"]].copy()

In [112]:
# Preview the first rows of the News/Label frame
data.head()

Unnamed: 0,News,Label
41500,The Papers: US 'lifts ban' on strikes in Russia and royal 'raid': Monday's papers feature Joe Biden lifting the ban on Ukraine using US arms to strike Russia and a burglary on a royal estate.,0
41501,How to see Leonid meteor shower as it peaks on Sunday night: The Leonid meteor shower will peak late on Sunday night into early Monday but will the weather allow for good viewing?,0
41502,"'Sixth great extinction is happening', conservationist Jane Goodall warns: Conservationist Jane Goodall on the urgent need to turn the tide on climate change and nature loss.",0
41503,"Final phase for mass rape trial that has horrified France: Fifty-one men are on trial in a case that focuses on a formerly married couple, Dominique and Gisèle Pelicot.",0
41504,Russia's soldiers bringing wartime violence back home: Many of the attackers have previous criminal convictions and were released from prison purely to fight.,0


In [None]:
# Integer codes used for the three sentiment classes
mapping = dict(positive=2, negative=0, neutral=1)

In [None]:
# Hand-assigned labels, keyed by row index
manual_labels = {
    41500: 1,
    41501: 2,
    41502: 0,
    41503: 0,
    41504: 0,
    41505: 1,
    41506: 1,
    41507: 1,
    41508: 0,
    41509: 1,
    41510: 1,
    41511: 0,
    41512: 2,
    41513: 0,
    41514: 2,
}
for row_id, label in manual_labels.items():
    data.at[row_id, 'Label'] = label

In [113]:
# Show the whole frame with no row, column, or cell-width truncation
# (needed to read the full article text while labelling).
full_view = ('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None)
with pd.option_context(*full_view):
    display(data)

Unnamed: 0,News,Label
41500,The Papers: US 'lifts ban' on strikes in Russia and royal 'raid': Monday's papers feature Joe Biden lifting the ban on Ukraine using US arms to strike Russia and a burglary on a royal estate.,0
41501,How to see Leonid meteor shower as it peaks on Sunday night: The Leonid meteor shower will peak late on Sunday night into early Monday but will the weather allow for good viewing?,0
41502,"'Sixth great extinction is happening', conservationist Jane Goodall warns: Conservationist Jane Goodall on the urgent need to turn the tide on climate change and nature loss.",0
41503,"Final phase for mass rape trial that has horrified France: Fifty-one men are on trial in a case that focuses on a formerly married couple, Dominique and Gisèle Pelicot.",0
41504,Russia's soldiers bringing wartime violence back home: Many of the attackers have previous criminal convictions and were released from prison purely to fight.,0
41505,Tesco row shows Sundays are still sacred on Hebridean islands: The first Sunday opening of a Tesco has sparked a row and put a part of island culture under the spotlight.,0
41506,"Charli XCX: My parents drove me to raves aged 15: The pop star inspired a cultural phenomenon this summer with many adopting the ""brat"" way of life.",0
41507,'London-style' bus services promised for England with £1bn boost: The Department for Transport says funding will be allocated based on levels of deprivation and population.,0
41508,"Police officers 'punch bags' in 'epidemic' of violent attacks: There has been a surge in violence against the police, a BBC investigation has found.",0
41509,Brazil first lady uses expletive against Elon Musk at G20 event: Janja Lula da Silva joked that she was not afraid of the owner of social media platform X.,0


In [39]:
import pandas as pd 
# Load the labelled news-sentiment CSV. This rebinds `data`, replacing the BBC
# frame built in the cells above.
data = pd.read_csv("data/news_sentiment_analysis.csv")
# Data from: https://www.kaggle.com/datasets/clovisdalmolinvieira/news-sentiment-analysis/data

In [40]:
# Keep only the columns needed to build the training text and label.
# `.copy()` gives an independent frame so adding the "News" and "Label" columns
# in the next cells does not raise SettingWithCopyWarning.
data = data[["Title","Description","Sentiment"]].copy()

In [41]:
# Preview the first rows of the Title/Description/Sentiment frame
data.head()

Unnamed: 0,Title,Description,Sentiment
0,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",positive
1,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",neutral
2,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,positive
3,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,negative
4,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,positive


In [42]:
# Combine title and description into a single "News" text column.
# NOTE(review): a missing Title or Description would make the combined value
# NaN — confirm the CSV has no missing values in these columns.
data["News"] = data["Title"] + ": " + data["Description"]

# Change labels from strings to integers
mapping = {
    'positive': 2,
    'negative': 0,
    'neutral': 1,
}

data["Label"] = data["Sentiment"].map(mapping)

# `.map` returns NaN for any value not in `mapping`; fail loudly rather than
# carrying silent NaN labels into training.
unmapped = data.loc[data["Label"].isna(), "Sentiment"].unique()
assert len(unmapped) == 0, f"Unmapped sentiment values: {unmapped}"

data.head()


Unnamed: 0,Title,Description,Sentiment,News,Label
0,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",positive,Pine View High teacher wins Best in State awar...,2
1,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",neutral,Businesses Face Financial Strain Amid Liquidit...,1
2,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,positive,Musk donates to super pac working to elect Tru...,2
3,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,negative,US FTC issues warning to franchisors over unfa...,0
4,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,positive,Rooftop solar's dark side: 4.5 million househo...,2


In [43]:
# Drop the source columns now that "News" and "Label" exist.
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, makes
# the cell safe to re-run: a second run no longer raises KeyError for columns
# that are already gone.
data = data.drop(columns=["Sentiment","Title","Description"], errors="ignore")

In [44]:
# Show the resulting News/Label frame
data

Unnamed: 0,News,Label
0,Pine View High teacher wins Best in State awar...,2
1,Businesses Face Financial Strain Amid Liquidit...,1
2,Musk donates to super pac working to elect Tru...,2
3,US FTC issues warning to franchisors over unfa...,0
4,Rooftop solar's dark side: 4.5 million househo...,2
...,...,...
3495,"Arrow Electronics, Inc. (NYSE:ARW) Shares Purc...",2
3496,"3,120 Shares in NICE Ltd. (NASDAQ:NICE) Bought...",2
3497,"QRG Capital Management Inc. Has $857,000 Stock...",2
3498,Biotechnology Market: Surging Investments and ...,1


In [30]:
# Imbalanced dataset: count how many rows carry each label value
data['Label'].value_counts()

2    2134
1     789
0     577
Name: Label, dtype: int64

Next step: clean the dataset that will be used for transfer learning.

In [45]:
# Rows currently labelled 2 (positive)
data.loc[data["Label"].eq(2)]

Unnamed: 0,News,Label
0,Pine View High teacher wins Best in State awar...,2
2,Musk donates to super pac working to elect Tru...,2
4,Rooftop solar's dark side: 4.5 million househo...,2
5,Gabelli asks Paramount for details on National...,2
6,QWI INVESTMENTS : QWI) &ndash; ANNOUNCEMENT RE...,2
...,...,...
3493,QRG Capital Management Inc. Decreases Stake in...,2
3495,"Arrow Electronics, Inc. (NYSE:ARW) Shares Purc...",2
3496,"3,120 Shares in NICE Ltd. (NASDAQ:NICE) Bought...",2
3497,"QRG Capital Management Inc. Has $857,000 Stock...",2


In [73]:
# Manual corrections to the dataset's labels, keyed by row index
label_corrections = {2: 0, 4: 0, 8: 1, 18: 1, 23: 0}
for row_id, label in label_corrections.items():
    data.at[row_id, 'Label'] = label

In [71]:
# Show every row labelled 2 in full, with no row, column, or cell-width truncation
full_view = ('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None)
with pd.option_context(*full_view):
    positive_rows = data[data["Label"] == 2]
    display(positive_rows)

Unnamed: 0,News,Label
0,"Pine View High teacher wins Best in State award for business leadership class: ST. GEORGE — Kaitlyn Larson, a first-year teacher at Pine View High School, recently received the Best in State award for a business leadership course. Larson’s students in grades 10-12 had the highest passing rate on the YouScience exam during the spring and fall semesters of 2023. The YouScience exam is similar to final exams [&#8230;]",2
5,"Gabelli asks Paramount for details on National Amusements valuation: (marketscreener.com) Billionaire investor Mario Gabelli'sinvestment firm is seeking more details about the valuation ofNational Amusements assets, the investor told Reuters on Friday,signaling the firm may challenge a landmark entertainmentindustry deal inked this week. Skydance Media and Paramount Global agreed onSunday to merge, in a deal that...https://www.marketscreener.com/business-leaders/Mario-Gabelli-1709/news/Gabelli-asks-Paramount-for-details-on-National-Amusements-valuation--47374649/?utm_medium=RSS&utm_content=20240712",2
6,"QWI INVESTMENTS : QWI) &ndash; ANNOUNCEMENT RE NET ASSET VALUE OF SHARES: (marketscreener.com) July 12, 2024 5:08 pm QWI announces that at the close of business on 12 July 2024 the Net Asset Value of QWI's shares was $1.32 per share. Attachments Original Link ...https://www.marketscreener.com/quote/stock/QWI-INVESTMENTS-LIMITED-134402695/news/QWI-INVESTMENTS-QWI-ANNOUNCEMENT-RE-NET-ASSET-VALUE-OF-SHARES-47374602/?utm_medium=RSS&utm_content=20240712",2
7,"Rome Resources Announces Shareholder Approval of Arrangement with Pathfinder Minerals: (marketscreener.com) Rome Resources Ltd. is pleased to announce that, at its special meeting of the holders of common shares held earlier today, Rome Shareholders approved a special resolution authorizing and approving the previously announced arrangement under section 288 of the Business Corporations Act among Rome, Pathfinder Minerals Plc and 1475033 B.C....https://www.marketscreener.com/quote/stock/ROME-RESOURCES-LTD-147174692/news/Rome-Resources-Announces-Shareholder-Approval-of-Arrangement-with-Pathfinder-Minerals-47374600/?utm_medium=RSS&utm_content=20240712",2
9,"What Makes Spynn Publicity The Top Choice For Australian Businesses: With creative strategies and deep industry knowledge, Spynn Publicity has established itself as a standout player in public relations.",2
10,"Quick Wits & Quicker Reflexes — See Also: I'll Drink To That: Kirland & Ellis partner flexes dexterity mid business call.Gotta Love Rankings!: Check out this list of amazing small, boutique, and midsize firms.Really Could Use More Details On The Bar: Legal educators show concern over what this means for students.Shaky Memory, Absent Law License: Florida Supreme Court strips former judge of his ability to practice.IMMUNITY! IMMUNITY!: Trump thinks yelling it repeatedly will get rid of his legal troubles.The post Quick Wits & Quicker Reflexes &#8212; See Also appeared first on Above the Law.",2
11,City of Poughkeepsie explores developing downtown Business Improvement District: POUGHKEEPSIE – The City of Poughkeepsie will be exploring the possibility of creating a Business Improvement District to breathe new life into the downtown retail corridor along Main Street. Mayor Yvonne Flowers and the common council has lent their support toward development of the BID. Funded by a special assessment on property owners in the [&#8230;],2
13,"Baltimore Biz Journal hires Terzi as research editor: The Baltimore Business Journal has hired Ben Terzi as its research editor. Terzi manages various databases to produce weekly Lists, the Book of Lists and other research projects for the Baltimore Business Journal. Prior to joining the Business Journal, he covered community news for The Dundalk Eagle and Avenue News. Born and raised in Baltimore, Terzi is [&#8230;]",2
15,"Bob Linda leaving his footprints in Solano: Footprints Floors of Solano is currently a one-man operation, but it hasn’t slowed Linda down. The self-proclaimed ‘workaholic’ is his own production manager, salesman and computer guy, but not for long. With business doing well, he says he’ll be able to add to the team in the near future.",2
16,"Bob Linda leaving his footprints in Solano: Footprints Floors of Solano is currently a one-man operation, but it hasn’t slowed Linda down. The self-proclaimed ‘workaholic’ is his own production manager, salesman and computer guy, but not for long. With business doing well, he says he’ll be able to add to the team in the near future.",2


In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed TrainingArguments declares, so the cell
# works on both older and newer versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Training configuration: 3 epochs, evaluation at the end of every epoch,
# a log entry every 10 steps, and no external reporting integrations.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],
    **{_eval_strategy_key: "epoch"},
)

def compute_metrics(pred):
    """Compute evaluation metrics from a prediction object.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the last
            axis).

    Returns:
        Dict with 'accuracy', weighted 'f1', and 'roc_auc'. 'roc_auc' is None
        unless the true labels contain exactly two distinct classes.
    """
    # NOTE(review): accuracy_score, f1_score and roc_auc_score are not imported
    # in the visible cells — presumably from sklearn.metrics; add the import to
    # the top import cell.
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')

    # Binary ROC AUC needs exactly two classes in the labels. With one class
    # roc_auc_score raises; with more than two, a single score column is not
    # meaningful.
    if len(np.unique(labels)) == 2:
        # Column 1 is used as the positive-class score
        # NOTE(review): assumes the two classes are 0 and 1 — confirm.
        roc_auc = roc_auc_score(labels, pred.predictions[:, 1])
    else:
        roc_auc = None

    return {
        'accuracy': accuracy,
        'f1': f1,
        'roc_auc': roc_auc
    }


# Assemble the Trainer from the model, arguments, datasets and metric function.
# NOTE(review): `train_dataset` and `val_dataset` are not defined in the visible
# cells — they must be built from `data` before this cell runs.
# NOTE(review): `model` is the 5-class sentiment model loaded earlier, while the
# prepared labels use 3 classes (0=negative, 1=neutral, 2=positive) — confirm
# the label scheme matches the model head before training.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)