In [1]:
from datetime import datetime

import feedparser
import pandas as pd
import yaml
from bs4 import BeautifulSoup
from dateutil import parser
import pytz

In [2]:
def load_config(sector):
    """Load the RSS feed mapping for a sector from its YAML file.

    The file is looked up as ``"<sector>.yaml"`` relative to the current
    working directory.

    Args:
        sector: Base name of the YAML file (without the ``.yaml`` suffix).

    Returns:
        The parsed YAML content. An empty file yields an empty dict so that
        callers can safely use ``len()`` / ``.items()`` on the result.

    Raises:
        FileNotFoundError: If ``<sector>.yaml`` does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    config_file = f"{sector}.yaml"
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # YAML files are conventionally UTF-8.
    with open(config_file, "r", encoding="utf-8") as file:
        rss_dict = yaml.safe_load(file)

    # safe_load returns None for an empty document.
    if rss_dict is None:
        return {}
    return rss_dict

In [3]:

def clean_text(raw_html: str) -> str:
    """Return the plain-text content of an HTML fragment.

    The fragment is parsed with BeautifulSoup using the "lxml" parser and the
    text of the resulting document is returned.
    """
    soup = BeautifulSoup(raw_html, "lxml")
    return soup.text


def _format_published_eastern(published, eastern, fmt="%a, %d %b %Y %H:%M %Z"):
    """Convert a feed's raw publication string to a US/Eastern display string.

    Args:
        published: Raw date string from the feed entry, or None/empty.
        eastern: Target tzinfo object (the caller passes pytz's US/Eastern).
        fmt: ``strftime`` format used for the returned string.

    Returns:
        The formatted timestamp, or None when the date is missing or cannot
        be parsed.
    """
    if not published:
        return None

    try:
        published_dt = parser.parse(published)
    except (ValueError, OverflowError):
        # A single malformed date must not abort the whole crawl; treat it
        # the same way as a missing date.
        return None

    if published_dt.tzinfo is None:
        # astimezone() on a naive datetime assumes the *system local* zone,
        # which would make the result depend on the machine running the
        # notebook. The feeds are expected to publish in GMT, so assume UTC.
        published_dt = published_dt.replace(tzinfo=pytz.utc)

    return published_dt.astimezone(eastern).strftime(fmt)


def fetch_news(rss_dict: dict) -> pd.DataFrame:
    """Download every RSS feed in ``rss_dict`` and collect the items in a DataFrame.

    Args:
        rss_dict: Mapping whose keys are stored in the ``ticker`` column and
            whose values are the feed URLs passed to ``feedparser.parse``.

    Returns:
        A DataFrame with one row per feed item and the columns ``ticker``,
        ``title``, ``summary``, ``published_eastern``, ``link``, ``language``,
        ``topic`` and ``keywords``. Fields absent from an item are None
        (``summary`` becomes an empty string). ``published_eastern`` is a
        string such as ``"Thu, 13 Mar 2025 07:00 EDT"``.
    """
    cols = [
        "ticker",
        "title",
        "summary",
        "published_eastern",
        "link",
        "language",
        "topic",
        "keywords",
    ]
    # Loop-invariant: resolve the target time zone once, not once per item.
    eastern = pytz.timezone("US/Eastern")
    all_news_items = []

    current_time = datetime.now()
    print(f"Starting new iteration at {current_time}")

    for key, rss_url in rss_dict.items():
        feed = feedparser.parse(rss_url)

        for newsitem in feed["items"]:
            # The topic is the term of the last tag, when the item has any.
            tags = newsitem.get("tags")
            last_subject = tags[-1].get("term") if tags else None

            all_news_items.append(
                {
                    "ticker": key,
                    "title": newsitem.get("title", None),
                    # `or ""` also covers a summary that is present but None.
                    "summary": clean_text(newsitem.get("summary") or ""),
                    "published_eastern": _format_published_eastern(
                        newsitem.get("published", None), eastern
                    ),
                    "link": newsitem.get("link", None),
                    "language": newsitem.get("language", None),
                    "topic": last_subject,
                    # The keyword string is stored as delivered by the feed.
                    "keywords": newsitem.get("dc_keyword"),
                }
            )

    return pd.DataFrame(all_news_items, columns=cols)


In [4]:
# Name of the sector whose feed list is crawled; load_config reads "<sector>.yaml".
sector = "biotech_validated"
print(f"Fetching news for sector: {sector}")

# Mapping of ticker -> RSS feed URL, as consumed by fetch_news.
rss_dict = load_config(sector)
feed_count = len(rss_dict)
print(feed_count)


Fetching news for sector: biotech_validated
126


In [5]:
# Download and parse every feed in rss_dict into one DataFrame (one row per news item).
news_df = fetch_news(rss_dict)

Starting new iteration at 2025-03-19 18:21:46.984727


  cleantext = BeautifulSoup(raw_html, "lxml").text


In [6]:
# Preview the first rows of the freshly fetched news.
news_df.head()

Unnamed: 0,ticker,title,summary,published_eastern,link,language,topic,keywords
0,ACIU,AC Immune Reports Full Year 2024 Financial Res...,AC Immune Reports Full Year 2024 Financial Res...,"Thu, 13 Mar 2025 07:00 EDT",https://www.globenewswire.com/news-release/202...,en,Earnings Releases and Operating Results,Corporate Update
1,ACIU,AC Immune to Participate in Upcoming Investor ...,AC Immune to Participate in Upcoming Investor ...,"Tue, 04 Mar 2025 07:00 EST",https://www.globenewswire.com/news-release/202...,en,Calendar of Events,Healthcare Conference
2,ACIU,AC Immune Reports Interim Safety Data from Pha...,AC Immune Reports Interim Safety Data from Pha...,"Tue, 10 Dec 2024 07:00 EST",https://www.globenewswire.com/news-release/202...,en,Company Announcement,Down syndrome
3,ACIU,AC Immune Reports Positive Interim Results fro...,AC Immune Reports Positive Interim Results fro...,"Thu, 14 Nov 2024 07:00 EST",https://www.globenewswire.com/news-release/202...,en,Press releases,ACI-7104.056
4,ACIU,AC Immune to Present at the Jefferies 2024 Lon...,AC Immune to Present at the Jefferies 2024 Lon...,"Wed, 13 Nov 2024 07:00 EST",https://www.globenewswire.com/news-release/202...,en,Calendar of Events,Investor Conference


In [7]:
# Number of news items fetched in this run.
len(news_df)

2501

### Merge the fetched news into the existing dataset


In [8]:
# Load the existing news dataset that the fetched items are merged into below.
# NOTE(review): this is an absolute, machine-specific path.
old_news_path = "/Users/akseljoonas/Documents/news-sentiment/data/raw/news_validated-new.csv"
old_df = pd.read_csv(old_news_path)

In [9]:
# Preview the dataset loaded from the CSV file.
old_df.head()

Unnamed: 0,ticker,title,summary,link,topic,published_eastern,language,keywords
0,GRFS,GigaGen Initiates Development of Recombinant P...,"SOUTH SAN FRANCISCO, Calif., March 30, 2020 ...",https://www.globenewswire.com/news-release/202...,Company Announcement,2020-03-30 04:17:00-0400,,
1,GRFS,GigaGen to Present at the Inaugural LifeSci Pa...,"SOUTH SAN FRANCISCO, Calif., July 29, 2020 (...",https://www.globenewswire.com/news-release/202...,Company Announcement,2020-07-29 03:30:00-0400,,
2,GRFS,GigaGen Announces Publication of Research Desc...,"SOUTH SAN FRANCISCO, Calif., Aug. 10, 2020 (...",https://www.globenewswire.com/news-release/202...,Company Announcement,2020-08-10 04:17:00-0400,en,
3,GRFS,GigaGen Initiates Large-Scale Manufacturing of...,"SOUTH SAN FRANCISCO, Calif., Sept. 09, 2020 ...",https://www.globenewswire.com/news-release/202...,Product / Services Announcement,2020-09-09 04:17:00-0400,en,COVID-19
4,GRFS,GigaGen Announces Publication in the Peer-Revi...,Article Details GigaGen’s Affinity Maturation ...,https://www.globenewswire.com/news-release/202...,Health,2020-10-20 04:17:00-0400,en,


In [10]:
# Stack the existing dataset on top of the fresh fetch and keep one row per link.
# drop_duplicates keeps the first occurrence, so when a link appears in both
# frames the row from old_df wins.
# NOTE(review): drop_duplicates treats missing values as equal, so rows without
# a link collapse into a single row.
# NOTE(review): the previews above show published_eastern in two different
# string formats (old_df: "2020-03-30 04:17:00-0400", news_df:
# "Thu, 13 Mar 2025 07:00 EDT"); nothing here normalizes them.
combined_df = pd.concat([old_df, news_df])
merged_df = combined_df.drop_duplicates(subset="link").reset_index(drop=True)

# Keep only rows whose language is missing or 'en'.
language = merged_df["language"]
mask = language.isna() | (language == "en")
merged_df = merged_df[mask].reset_index(drop=True)

# Sanity check: the remaining language values.
print(merged_df["language"].unique())

[nan 'en']


In [11]:
# Row count after de-duplication by link and language filtering.
len(merged_df)

4994

In [12]:
# Preview the merged dataset.
merged_df.head()

Unnamed: 0,ticker,title,summary,link,topic,published_eastern,language,keywords
0,GRFS,GigaGen Initiates Development of Recombinant P...,"SOUTH SAN FRANCISCO, Calif., March 30, 2020 ...",https://www.globenewswire.com/news-release/202...,Company Announcement,2020-03-30 04:17:00-0400,,
1,GRFS,GigaGen to Present at the Inaugural LifeSci Pa...,"SOUTH SAN FRANCISCO, Calif., July 29, 2020 (...",https://www.globenewswire.com/news-release/202...,Company Announcement,2020-07-29 03:30:00-0400,,
2,GRFS,GigaGen Announces Publication of Research Desc...,"SOUTH SAN FRANCISCO, Calif., Aug. 10, 2020 (...",https://www.globenewswire.com/news-release/202...,Company Announcement,2020-08-10 04:17:00-0400,en,
3,GRFS,GigaGen Initiates Large-Scale Manufacturing of...,"SOUTH SAN FRANCISCO, Calif., Sept. 09, 2020 ...",https://www.globenewswire.com/news-release/202...,Product / Services Announcement,2020-09-09 04:17:00-0400,en,COVID-19
4,GRFS,GigaGen Announces Publication in the Peer-Revi...,Article Details GigaGen’s Affinity Maturation ...,https://www.globenewswire.com/news-release/202...,Health,2020-10-20 04:17:00-0400,en,


In [13]:
# Write the merged dataset to a separate file, leaving the input CSV untouched.
# index=False keeps the positional index out of the output.
# NOTE(review): this is an absolute, machine-specific path.
merged_news_path = "/Users/akseljoonas/Documents/news-sentiment/data/raw/news_validated-new-new.csv"
merged_df.to_csv(merged_news_path, index=False)