In [1]:
from datetime import datetime

import feedparser
import pandas as pd
import yaml
from bs4 import BeautifulSoup

In [2]:
def load_config(sector):
    config_file = f"{sector}.yaml"
    with open(config_file, "r") as file:
        rss_dict = yaml.safe_load(file)

    return rss_dict

In [3]:
def clean_text(raw_html):
    cleantext = BeautifulSoup(raw_html, "lxml").text
    return cleantext


def fetch_news(rss_dict):
    cols = [
        "ticker",
        "title",
        "summary",
        "published_gmt",
        "description",
        "link",
        "language",
        "topic",
        "sector",
    ]
    all_news_items = []

    current_time = datetime.now()
    print(f"Starting new iteration at {current_time}")

    for key, rss_url in rss_dict.items():
        # print(f"Fetching news for ticker: {key}")
        feed = feedparser.parse(rss_url)

        for newsitem in feed["items"]:
            last_subject = (
                newsitem["tags"][-1]["term"]
                if "tags" in newsitem and newsitem["tags"]
                else None
            )
            all_news_items.append(
                {
                    "ticker": key,
                    "title": newsitem["title"],
                    "summary": clean_text(newsitem["summary"]),
                    "published_gmt": newsitem["published"],
                    "description": clean_text(newsitem["description"]),
                    "link": newsitem["link"],
                    "language": newsitem.get(
                        "dc_language", None
                    ),  # Extracted language from the provided feed
                    "topic": last_subject,
                }
            )

    return pd.DataFrame(all_news_items, columns=cols)

In [4]:
sector = "biotech"
print(f"Fetching news for sector: {sector}")

rss_dict = load_config(sector)
print(rss_dict)

Fetching news for sector: biotech
{'FBIO': 'https://www.globenewswire.com/rssfeed/organization/Zl-bHuvT3qJD7_c31F0d1w==', 'KA': 'https://www.globenewswire.com/rssfeed/organization/GRExm2xRyMBOVUvowi5oSA==', 'QGEN': 'https://www.globenewswire.com/rssfeed/organization/ObYQ7Np8dbpjWSsUartQVA==', 'DYAI': 'https://www.globenewswire.com/rssfeed/organization/PZT53qVKTCNqeqiVtTKL3w==', 'JSPR': 'https://www.globenewswire.com/rssfeed/organization/IEZm99vVrVVCb3DyKCTR5w==', 'ANAB': 'https://www.globenewswire.com/rssfeed/organization/Ji5S9zuTCO02Ajd_V5I3Rw==', 'ECOR': 'https://www.globenewswire.com/rssfeed/organization/Uxa3Kjdv9pQ0hbvyFX0QiA==', 'ELOX': 'https://www.globenewswire.com/rssfeed/organization/zb7Vep8ReOhaPqqjjS7uNQ==', 'MDWD': 'https://www.globenewswire.com/rssfeed/organization/JNTLkaPBI2Jd7IIL2hLwJg==', 'EYEN': 'https://www.globenewswire.com/rssfeed/organization/r08K-OyBIlTfBByhoeiNHw==', 'PYPD': 'https://www.globenewswire.com/rssfeed/organization/jgple-INo8rwkSygwj0BYg==', 'SCLX': 'h

In [5]:
news_df = fetch_news(rss_dict)
news_df.describe()

Starting new iteration at 2024-11-10 11:13:42.997470


  cleantext = BeautifulSoup(raw_html, "lxml").text


Unnamed: 0,sector
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,


In [6]:
news_df["published_gmt"] = pd.to_datetime(
    news_df["published_gmt"], format="%a, %d %b %Y %H:%M %Z", utc=True
)
# news_df = news_df.sort_values(by="published_gmt", ascending=False)
news_df.head()

Unnamed: 0,ticker,title,summary,published_gmt,description,link,language,topic,sector
0,FBIO,Fortress Biotech to Participate in October 202...,"MIAMI, Sept. 26, 2024 (GLOBE NEWSWIRE) -- Fo...",2024-09-26 12:30:00+00:00,"MIAMI, Sept. 26, 2024 (GLOBE NEWSWIRE) -- Fo...",https://www.globenewswire.com/news-release/202...,,Calendar of Events,
1,FBIO,Fortress Biotech Announces Pricing of $8 Milli...,"MIAMI, Sept. 20, 2024 (GLOBE NEWSWIRE) -- Fo...",2024-09-20 12:30:00+00:00,"MIAMI, Sept. 20, 2024 (GLOBE NEWSWIRE) -- Fo...",https://www.globenewswire.com/news-release/202...,,Financing Agreements,
2,FBIO,Fortress Biotech to Present at the H.C. Wainwr...,"MIAMI, Sept. 04, 2024 (GLOBE NEWSWIRE) -- Fo...",2024-09-04 12:30:00+00:00,"MIAMI, Sept. 04, 2024 (GLOBE NEWSWIRE) -- Fo...",https://www.globenewswire.com/news-release/202...,,Calendar of Events,
3,FBIO,Fortress Biotech Reports Second Quarter 2024 F...,"PDUFA goal date of November 4, 2024 for DFD-29...",2024-08-13 20:05:00+00:00,"PDUFA goal date of November 4, 2024 for DFD-29...",https://www.globenewswire.com/news-release/202...,,Earnings Releases and Operating Results,
4,FBIO,Fortress Biotech Reduces Total Debt and Enters...,Extends maturity of long-term debt as Fortress...,2024-07-25 12:30:00+00:00,Extends maturity of long-term debt as Fortress...,https://www.globenewswire.com/news-release/202...,,Major shareholder announcements,


In [7]:
len(news_df)

2561

### MERGE


In [8]:
old_df = pd.read_csv("/Users/akseljoonas/Documents/predtrade/news_data_oct.csv")

In [9]:

old_df.head()
len(old_df)

3836

In [10]:
merged_df = pd.concat([old_df, news_df]).drop_duplicates(subset='link').reset_index(drop=True)

In [11]:
len(merged_df)

4236

In [12]:
merged_df.to_csv(
    "/Users/akseljoonas/Documents/predtrade/news_data_nov.csv", index=False
)