In [1]:
from datetime import datetime

import feedparser
import pandas as pd
import yaml
from bs4 import BeautifulSoup
from dateutil import parser
import pytz

In [7]:
def load_config(sector):
    """Load the feed configuration for one sector from a YAML file.

    Args:
        sector: Base name of the config file; ``f"{sector}.yaml"`` is opened,
            so a relative name resolves against the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.

    Raises:
        FileNotFoundError: If ``{sector}.yaml`` does not exist.
    """
    config_file = f"{sector}.yaml"
    # Explicit UTF-8: the default encoding of open() depends on the platform
    # locale, which can garble or reject non-ASCII characters in the config.
    with open(config_file, "r", encoding="utf-8") as file:
        rss_dict = yaml.safe_load(file)

    return rss_dict

In [8]:

def clean_text(raw_html: str) -> str:
    """Return the text content of an HTML fragment with the markup removed.

    Args:
        raw_html: Markup to parse; it is handed to BeautifulSoup using the
            ``lxml`` parser.

    Returns:
        The ``.text`` of the parsed document.
    """
    soup = BeautifulSoup(raw_html, "lxml")
    return soup.text


def _published_to_eastern(published_raw, eastern, ticker):
    """Format a feed entry's ``published`` string as a US/Eastern timestamp.

    Returns None when the value is missing or cannot be parsed.
    """
    if not published_raw:
        return None

    try:
        published_dt = parser.parse(published_raw)
    except (ValueError, OverflowError):
        # One malformed date should not abort the whole fetch; report it and
        # treat the entry like one without a publication date.
        print(f"Skipping unparsable publication date {published_raw!r} for {ticker}")
        return None

    # A timestamp without an offset would be interpreted by astimezone() as
    # *system local* time, making the result machine-dependent. Feed dates
    # are treated as GMT here, so pin naive values to UTC explicitly.
    if published_dt.utcoffset() is None:
        published_dt = published_dt.replace(tzinfo=pytz.utc)

    return published_dt.astimezone(eastern).strftime("%a, %d %b %Y %H:%M %Z")


def fetch_news(rss_dict: dict) -> pd.DataFrame:
    """Fetch every configured RSS feed and flatten the entries into one table.

    Args:
        rss_dict: Mapping whose keys are stored in the ``ticker`` column and
            whose values are passed to ``feedparser.parse``.

    Returns:
        A DataFrame with one row per feed entry and the columns ``ticker``,
        ``title``, ``summary``, ``published_eastern``, ``link``, ``language``,
        ``topic`` and ``keywords``. ``topic`` is the term of the entry's last
        tag, ``published_eastern`` is a formatted string; fields an entry
        does not provide are None (``summary`` is the cleaned text of an
        empty string in that case).
    """
    cols = [
        "ticker",
        "title",
        "summary",
        "published_eastern",
        "link",
        "language",
        "topic",
        "keywords",
    ]
    all_news_items = []

    # Loop-invariant: build the target timezone once, not once per entry.
    eastern = pytz.timezone("US/Eastern")

    current_time = datetime.now()
    print(f"Starting new iteration at {current_time}")

    for key, rss_url in rss_dict.items():
        feed = feedparser.parse(rss_url)

        for newsitem in feed["items"]:
            tags = newsitem.get("tags")
            last_subject = tags[-1].get("term") if tags else None

            all_news_items.append(
                {
                    "ticker": key,
                    "title": newsitem.get("title"),
                    "summary": clean_text(newsitem.get("summary") or ""),
                    "published_eastern": _published_to_eastern(
                        newsitem.get("published"), eastern, key
                    ),
                    "link": newsitem.get("link"),
                    "language": newsitem.get("language"),
                    "topic": last_subject,
                    # Stored as delivered by the feed (None when absent).
                    "keywords": newsitem.get("dc_keyword"),
                }
            )

    return pd.DataFrame(all_news_items, columns=cols)


In [None]:
# Sector whose feed list is loaded; load_config opens "<sector>.yaml".
sector = "biotech_validated"
print(f"Fetching news for sector: {sector}")

# Load the feed mapping and report how many entries it contains.
rss_dict = load_config(sector)
print(len(rss_dict))


In [None]:
# Fetch every configured feed and collect the entries into one DataFrame.
news_df = fetch_news(rss_dict)

In [None]:
# Preview the first rows of the freshly fetched items.
news_df.head()

In [None]:
# Number of items fetched in this run.
len(news_df)

### Merge new items with the existing dataset


In [13]:
# Read the existing news CSV; it is concatenated with news_df further down.
# NOTE(review): absolute, user-specific path — this only resolves on the
# original author's machine; consider a configurable data directory.
old_df = pd.read_csv(
    "/Users/akseljoonas/Documents/news-sentiment/data/raw/news_validated2.csv"
)

In [None]:
# Preview the first rows of the loaded CSV.
old_df.head()

In [None]:
# Stack the existing rows on top of the new ones and keep a single row per
# link. drop_duplicates keeps the first occurrence, so rows from old_df win.
merged_df = pd.concat([old_df, news_df])
merged_df = merged_df.drop_duplicates(subset="link")
merged_df = merged_df.reset_index(drop=True)

# Keep rows whose language is 'en' as well as rows with no language value.
mask = merged_df["language"].isna() | merged_df["language"].eq("en")
merged_df = merged_df.loc[mask].reset_index(drop=True)

# Sanity check: show which language values remain after filtering.
print(merged_df["language"].unique())

In [None]:
# Row count after de-duplication and language filtering.
len(merged_df)

In [None]:
# Preview the merged result.
merged_df.head()

In [None]:
# Persist the merged table without the index column. The target file name
# differs from the CSV read earlier, so the input file is not overwritten.
# NOTE(review): absolute, user-specific path — confirm or make configurable.
merged_df.to_csv(
    "/Users/akseljoonas/Documents/news-sentiment/data/raw/news_validated2-prices.csv",
    index=False,
)