In [4]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import yfinance as yf
import talib
import os


##### Load raw news data (update path if necessary)

In [7]:
# Relative path to the raw analyst-ratings CSV; edit this one constant if the
# data lives somewhere else.
NEWS_CSV_PATH = '../data/raw_analyst_ratings.csv'
df_news = pd.read_csv(NEWS_CSV_PATH)

##### Descriptive statistics: headline length

In [8]:
# Character count of each headline. The vectorized `.str.len()` accessor gives
# NaN for a missing headline, whereas `.apply(len)` raises TypeError on the
# float NaN that read_csv produces for an empty cell.
df_news['headline_length'] = df_news['headline'].str.len()
print(df_news['headline_length'].describe())

count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64


In [10]:
def extract_publisher(text):
    """Label a headline with a rating firm name or a coarse news category.

    The argument is converted with ``str()`` first, so non-string values
    (``None``, NaN, numbers) are classified by their string form.

    Args:
        text: Headline to classify.

    Returns:
        str: If the headline begins with a run of letters, whitespace or ``&``
        followed by a rating verb (Maintains, Downgrades, Initiates, Upgrades,
        Reports), the stripped text before the FIRST such verb. Otherwise the
        first matching category, checked in this order: "Market Commentary",
        "Stock Summary", "Pershing Square", "FDA/Health News"; "Other" when
        nothing matches.
    """
    headline = str(text)

    # The name group is lazy (`+?`) so it stops at the first rating verb. A
    # greedy `+` would swallow everything up to the last verb in the leading
    # letter run, e.g. "X Maintains Buy as Firm Reports" -> "X Maintains Buy as Firm".
    match = re.match(
        r"^([A-Za-z\s&]+?)\s(Maintains|Downgrades|Initiates|Upgrades|Reports)",
        headline,
    )
    if match:
        return match.group(1).strip()

    if "shares are trading" in headline:
        return "Market Commentary"
    if "Stocks That Hit" in headline:
        return "Stock Summary"
    if "Pershing Square" in headline:
        return "Pershing Square"
    # "FDA" is matched case-sensitively; "approval" is matched in any case.
    if "FDA" in headline or "approval" in headline.lower():
        return "FDA/Health News"
    return "Other"

# Store the label returned by extract_publisher for every headline in a
# 'publisher' column.
headline_series = df_news['headline']
df_news['publisher'] = headline_series.apply(extract_publisher)


In [None]:
# Count headlines per publisher label and draw them as a bar chart on an
# explicitly created figure/axes pair.
publisher_counts = df_news['publisher'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
publisher_counts.plot(kind='bar', title='Articles per Publisher', ax=ax)
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Common function words excluded from the keyword counts. Built once here
# rather than on every call, because clean_text runs once per headline.
_STOPWORDS = frozenset({
    'the', 'to', 'of', 'and', 'on', 'in', 'for', 'as',
    'with', 'a', 'are', 'is', 'that', 'has',
})

# Matches every character that is not a lowercase ASCII letter or whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def clean_text(text):
    """Split a headline into lowercase keyword tokens.

    The input is converted with ``str()``, lower-cased, and stripped of every
    character other than ``a``-``z`` and whitespace. Removed characters are
    deleted, not replaced by a space, so "at&t" becomes "att".

    Args:
        text: Headline to tokenize; any value accepted by ``str()``.

    Returns:
        list[str]: Whitespace-separated tokens, in their original order, that
        are longer than two characters and not in the stopword set.
    """
    letters_only = _NON_LETTER_RE.sub('', str(text).lower())
    return [
        word
        for word in letters_only.split()
        if word not in _STOPWORDS and len(word) > 2
    ]

# Tokenize every headline, then pool the per-headline token lists into one
# flat list and count how often each token occurs.
df_news['tokens'] = df_news['headline'].apply(clean_text)
all_tokens = []
for headline_tokens in df_news['tokens']:
    all_tokens.extend(headline_tokens)
word_freq = Counter(all_tokens)

# Horizontal bar chart of the 20 most frequent tokens (keyword on the y-axis,
# frequency on the x-axis).
top_keywords = dict(word_freq.most_common(20))
keyword_labels = list(top_keywords)
keyword_counts = [top_keywords[label] for label in keyword_labels]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=keyword_counts, y=keyword_labels, ax=ax)
ax.set_title('Top 20 Keywords in Headlines')
ax.set_xlabel('Frequency')
ax.set_ylabel('Keyword')
fig.tight_layout()
plt.show()