# Imports & settings

In [None]:
#imports & plotting defaults
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting defaults: seaborn-flavoured matplotlib style and a wide default figure.
plt.style.use("seaborn-v0_8")
plt.rcParams.update({"figure.figsize": (10, 5)})


# Load data

In [3]:
# Path of the raw analyst-ratings CSV, relative to this notebook (edit if the file moves).
DATA_PATH = "../data/raw_analyst_ratings.csv"

# Read the file without promoting any column to the index.
df = pd.read_csv(filepath_or_buffer=DATA_PATH, index_col=None)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


# Basic info & cleaning

In [None]:
# Basic info, datatypes and missing values.
# A notebook cell only renders its final bare expression, so every intermediate
# result that should be visible is passed to display() explicitly.
df.info()
display(df.shape)
display(df.isna().sum())

# Drop the leftover "Unnamed: 0" column. errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# If 'headline' column exists, drop rows with missing headlines because we need text:
if 'headline' in df.columns:
    df = df.dropna(subset=['headline']).reset_index(drop=True)


# Headline length stats

In [None]:
# Headline length statistics: characters and words per headline.
# astype(str) coerces every cell to text so the .str accessor can be used.
headline_text = df['headline'].astype(str)

# Character count per headline, plus a one-row summary table.
df['headline_length'] = headline_text.str.len()
display(df['headline_length'].describe().to_frame().T)

# Histogram of headline lengths (characters).
# Each plot gets its own figure/axes so it never draws onto a previous
# figure that the active backend may have left open after plt.show().
fig, ax = plt.subplots()
sns.histplot(df['headline_length'], bins=30, kde=True, ax=ax)
ax.set_title("Headline length distribution")
ax.set_xlabel("Characters in headline")
plt.show()

# Word count per headline (whitespace-separated tokens) and its histogram.
df['headline_word_count'] = headline_text.str.split().str.len()
fig, ax = plt.subplots()
sns.histplot(df['headline_word_count'], bins=30, ax=ax)
ax.set_title("Headline word count distribution")
ax.set_xlabel("Words in headline")
plt.show()


# Count Articles Per Publisher

In [None]:
# Publisher counts: tabulate and chart the 20 most frequent publishers.
if 'publisher' not in df.columns:
    print("No 'publisher' column found.")
else:
    # Publishers are compared as strings; value_counts() sorts by frequency, descending.
    pub_counts = df['publisher'].astype(str).value_counts()
    top_publishers = pub_counts.head(20)
    display(top_publishers)

    # Horizontal bars: publisher names on the y-axis, article counts on the x-axis.
    plt.figure()
    sns.barplot(y=top_publishers.index, x=top_publishers.values)
    plt.title("Top 20 publishers by article count")
    plt.xlabel("Number of articles")
    plt.ylabel("Publisher")
    plt.show()


# Convert date column

In [None]:
# Parse the 'date' column into UTC datetimes, then store the result back on the frame.
parsed_dates = pd.to_datetime(df["date"], format="mixed", utc=True)
df["date"] = parsed_dates


# Articles per day

In [None]:
# Bucket rows into calendar days on the 'date' column and count non-null values per column.
daily_counts = df.resample("D", on="date").count()

# Line chart of the per-day headline count.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(daily_counts["headline"])
ax.set_title("Articles per Day")
ax.set_ylabel("Count")
plt.show()