## Importing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

## EDA Class Declaration

In [None]:
class EDA:
    """Exploratory-data-analysis helper for a news CSV and per-ticker stock CSVs.

    The news file is read into ``news_df``; every ``*_historical_data.csv``
    file found in ``stocks_dir`` is read into ``stocks_data`` keyed by the
    part of the file name before the first underscore.
    """

    def __init__(self, news_path: str, stocks_dir: str):
        """Store the input locations; nothing is read from disk yet.

        Args:
            news_path: Path to the news CSV (must contain a ``date`` column).
            stocks_dir: Directory scanned for ``*_historical_data.csv`` files.
        """
        self.news_path = Path(news_path)
        self.stocks_dir = Path(stocks_dir)
        # Filled by load_news(); stays None until then.
        self.news_df = None
        # Ticker -> DataFrame, filled by load_stocks().
        self.stocks_data = {}

    def _loaded_news(self):
        """Return the news frame, reading it from disk on first use."""
        if self.news_df is None:
            self.load_news()
        return self.news_df

    def load_news(self):
        """Read the news CSV and parse its ``date`` column.

        Values that cannot be parsed become ``NaT`` instead of raising; when
        any exist, a warning and a five-row sample are shown.

        Returns:
            The loaded DataFrame (also stored on ``self.news_df``).
        """
        self.news_df = pd.read_csv(self.news_path)
        # `infer_datetime_format` was dropped: it is deprecated in pandas 2.x,
        # where format inference is already the default behaviour.
        # NOTE(review): on pandas >= 2 the format is inferred from the first
        # value and rows in a different format are coerced to NaT; if the file
        # really mixes formats, consider format="mixed" — confirm with the data.
        self.news_df['date'] = pd.to_datetime(
            self.news_df['date'],
            errors='coerce',  # unparseable strings become NaT instead of crashing
            utc=False,        # set True if the timestamps are known to be UTC
        )
        unparsed = self.news_df[self.news_df['date'].isna()]
        if not unparsed.empty:
            print(f"Warning: {len(unparsed)} headline dates failed to parse. Sample:")
            # `display` only exists inside IPython/Jupyter; fall back to print
            # so the loader also works from a plain script.
            try:
                show = display  # noqa: F821 - injected by IPython
            except NameError:
                show = print
            show(unparsed.head(5))
        return self.news_df

    def load_stocks(self):
        """Read every ``*_historical_data.csv`` file in ``stocks_dir``.

        Each frame gets its ``Date`` column parsed and is sorted by it.

        Returns:
            The ``stocks_data`` dict mapping ticker to DataFrame.
        """
        # Sorted so the dict order does not depend on filesystem listing order.
        for csv_path in sorted(self.stocks_dir.glob("*_historical_data.csv")):
            ticker = csv_path.stem.split("_")[0]
            prices = pd.read_csv(csv_path)
            prices['Date'] = pd.to_datetime(prices['Date'])
            self.stocks_data[ticker] = prices.sort_values('Date')
        return self.stocks_data

    def describe_news(self):
        """Print structure, summary statistics and top publishers of the news data."""
        news = self._loaded_news()
        print("\n--- News Data Overview ---")
        # DataFrame.info() prints by itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        news.info()
        print(news.describe(include='all'))
        print("\nTop Publishers:")
        print(news['publisher'].value_counts().head())

    def plot_news_frequency(self):
        """Show a bar chart of headline counts per stock, most frequent first."""
        news = self._loaded_news()
        by_frequency = news['stock'].value_counts().index
        plt.figure(figsize=(10, 4))
        sns.countplot(data=news, x='stock', order=by_frequency)
        plt.title("News Count by Stock")
        plt.xticks(rotation=45)
        plt.show()

    def describe_stocks(self):
        """Print summary statistics and the date range of every loaded ticker."""
        for ticker, df in self.stocks_data.items():
            print(f"\n--- {ticker} ---")
            print(df.describe())
            print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

    def plot_stock_close(self, ticker):
        """Plot the closing price of one loaded ticker over time.

        Args:
            ticker: Key of ``stocks_data`` to plot.

        Raises:
            KeyError: If no data has been loaded for ``ticker``.
        """
        try:
            df = self.stocks_data[ticker]
        except KeyError:
            raise KeyError(
                f"No stock data loaded for ticker {ticker!r}; "
                f"available: {sorted(self.stocks_data)}"
            ) from None
        plt.figure(figsize=(10, 4))
        plt.plot(df['Date'], df['Close'], label=ticker)
        plt.title(f"{ticker} Closing Price Over Time")
        plt.xlabel("Date")
        plt.ylabel("Close Price")
        plt.legend()
        plt.show()

## Class usage

In [None]:
# Point the helper at the analyst-ratings CSV and at the directory that
# load_stocks() scans for "*_historical_data.csv" files.
eda = EDA("../data/raw_analyst_ratings.csv", "../data/")
# Load both data sets first: the describe/plot calls below read the frames
# these two calls store on the instance.
eda.load_news()
eda.load_stocks()

# News overview, then headline counts per stock.
eda.describe_news()
eda.plot_news_frequency()

# Per-ticker summaries, then the closing-price chart for a single ticker.
eda.describe_stocks()
# Raises KeyError if load_stocks() stored no entry under "AAPL".
eda.plot_stock_close("AAPL")