In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Loading the data and basic information about it

In [None]:
# Read the raw analyst-ratings CSV into the working DataFrame.
raw_ratings_csv = "../data/raw_analyst_ratings.csv"
df = pd.read_csv(raw_ratings_csv)

# Shape of the data, overview, missing values

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
df.head()

In [None]:
# Headline numbers for the dataset: row count and column count.
n_records, n_columns = df.shape
print(" DATASET OVERVIEW")
print(f"Total records: {n_records:,}")
print(f"Total columns: {n_columns}")

In [None]:
# List the column labels as a plain Python list.
column_names = df.columns.tolist()
print("COLUMN NAMES")
print(column_names)

In [None]:
# Per-column summary: dtype, missing-value count and share, and cardinality.
print("DATA TYPES & MISSING VALUES")
null_counts = df.isnull().sum()
null_share = (null_counts / len(df) * 100).round(2)
info_df = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': null_counts,
    'Missing %': null_share,
    'Unique Values': df.nunique(),
})
display(info_df)

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
print("BASIC STATISTICS")
# Use rich display rather than print(): print() emits a plain-text repr,
# while display() renders the frame as a table in the notebook.
display(df.describe(include='all'))

# Show the first few headlines, numbered from 1.
print("SAMPLE HEADLINES")
for i, headline in enumerate(df['headline'].head(5), start=1):
    print(f"{i}. {headline}")

In [None]:
# Data quality checks: duplicate rows, missing/blank headlines, and date parsing.
print("DATA QUALITY CHECKS")


duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")


# Count headlines that are missing *or* contain only whitespace; isna() alone
# would treat blank strings as valid headlines.
headline_text = df['headline']
is_blank = headline_text.isna() | headline_text.astype(str).str.strip().eq('')
empty_headlines = is_blank.sum()
print(f"Empty headlines: {empty_headlines}")


if 'date' in df.columns:
    # errors='coerce' turns unparseable values into NaT instead of raising, so
    # they can be counted. The count also includes dates that were missing
    # in the raw data.
    # NOTE(review): depending on the pandas version, strings whose format differs
    # from the inferred one may also be coerced to NaT — confirm the count is
    # not inflated by mixed date formats.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    invalid_dates = df['date'].isna().sum()
    print(f"Invalid dates: {invalid_dates}")

    # min/max skip NaT, so report the range whenever at least one date parsed
    # instead of hiding it as soon as a single row is invalid.
    if df['date'].notna().any():
        date_range = df['date'].agg(['min', 'max'])
        print(f"Date range: {date_range['min']} to {date_range['max']}")

Descriptive Statistics

In [None]:
# Headline size features: character count and whitespace-delimited word count.
headline_series = df['headline']
df['headline_length'] = headline_series.str.len()
df['word_count'] = headline_series.str.split().str.len()

# Print the same describe() summary for each derived column.
for title, column in (
    ("Headline Length Statistics:", 'headline_length'),
    ("Word Count Statistics:", 'word_count'),
):
    print(title)
    print(df[column].describe())

In [None]:
# Publisher analysis: number of articles per publisher, most prolific first.
publisher_counts = df['publisher'].value_counts()
top_publishers = publisher_counts.head(15)

print(f"total unique publisher: {len(publisher_counts)}")
print("\nTop 15 Publishers by Article Count:")
for rank, (name, n_articles) in enumerate(top_publishers.items(), start=1):
    print(f"{rank:2d}. {name}: {n_articles:>4} articles")



In [None]:
# Publication date trend analysis: derive calendar features from the timestamp.
df['datetime'] = pd.to_datetime(df['date'])
published = df['datetime']
df['date_only'] = published.dt.date
df['day_of_week'] = published.dt.day_name()
df['hour'] = published.dt.hour
df['month'] = published.dt.month
df['week'] = published.dt.isocalendar().week

# Report the covered period; the span is the difference between the extremes.
print("Date Range Analysis:")
earliest = published.min()
latest = published.max()
print(f"Earliest publication: {earliest}")
print(f"Latest publication: {latest}")
print(f"Total time span: {latest - earliest}")

In [None]:
# Articles published per calendar day, ordered chronologically.
daily_counts = df['date_only'].value_counts().sort_index()

# Mean and spread are computed once and reused for the summary and the threshold.
mean_daily = daily_counts.mean()
std_daily = daily_counts.std()

print(f"Total days with publications: {len(daily_counts)}")
print(f"Average articles per day: {mean_daily:.1f}")
print(f"Busiest day: {daily_counts.idxmax()} with {daily_counts.max()} articles")
print(f"Quietest day: {daily_counts.idxmin()} with {daily_counts.min()} articles")

# A day is flagged as a spike when its count exceeds mean + one standard deviation.
spike_threshold = mean_daily + std_daily
is_spike = daily_counts > spike_threshold
spike_days = daily_counts[is_spike]

print(f"\n Publication Spikes (>{spike_threshold:.1f} articles):")
print(f"Found {len(spike_days)} days with unusually high publication volume")
# Only the first ten spike days (in date order) are listed.
for spike_date, n_articles in spike_days.head(10).items():
    print(f"  {spike_date}: {n_articles} articles")

In [None]:
# Event-driven publication analysis: for the first few spike days, show the
# most frequent headline words and a couple of sample headlines.
# Counter is collections.Counter, imported with the other imports at the top of
# the notebook; typing.Counter is a deprecated annotation alias, not the container.
print("EVENT-DRIVEN PUBLICATION ANALYSIS")


print("Investigating spike days for potential market events...")

for date, count in spike_days.head(5).items():
    day_articles = df[df['date_only'] == date]
    print(f"\n {date} - {count} articles (Spike Day):")

    # Lower-case and split all of the day's headlines on whitespace.
    day_headlines = ' '.join(day_articles['headline'].astype(str))
    words = day_headlines.lower().split()
    # Drop short tokens *before* ranking: ranking first and filtering afterwards
    # lets short filler words use up the top-8 slots and leaves few keywords.
    common_words = Counter(word for word in words if len(word) > 3).most_common(8)

    print(f"   Top keywords: {[word for word, freq in common_words]}")
    print("   Sample headlines:")
    for headline in day_articles['headline'].head(2):
        print(f"     - {headline}")