In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from textblob import TextBlob
from collections import Counter
import re
from nltk.corpus import stopwords

# Loading the data and basic information about the data

In [None]:
# Read the raw analyst-ratings CSV into `df`; the path is relative to the working directory.
df = pd.read_csv("../data/raw_analyst_ratings.csv")

# Shape of the data, overview, and missing values

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the frame (head() defaults to 5 rows).
df.head()

In [None]:
# Headline figures for the loaded frame: row count (thousands-separated) and column count.
n_records, n_columns = df.shape
print(" DATASET OVERVIEW")
print(f"Total records: {n_records:,}")
print(f"Total columns: {n_columns}")

In [None]:
# Show every column label as a plain Python list.
column_names = list(df.columns)
print("COLUMN NAMES")
print(column_names)

In [None]:
# Per-column profile: dtype, missing-value count and share, and number of distinct values.
print("DATA TYPES & MISSING VALUES")
missing_per_column = df.isna().sum()  # computed once, reused for the count and the percentage
info_df = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': missing_per_column,
    'Missing %': (missing_per_column / len(df) * 100).round(2),
    'Unique Values': df.nunique(),
})
display(info_df)

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
print("BASIC STATISTICS")
summary_stats = df.describe(include='all')
print(summary_stats)

# First five headlines, numbered from 1.
print("SAMPLE HEADLINES")
for position, headline in enumerate(df['headline'].head(5), start=1):
    print(f"{position}. {headline}")

In [None]:
print("DATA QUALITY CHECKS")

# Rows that are identical across every column.
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# A headline counts as empty when it is missing OR holds only whitespace;
# isna() alone would not catch blank strings.
headline_missing = df['headline'].isna()
headline_blank = df['headline'].astype(str).str.strip().eq('')
empty_headlines = int((headline_missing | headline_blank).sum())
print(f"Empty headlines: {empty_headlines}")

if 'date' in df.columns:
    # errors='coerce' turns unparseable values into NaT instead of raising,
    # so the NaT count after conversion is the number of unusable dates.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    invalid_dates = df['date'].isna().sum()
    print(f"Invalid dates: {invalid_dates}")

    # Report the range of the dates that did parse; a few invalid rows
    # should not hide the range of all the valid ones.
    valid_dates = df['date'].dropna()
    if not valid_dates.empty:
        date_range = valid_dates.agg(['min', 'max'])
        print(f"Date range: {date_range['min']} to {date_range['max']}")

Descriptive Statistics

In [None]:
# Two size measures per headline: character count and whitespace-separated word count.
headline_text = df['headline']
df['headline_length'] = headline_text.str.len()
df['word_count'] = headline_text.str.split().str.len()

# Print the describe() summary of each derived column under its own heading.
for heading, column in (
    ("Headline Length Statistics:", 'headline_length'),
    ("Word Count Statistics:", 'word_count'),
):
    print(heading)
    print(df[column].describe())

In [None]:
# Publisher analysis: number of articles attributed to each publisher.
publisher_counts = df['publisher'].value_counts()
print(f"total unique publisher: {len(publisher_counts)}")

# value_counts() sorts descending, so head(15) is the 15 most prolific publishers.
top_publishers = publisher_counts.head(15)
print("\nTop 15 Publishers by Article Count:")
for rank, (name, n_articles) in enumerate(top_publishers.items(), start=1):
    print(f"{rank:2d}. {name}: {n_articles:>4} articles")



In [None]:
# Publication date trend analysis: split the timestamp into calendar features.
df['datetime'] = pd.to_datetime(df['date'])
timestamp_parts = df['datetime'].dt  # one accessor reused for every derived column
df['date_only'] = timestamp_parts.date
df['day_of_week'] = timestamp_parts.day_name()
df['hour'] = timestamp_parts.hour
df['month'] = timestamp_parts.month
df['week'] = timestamp_parts.isocalendar().week

# The covered period; the span is the latest timestamp minus the earliest.
earliest_publication = df['datetime'].min()
latest_publication = df['datetime'].max()
print("Date Range Analysis:")
print(f"Earliest publication: {earliest_publication}")
print(f"Latest publication: {latest_publication}")
print(f"Total time span: {latest_publication - earliest_publication}")

In [None]:
# Headline length distribution (headline_length is a character count).
plt.hist(df['headline_length'], bins=30)
plt.title('Headline Length Distribution')
plt.xlabel('Headline Length (characters)')
plt.ylabel('Number of Headlines')
plt.show()

# Top publishers bar chart (first 10 entries of the descending publisher counts).
top_publishers.head(10).plot(kind='bar')
plt.title('Top 10 Publishers by Article Count')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles')
plt.tight_layout()  # keeps long publisher tick labels inside the figure
plt.show()

In [None]:
print(" DAILY PUBLICATION FREQUENCY")
# Articles per calendar day, ordered chronologically by the date index.
daily_counts = df['date_only'].value_counts().sort_index()
mean_daily = daily_counts.mean()
print(f"Total days with publications: {len(daily_counts)}")
print(f"Average articles per day: {mean_daily:.1f}")
print(f"Busiest day: {daily_counts.idxmax()} with {daily_counts.max()} articles")
print(f"Quietest day: {daily_counts.idxmin()} with {daily_counts.min()} articles")

# A "spike" is a day whose count is strictly above mean + one standard deviation.
std_daily = daily_counts.std()
spike_threshold = mean_daily + std_daily
spike_days = daily_counts.loc[daily_counts.gt(spike_threshold)]

print(f"\n Publication Spikes (>{spike_threshold:.1f} articles):")
print(f"Found {len(spike_days)} days with unusually high publication volume")
# Only the first ten spike days (in date order) are listed.
for spike_date, n_articles in spike_days.head(10).items():
    print(f"  {spike_date}: {n_articles} articles")
    

In [None]:
# Publication trends: daily volume over time (top panel) and totals per weekday (bottom panel).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

# Plot 1: Daily article count over time, with the mean and the spike threshold as reference lines
ax1.plot(daily_counts.index, daily_counts.values, color='blue', alpha=0.7, linewidth=1)
ax1.axhline(y=mean_daily, color='red', linestyle='--', label=f'Average: {mean_daily:.1f}')
ax1.axhline(y=spike_threshold, color='orange', linestyle='--', label=f'Spike Threshold: {spike_threshold:.1f}')

# Highlight spike days with one vectorised call instead of a separate plot() per day
ax1.plot(spike_days.index, spike_days.values, 'ro', markersize=4)

ax1.set_title('Daily Article Publication Frequency Over Time', fontsize=14, fontweight='bold')
ax1.set_ylabel('Number of Articles')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.tick_params(axis='x', rotation=45)

# Plot 2: Day of week analysis
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# fill_value=0 keeps a weekday with no articles as integer 0. Without it the
# reindex inserts NaN, the counts become floats, and labels read "123.0"/"nan".
articles_by_weekday = df['day_of_week'].value_counts().reindex(day_order, fill_value=0)

ax2.bar(articles_by_weekday.index, articles_by_weekday.values, color='green', alpha=0.7)
ax2.set_title('Article Publication by Day of Week', fontsize=14, fontweight='bold')
ax2.set_ylabel('Number of Articles')
ax2.grid(True, alpha=0.3)

# Add value labels slightly above each bar
for i, v in enumerate(articles_by_weekday.values):
    ax2.text(i, v + 5, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

print(f"Insight: Busiest weekday: {articles_by_weekday.idxmax()} with {articles_by_weekday.max()} articles")
print(f"Insight: {len(spike_days)} days had unusually high publication volume")

In [None]:
print("HOURLY & MONTHLY PATTERNS ")

# If any timestamp is NaT, the derived hour/month columns hold NaN and become
# float. Drop those rows and cast to int so labels read "10:00" and "Month 5"
# rather than "10.0:00" and "Month 5.0".

# Hourly distribution
hourly_counts = df['hour'].dropna().astype(int).value_counts().sort_index()
peak_hour = hourly_counts.idxmax()

# Monthly distribution
monthly_counts = df['month'].dropna().astype(int).value_counts().sort_index()
peak_month = monthly_counts.idxmax()

print(f"Peak publication hour: {peak_hour}:00 with {hourly_counts.max()} articles")
print(f"Peak publication month: Month {peak_month} with {monthly_counts.max()} articles")

# Side-by-side panels: hour of day (left) and month (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Hourly distribution
ax1.bar(hourly_counts.index, hourly_counts.values, color='purple', alpha=0.7)
ax1.set_xlabel('Hour of Day (24h)')
ax1.set_ylabel('Number of Articles')
ax1.set_title('Article Publication by Hour of Day')
ax1.grid(True, alpha=0.3)

# Monthly distribution
ax2.bar(monthly_counts.index, monthly_counts.values, color='brown', alpha=0.7)
ax2.set_xlabel('Month')
ax2.set_ylabel('Number of Articles')
ax2.set_title('Article Publication by Month')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Uses the collections.Counter imported in the notebook's first cell; the
# typing.Counter alias that used to be re-imported here is a deprecated
# annotation helper and must not shadow it.
print("EVENT-DRIVEN PUBLICATION ANALYSIS")


print("Investigating spike days for potential market events...")

# Inspect the first five spike days.
for date, count in spike_days.head(5).items():
    day_articles = df[df['date_only'] == date]
    print(f"\n {date} - {count} articles (Spike Day):")

    # dropna() keeps missing headlines from being stringified into a "nan" token.
    day_headlines = ' '.join(day_articles['headline'].dropna().astype(str))
    # Drop short filler tokens BEFORE ranking. Filtering after most_common(8)
    # discards most of the eight slots, which short function words tend to fill.
    words = [word for word in day_headlines.lower().split() if len(word) > 3]
    common_words = Counter(words).most_common(8)

    print(f"   Top keywords: {[word for word, freq in common_words]}")
    print(f"   Sample headlines:")
    for headline in day_articles['headline'].head(2):
        print(f"     - {headline}")

Text Analysis (Topic Modelling)

In [None]:
# Download the NLTK resources this notebook requests; quiet=True suppresses progress output.
# NOTE(review): only the 'stopwords' corpus is visibly used in this notebook — confirm 'punkt' is still needed.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

In [None]:

# Stop-word set: NLTK's English list extended with a custom set of finance-specific terms.
financial_stopwords = {'said', 'inc', 'corp', 'ltd', 'co', 'stock', 'stocks'}
stop_words = set(stopwords.words('english')) | financial_stopwords

In [None]:
print("MOST COMMON WORDS ANALYSIS")

# Combine all headlines into one text; dropna() keeps missing headlines from
# being stringified and counted as the word "nan".
all_headlines = ' '.join(df['headline'].dropna().astype(str))

# Clean and tokenize: alphabetic runs of 3+ letters, lower-cased, minus stop words
words = re.findall(r'\b[a-zA-Z]{3,}\b', all_headlines.lower())
filtered_words = [word for word in words if word not in stop_words]

# Get most common words
word_freq = Counter(filtered_words)
common_words = word_freq.most_common(20)

print("Top 20 Most Common Words:")
for i, (word, count) in enumerate(common_words, 1):
    print(f"{i:2d}. {word:15} : {count:>4} occurrences")

# zip(*[]) cannot be unpacked into two names, so only plot when there is data.
# Separate names keep the token list in `words` from being overwritten by plot labels.
if common_words:
    top_words, top_counts = zip(*common_words)
    plt.figure(figsize=(12, 6))
    plt.barh(top_words, top_counts, color='teal', alpha=0.7)
    plt.xlabel('Frequency')
    plt.title('Top 20 Most Common Words in Financial Headlines')
    plt.gca().invert_yaxis()  # most frequent word at the top
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print("No words left to plot after filtering.")


In [None]:
print("SPECIFIC KEYWORD ANALYSIS")


# Finance terms to count in the headlines; matching is case-insensitive.
target_keywords = [
    'FDA', 'approval', 'price target', 'earnings', 
    'stock', 'high', 'low', 'profit', 'loss',
    'revenue', 'growth', 'dividend', 'merge',
    'acquisition', 'buy', 'sell', 'upgrade', 'downgrade'
]

print("Keyword Frequency Analysis:")
keyword_results = {}

for keyword in target_keywords:
    # Anchor at a word start so 'low' no longer matches "follow"/"below" and
    # 'sell' no longer matches "Russell". Inflected forms such as "merger" or
    # "upgrades" still match. re.escape keeps the keyword literal in the regex.
    pattern = rf"\b{re.escape(keyword)}"
    matches = df['headline'].str.contains(pattern, case=False, na=False, regex=True)
    # Summing the boolean mask counts matching articles without building a filtered frame.
    count = int(matches.sum())
    keyword_results[keyword] = count
    print(f"'{keyword}': {count:>3} articles")


print(f"\nMost frequent specific keywords:")
sorted_keywords = sorted(keyword_results.items(), key=lambda x: x[1], reverse=True)
for keyword, count in sorted_keywords[:10]:
    if count > 0:
        print(f"   {keyword:15} : {count:>3} articles")

In [None]:
# Horizontal bar chart of the twelve most frequent target keywords.
top_keywords = dict(sorted_keywords[:12])
if top_keywords:  # skip the figure entirely when nothing was counted
    keyword_labels = list(top_keywords)
    keyword_totals = list(top_keywords.values())

    plt.figure(figsize=(12, 6))
    plt.barh(keyword_labels, keyword_totals, color='coral', alpha=0.7)
    plt.xlabel('Number of Articles')
    plt.title('Top Financial Keywords in News Headlines')
    plt.gca().invert_yaxis()

    # Write each bar's count just past its right-hand end.
    for row, total in enumerate(keyword_totals):
        plt.text(total + 0.5, row, str(total), va='center')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

Time Series Analysis

In [None]:
print("HOW PUBLICATION FREQUENCY VARIES OVER TIME ")

# Articles per calendar day, in date order.
daily_counts = df['date_only'].value_counts().sort_index()

# Spike rule: strictly more than one standard deviation above the daily mean.
mean_articles, std_articles = daily_counts.mean(), daily_counts.std()
spike_threshold = mean_articles + std_articles

print("Publication Frequency Analysis:")
print(f"Average articles per day: {mean_articles:.1f}")
print(f"Standard deviation: {std_articles:.1f}")
print(f"Spike threshold: {spike_threshold:.1f} articles (mean + 1 std)")

# Days whose count exceeds the threshold.
spike_days = daily_counts.loc[daily_counts.gt(spike_threshold)]
print(f"   • Found {len(spike_days)} spike days with unusually high volume")

print("\nDaily Article Range:")
print(f"Minimum: {daily_counts.min()} articles on {daily_counts.idxmin()}")
print(f"Maximum: {daily_counts.max()} articles on {daily_counts.idxmax()}")

In [None]:
# Daily publication volume over time, with reference lines and spike markers.
plt.figure(figsize=(14, 6))

# Daily series
plt.plot(daily_counts.index, daily_counts.values, 
         color='blue', alpha=0.7, linewidth=1.5, label='Daily Articles')

# Dashed horizontal guides: (level, colour, legend label)
reference_lines = (
    (mean_articles, 'red', f'Average: {mean_articles:.1f} articles'),
    (spike_threshold, 'orange', f'Spike Threshold: {spike_threshold:.1f}'),
)
for level, line_color, line_label in reference_lines:
    plt.axhline(y=level, color=line_color, linestyle='--', label=line_label)

# Red markers on spike days; zorder=5 draws them above the line
spike_dates = spike_days.index
spike_values = spike_days.values
plt.scatter(spike_dates, spike_values, color='red', s=50, zorder=5, 
            label=f'Spike Days ({len(spike_days)} days)')

# Titles, labels and layout
plt.title('How Publication Frequency Varies Over Time', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Number of Articles Published')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
print("ARE THERE SPIKES RELATED TO SPECIFIC MARKET EVENTS?")

print(f"\nAnalyzing {len(spike_days)} spike days for potential market events...")

# Event categories and the regex alternation that signals each one.
# Built once up front: the mapping is the same for every spike day.
market_keywords = {
    'earnings': 'earnings',
    'fda': 'fda|approval', 
    'merger': 'merge|acquisition|buyout',
    'stock move': 'stock|share|trading',
    'economic': 'economy|inflation|rate|fed',
    'company news': 'apple|google|amazon|microsoft|tesla'
}

for position, (spike_date, article_count) in enumerate(spike_days.head(8).items(), 1):
    # All articles published on this spike day
    spike_articles = df[df['date_only'] == spike_date]

    print(f"\n{position}.{spike_date} - {article_count} articles (SPIKE):")

    # One lower-cased text blob of the day's headlines
    all_headlines = ' '.join(spike_articles['headline'].astype(str)).lower()

    # Record each event category that has at least one regex hit
    found_events = []
    for event_type, pattern in market_keywords.items():
        hits = re.findall(pattern, all_headlines)
        if not hits:
            continue
        found_events.append(f"{event_type} ({len(hits)} mentions)")

    # Five most frequent 4+ letter words of the day, stop words excluded
    words = re.findall(r'\b[a-zA-Z]{4,}\b', all_headlines)
    filtered_words = [token for token in words if token not in stop_words]
    common_spike_words = Counter(filtered_words).most_common(5)

    event_summary = ', '.join(found_events) or 'General market news'
    print(f"Detected events: {event_summary}")
    print(f"Top keywords: {[token for token, _ in common_spike_words]}")
    print(" Sample headlines:")
    for headline in spike_articles['headline'].head(2):
        print(f"{headline}")

In [None]:
print("PUBLISHING TIME ANALYSIS")

# Extract hour from datetime
df['publish_hour'] = df['datetime'].dt.hour

# Count articles by hour. If any timestamp is NaT the hour column is float,
# so drop the missing values and cast to int; otherwise the labels below
# read "10.0:00 - 10.0:59".
hourly_counts = df['publish_hour'].dropna().astype(int).value_counts().sort_index()

print("Articles Published by Hour:")
for hour, count in hourly_counts.items():
    print(f"   {hour}:00 - {hour}:59: {count:>4} articles")

# Find peak publishing times
peak_hour = hourly_counts.idxmax()
peak_count = hourly_counts.max()

print(f"\nPeak Publishing Time: {peak_hour}:00 with {peak_count} articles")
# The share is computed against ALL rows in df, including rows without a usable timestamp.
print(f"   This is when {peak_count/len(df)*100:.1f}% of daily news is released")

In [None]:
# Bar chart of article counts per hour, with the peak hour marked.
peak_label = f'Peak: {peak_hour}:00 ({peak_count} articles)'

plt.figure(figsize=(12, 6))
plt.bar(hourly_counts.index, hourly_counts.values, color='blue', alpha=0.7)
plt.title('Publishing Time Distribution - When News is Released')
plt.xlabel('Hour of Day (24-hour format)')
plt.ylabel('Number of Articles Published')
plt.grid(True, alpha=0.3)

# Dashed vertical line at the busiest hour
plt.axvline(x=peak_hour, color='red', linestyle='--', alpha=0.8, label=peak_label)
plt.legend()

# One tick per hour, 0 through 23
plt.xticks(range(0, 24))
plt.tight_layout()
plt.show()

In [None]:
print("WHICH PUBLISHERS CONTRIBUTE MOST TO THE NEWS FEED?")

# Articles per publisher, sorted descending by value_counts()
publisher_counts = df['publisher'].value_counts()
total_articles = len(df)

print("Top 15 Publishers by Article Volume:")
top_15_publishers = publisher_counts.head(15)
for rank, (name, n_articles) in enumerate(top_15_publishers.items(), start=1):
    share = (n_articles / total_articles) * 100
    print(f"{rank:2d}. {name:30} : {n_articles:>4} articles ({share:.1f}%)")

# Concentration: combined share of the top 15, and of the single largest publisher
total_top_15 = top_15_publishers.sum()
leader_name = publisher_counts.index[0]
leader_share = publisher_counts.iloc[0] / total_articles * 100
print(f"\nTop 15 publishers account for {total_top_15/total_articles*100:.1f}% of all articles")
print(f"Top publisher '{leader_name}' contributes {leader_share:.1f}% of content")

In [None]:
# Horizontal bar chart of the ten publishers with the most articles.
top_10 = publisher_counts.head(10)
bar_positions = range(len(top_10))

plt.figure(figsize=(12, 6))
plt.barh(bar_positions, top_10.values, color='steelblue')
plt.yticks(bar_positions, top_10.index)
plt.xlabel('Number of Articles')
plt.title('Top 10 Publishers by Contribution to News Feed')
plt.gca().invert_yaxis()  # largest publisher on top

# Print each bar's value just to the right of the bar
for row, n_articles in enumerate(top_10.values):
    plt.text(n_articles + 3, row, str(n_articles), va='center')

plt.tight_layout()
plt.show()

In [None]:
print("IS THERE A DIFFERENCE IN THE TYPE OF NEWS THEY REPORT?")

# The five publishers with the most articles
top_5_publishers = publisher_counts.head(5).index

print(" Analyzing news type differences among top 5 publishers...")

# Financial topics and the regex alternation that marks each one.
# Built once: the mapping is identical for every publisher.
topics = {
    'Earnings': 'earnings|profit|revenue',
    'M&A': 'merge|acquisition|buyout',
    'Stock Moves': 'stock|share|trading',
    'Regulatory': 'fda|approval|regulation',
    'Price Targets': 'price target|upgrade|downgrade',
    'Economic': 'economy|inflation|rate'
}

for publisher in top_5_publishers:
    publisher_articles = df[df['publisher'] == publisher]

    print(f"\n{publisher}:")
    print(f"   Total articles: {len(publisher_articles)}")

    # One lower-cased text blob of all of this publisher's headlines
    all_headlines = ' '.join(publisher_articles['headline'].astype(str)).lower()

    # Number of regex hits per topic across that blob
    topic_counts = {
        topic: len(re.findall(pattern, all_headlines))
        for topic, pattern in topics.items()
    }

    # Three topics with the most hits (ties keep dictionary order)
    top_topics = sorted(topic_counts.items(), key=lambda item: item[1], reverse=True)[:3]
    focus_summary = ', '.join(f'{topic} ({count})' for topic, count in top_topics)
    print(f"   Top focus areas: {focus_summary}")

    # Two example headlines from this publisher
    sample_headlines = publisher_articles['headline'].head(2).tolist()
    print("   Sample headlines:")
    for headline in sample_headlines:
        print(f"{headline}")

In [None]:
# Compare focus areas across top publishers
print("\n CONTENT FOCUS COMPARISON ACROSS TOP PUBLISHERS ")

# Topics to compare, in the order they appear in the chart legend
comparison_topics = ['Earnings', 'M&A', 'Stock Moves', 'Regulatory', 'Price Targets']

# Regex alternation per topic, defined once instead of on every inner-loop pass.
topic_patterns = {
    'Earnings': 'earnings|profit|revenue',
    'M&A': 'merge|acquisition|buyout',
    'Stock Moves': 'stock|share|trading',
    'Regulatory': 'fda|approval|regulation',
    'Price Targets': 'price target|upgrade|downgrade'
}

topic_data = []
for publisher in top_5_publishers:
    publisher_articles = df[df['publisher'] == publisher]

    publisher_topics = {}
    for topic in comparison_topics:
        # Share of this publisher's headlines that mention the topic at least
        # once. This is a true percentage (0-100), matching the y-axis label;
        # counting every regex hit could push the figure above 100.
        mentions = publisher_articles['headline'].str.contains(
            topic_patterns[topic], case=False, na=False, regex=True
        )
        publisher_topics[topic] = mentions.mean() * 100

    topic_data.append(publisher_topics)

# One row per publisher, one column per topic
comparison_df = pd.DataFrame(topic_data, index=top_5_publishers)

# Create the axes first and hand them to DataFrame.plot. A separate
# plt.figure() call followed by DataFrame.plot(figsize=...) leaves an empty
# extra figure in the output.
fig, ax = plt.subplots(figsize=(12, 6))
comparison_df.plot(kind='bar', width=0.8, ax=ax)
plt.title('News Focus Comparison Across Top Publishers')
plt.ylabel('Percentage of Articles Mentioning Topic (%)')
plt.xlabel('Publisher')
plt.xticks(rotation=45)
plt.legend(title='Topic Focus')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
print("EMAIL DOMAIN ANALYSIS FOR PUBLISHER NAMES")

# Rows whose publisher value contains an '@' are treated as e-mail addresses.
email_mask = df['publisher'].str.contains(r'@', na=False)
email_publishers = df[email_mask]
n_email_articles = len(email_publishers)

print(f"Found {n_email_articles} articles with email addresses as publisher names")
print(f"This represents {n_email_articles/len(df)*100:.1f}% of all articles")

if n_email_articles == 0:
    print("No email addresses found in publisher names - using regular organization names instead")
else:
    # Work on a copy so adding the 'domain' column does not touch a slice of df.
    email_publishers = email_publishers.copy()
    # Domain = the run of word characters, dots and hyphens after the '@'.
    email_publishers['domain'] = email_publishers['publisher'].str.extract(r'@([\w\.-]+)')

    # Articles per domain, most frequent first
    domain_counts = email_publishers['domain'].value_counts()

    print("\nArticles by Email Domain:")
    for domain, n_articles in domain_counts.head(10).items():
        share = (n_articles / n_email_articles) * 100
        print(f"   {domain:25} : {n_articles:>3} articles ({share:.1f}%)")

    # Chart and summary only when at least one domain was extracted
    if len(domain_counts) > 0:
        top_domains = domain_counts.head(8)
        bar_positions = range(len(top_domains))

        plt.figure(figsize=(10, 6))
        plt.barh(bar_positions, top_domains.values, color='purple', alpha=0.7)
        plt.yticks(bar_positions, top_domains.index)
        plt.xlabel('Number of Articles')
        plt.title('Top Email Domains in Publisher Names')
        plt.gca().invert_yaxis()

        # Value label to the right of each bar
        for row, n_articles in enumerate(top_domains.values):
            plt.text(n_articles + 0.5, row, str(n_articles), va='center')

        plt.tight_layout()
        plt.show()

        top3_share = domain_counts.head(3).sum() / n_email_articles * 100
        print(f"\nTop domain '{domain_counts.index[0]}' contributes {domain_counts.iloc[0]} articles")
        print(f" Top 3 domains account for {top3_share:.1f}% of email-published articles")

In [None]:

print("ORGANIZATION ANALYSIS - UNIQUE PUBLISHER DOMAINS")

# Count articles by publisher (this works regardless of email situation)
publisher_counts = df['publisher'].value_counts()

# `email_publishers` comes from the e-mail domain cell above; an empty frame
# means no publisher value contained an '@'.
if len(email_publishers) == 0:
    print("Analyzing regular publisher names as organizations...")
    
    print(f"\nTop Organizations by Article Volume:")
    top_orgs = publisher_counts.head(10)
    for i, (org, count) in enumerate(top_orgs.items(), 1):
        percentage = (count / len(df)) * 100
        print(f"{i:2d}. {org:30} : {count:>4} articles ({percentage:.1f}%)")
    
    # Calculate concentration
    top_5_orgs = publisher_counts.head(5)
    top_10_orgs = publisher_counts.head(10)
    # Clamp at zero: with fewer than ten publishers the raw difference is negative.
    remaining_orgs = max(len(publisher_counts) - 10, 0)
    
    print(f"\nOrganization Concentration:")
    print(f"   • Top 5 organizations: {top_5_orgs.sum()/len(df)*100:.1f}% of articles")
    print(f"   • Top 10 organizations: {top_10_orgs.sum()/len(df)*100:.1f}% of articles")
    print(f"   • Remaining {remaining_orgs} organizations: {publisher_counts.iloc[10:].sum()/len(df)*100:.1f}% of articles")

else:
    print("Email addresses found - showing combined analysis:")
    # "Regular" means non-email. Exclude '@' publishers before picking the top
    # one; otherwise the overall leader (possibly an e-mail address) is reported.
    is_email = df['publisher'].str.contains('@', na=False)
    regular_counts = df.loc[~is_email, 'publisher'].value_counts()
    if regular_counts.empty:
        print("No regular (non-email) publishers found")
    else:
        print(f"Top regular publisher: '{regular_counts.index[0]}' with {regular_counts.iloc[0]} articles")

print(f"\nInsight: News feed is dominated by a small number of organizations")
print(f"   The top publisher accounts for {publisher_counts.iloc[0]/len(df)*100:.1f}% of all content")
