In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from datetime import datetime

: 

In [None]:
# Load the financial news dataset
# The relative path is resolved against the notebook's working directory.
RATINGS_CSV = 'raw_analyst_ratings.csv'
df = pd.read_csv(RATINGS_CSV)

: 

In [None]:
# 1. Descriptive Statistics

# Headline Length
# Character count per headline, stored as a new column on df.
df['headline_length'] = df['headline'].str.len()
# "\n" starts the describe() table on its own line below the label.
print(f"Headline Length Statistics:\n{df['headline_length'].describe()}")

: 

In [None]:
# Plot histogram of headline lengths
# Histogram with a KDE curve overlaid, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['headline_length'], kde=True, ax=ax)
ax.set_title('Distribution of Headline Lengths')
ax.set_xlabel('Headline Length')
ax.set_ylabel('Frequency')
plt.show()

: 

In [None]:
# Publisher Activity
# Number of rows per publisher; value_counts() sorts most frequent first.
publisher_counts = df['publisher'].value_counts()
# "\n" starts the ten-row table on its own line below the label.
print(f"Top 10 Most Active Publishers:\n{publisher_counts.head(10)}")

: 

In [None]:
# Plot bar chart of publisher activity
# One bar per publisher in publisher_counts, in the order value_counts() produced.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=publisher_counts.index, y=publisher_counts.values, ax=ax)
ax.set_title('Publisher Article Counts')
ax.set_xlabel('Publisher')
ax.set_ylabel('Article Count')
# Tilt the publisher names so neighbouring labels overlap less.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

: 

In [None]:
# Publication Date Trends
# Convert the column to datetimes in place, then count rows per distinct
# timestamp value and order the counts chronologically.
df['publication_date'] = df['publication_date'].pipe(pd.to_datetime)
publication_dates = (
    df['publication_date']
    .value_counts()
    .sort_index()
)

: 

In [None]:
# Plot line graph of publication dates
# x = distinct publication timestamps, y = number of rows at each one.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(publication_dates.index, publication_dates.values)
ax.set_title('Publication Frequency over Time')
ax.set_xlabel('Publication Date')
ax.set_ylabel('Article Count')
plt.show()

: 

In [None]:
# 2. Text Analysis

# Sentiment Analysis
def _headline_polarity(text):
    """Return TextBlob's polarity score for a single headline."""
    return TextBlob(text).sentiment.polarity


# One polarity score per headline, stored as a new column on df.
df['sentiment'] = df['headline'].apply(_headline_polarity)

: 

In [None]:
# Plot histogram of sentiment scores
# Histogram with a KDE curve overlaid, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['sentiment'], kde=True, ax=ax)
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
plt.show()

: 

In [None]:
# Topic Modeling
# Vectorise the headlines as TF-IDF over at most 1000 terms, with English
# stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['headline'])

# Fit a 5-component LDA model with a fixed random_state.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# Fetch the vocabulary once rather than rebuilding it for every printed word.
feature_names = tfidf_vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    # argsort()[:-10:-1] walks the weights from the largest downwards and stops
    # before the 10th-from-last position, so it selects 9 terms per topic.
    # NOTE(review): use [:-11:-1] if ten terms per topic were intended.
    top_terms = feature_names[topic.argsort()[:-10:-1]]
    print(f"Topic {i+1}: {' '.join(top_terms)}")


: 

In [None]:
# Keyword Extraction
# Build and fit a TF-IDF vectoriser on the headlines (English stop words
# removed, vocabulary capped at 1000 terms); rebinds both names below.
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['headline'])

: 

In [None]:
# Get top 20 keywords
# get_feature_names_out() lists terms in feature-column order, not by
# importance, so slicing it directly does not give the "top" keywords.
# Rank terms by their TF-IDF weight summed over all headlines instead.
feature_names = tfidf_vectorizer.get_feature_names_out()
# sum(axis=0) on a sparse matrix is 2-D; flatten to one weight per term.
term_weights = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
top_keywords = feature_names[np.argsort(term_weights)[::-1][:20]]
print(f"Top 20 Keywords:\n{top_keywords}")

: 

In [None]:
# 3. Time Series Analysis

# Publication Frequency over Time
# Make sure the column holds datetimes before using the .dt accessor.
df['publication_date'] = df['publication_date'].pipe(pd.to_datetime)
# Non-null headline count per calendar date.
daily_counts = df['headline'].groupby(df['publication_date'].dt.date).count()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_counts.index, daily_counts.values)
ax.set_title('Daily Article Publication Frequency')
ax.set_xlabel('Publication Date')
ax.set_ylabel('Article Count')
plt.show()

: 

In [None]:
# Publishing Time Analysis
# Hour component of each publication timestamp, stored as a new column.
df['publication_hour'] = df['publication_date'].dt.hour

# Rows per hour value, ordered by hour rather than by frequency.
hour_counts = (
    df['publication_hour']
    .value_counts()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(hour_counts.index, hour_counts.values)
ax.set_title('Article Publication Frequency by Hour')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Article Count')
plt.show()

: 

In [None]:
# 4. Publisher Analysis

# Publisher Contributions
# Rows per publisher, most frequent first.
publisher_counts = df['publisher'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=publisher_counts.index, y=publisher_counts.values, ax=ax)
ax.set_title('Publisher Article Counts')
ax.set_xlabel('Publisher')
ax.set_ylabel('Article Count')
# Tilt the publisher names so neighbouring labels overlap less.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

: 

In [None]:
# Publisher Domain Analysis
# Take the text after the first '@' and keep what precedes its first '.'.
# Publisher values without an '@' have no second piece and end up missing.
df['publisher_domain'] = (
    df['publisher']
    .str.split('@').str[1]
    .str.split('.').str[0]
)
# value_counts() drops the missing entries by default.
domain_counts = df['publisher_domain'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=domain_counts.index, y=domain_counts.values, ax=ax)
ax.set_title('Publisher Domain Counts')
ax.set_xlabel('Publisher Domain')
ax.set_ylabel('Article Count')
# Tilt the domain names so neighbouring labels overlap less.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


: 