In [3]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [None]:
# Load the Financial news dataset
news_data = pd.read_csv("../data/Raw_analyst_ratings.csv")

# View structure of the data.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line after the report).
news_data.info()
print(news_data.head())

# Discard rows with a missing 'headline' or 'stock' value
news_data = news_data.dropna(subset=['headline', 'stock'])

In [None]:
# Headline length: character count of every headline, then its summary statistics
news_data['headline_length'] = news_data['headline'].apply(len)
headline_length_summary = news_data['headline_length'].describe()
print(headline_length_summary)

# Publisher activity: how many rows each publisher value accounts for
publisher_counts = news_data['publisher'].value_counts()
print("Articles per Publisher:", publisher_counts, sep="\n")

# Parse the 'date' column into datetimes; unparseable values become NaT
# because of errors='coerce'.
# utc=True matters for robustness: when the raw strings carry different UTC
# offsets, pd.to_datetime without it can return an object column, and the .dt
# accessor used below would then raise AttributeError. With utc=True the
# result is always a single timezone-aware (UTC) datetime column.
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce', utc=True)

# Strip the (now uniform) UTC timezone so the column holds naive timestamps.
# NOTE: the remaining wall-clock values are therefore expressed in UTC.
news_data['date'] = news_data['date'].dt.tz_localize(None)

# Extract the publication date without time for easier analysis
news_data['publication_date'] = news_data['date'].dt.date

# Count the number of articles published each day, in chronological order
articles_per_day = news_data['publication_date'].value_counts().sort_index()

print("Articles per Day:")
print(articles_per_day)

# Line chart of the daily article counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
articles_per_day.plot(kind='line', color='b', ax=ax)
ax.set_title("Number of Articles Published Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
# Tilt the date labels so neighbouring ticks do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Perform sentiment analysis
def categorize_sentiment(text):
    """Label a piece of text by the sign of its TextBlob polarity.

    Args:
        text: The text to score; it is passed directly to ``TextBlob``.

    Returns:
        "Positive" when the polarity is above zero, "Negative" when it is
        below zero, and "Neutral" otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    # Guard clauses: handle the two signed cases first, fall through to neutral
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

# Label every headline with its sentiment category
news_data['sentiment_category'] = news_data['headline'].apply(categorize_sentiment)

# Display sentiment category counts
sentiment_counts = news_data['sentiment_category'].value_counts()
print(sentiment_counts)

# Plot sentiment distribution
plt.figure(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title("Distribution of Sentiment Categories")
# The x-axis holds the category labels produced by categorize_sentiment,
# not a numeric score, so it is labelled accordingly.
plt.xlabel("Sentiment Category")
plt.ylabel("Count")
plt.show()

In [1]:
# Topic Modeling (Common Keywords)
# Check the precondition first: without a 'headline' column there is nothing to vectorize.
if 'headline' not in news_data.columns:
    print("Error: The dataset does not contain a 'headline' column.")
else:
    # Keep a vocabulary of at most 10 terms, ignoring English stop words
    vectorizer = CountVectorizer(stop_words='english', max_features=10)
    X = vectorizer.fit_transform(news_data['headline'].dropna())
    print("\nTop Keywords:")
    print(vectorizer.get_feature_names_out())

NameError: name 'news_data' is not defined

In [None]:
# Top publishers: the ten most frequent publisher values as a bar chart
publisher_totals = news_data['publisher'].value_counts()
top_publishers = publisher_totals.head(10)
top_publishers.plot(kind='bar', title="Top 10 Publishers", figsize=(10, 5))
plt.show()

# Analyze domains if email addresses are used as publishers.
# Taking the last '@'-separated piece means a publisher without an '@'
# keeps its full name as its "domain".
publisher_parts = news_data['publisher'].str.split('@')
news_data['domain'] = publisher_parts.str[-1]
domain_counts = news_data['domain'].value_counts()
print(domain_counts)
