In [2]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer



# Load Dataset

In [None]:
# A Google Drive ".../file/d/<id>/view" link serves the HTML preview page,
# not the file contents, so read_csv cannot parse it. Point pandas at the
# direct-download endpoint for the same file id instead.
# NOTE(review): for very large files Drive may interpose a virus-scan
# confirmation page; if that happens, download the CSV locally and read it from disk.
GDRIVE_FILE_ID = '1uPEti5_As9FrwGUd_hK46s0sAxET9-75'
DATA_URL = f'https://drive.google.com/uc?export=download&id={GDRIVE_FILE_ID}'
df = pd.read_csv(DATA_URL)

## Ensure the dataset is clean

In [None]:
# Discard rows missing any field the analysis relies on, then parse the
# date column into datetimes so the .dt accessor works in later cells.
required_columns = ['headline', 'publisher', 'date']
df = (
    df
    .dropna(subset=required_columns)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

## Basic Statistics for Textual Lengths

In [None]:
# Number of characters in every headline, kept as a new column
headline_lengths = df['headline'].apply(len).rename('headline_length')
df['headline_length'] = headline_lengths

# describe() summarises the distribution of those lengths
headline_stats = df['headline_length'].describe()
print("Headline Length Statistics:\n", headline_stats)

## Count Articles per Publisher

In [None]:
# Tally articles per publisher; value_counts() orders the most frequent first
publisher_counts = df['publisher'].value_counts()

# Show only the ten largest contributors
ten_largest = publisher_counts.iloc[:10]
print("Top Publishers:\n", ten_largest)

# Analyze Publication Dates

In [None]:
# Bucket rows by calendar date (time of day dropped) and count each bucket
articles_per_day = df.groupby(df['date'].dt.date).size()

# Draw the daily counts as a line on an explicitly created 12x6 axes
fig, ax = plt.subplots(figsize=(12, 6))
articles_per_day.plot(kind='line', ax=ax, title='Articles Published Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
ax.grid()
plt.show()

# Text Analysis - Sentiment Analysis

In [None]:
# VADER reads its lexicon from NLTK's data directory; on a fresh environment
# the resource is absent and the constructor raises LookupError, so fetch it
# once if it cannot be found.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)

sia = SentimentIntensityAnalyzer()

# The 'compound' entry of polarity_scores() is used as the headline's overall score
df['sentiment_score'] = df['headline'].apply(lambda text: sia.polarity_scores(text)['compound'])

# Label by the sign of the score: > 0 positive, < 0 negative, otherwise neutral
df['sentiment'] = np.select(
    [df['sentiment_score'] > 0, df['sentiment_score'] < 0],
    ['positive', 'negative'],
    default='neutral',
)

sentiment_counts = df['sentiment'].value_counts()
print("Sentiment Distribution:\n", sentiment_counts)

# Text Analysis - Keyword Extraction

In [None]:
# Build a bag-of-words over the headlines, keeping only the 20 most frequent
# terms after English stop words are removed.
vectorizer = CountVectorizer(max_features=20, stop_words='english')
X = vectorizer.fit_transform(df['headline'])
keywords = vectorizer.get_feature_names_out()

# get_feature_names_out() lists terms alphabetically, not by frequency, so
# sum each term's column of the document-term matrix to rank the keywords.
keyword_counts = pd.Series(
    np.asarray(X.sum(axis=0)).ravel(),
    index=keywords,
).sort_values(ascending=False)
print("Top Keywords:\n", keyword_counts)

# Time Series Analysis - Publication Frequency Over Time

In [None]:
# Count articles per calendar date and draw one bar per date.
# NOTE(review): with a long date range this produces one tick label per day — confirm readability.
daily_counts = df.groupby(df['date'].dt.date).size()

fig, ax = plt.subplots(figsize=(12, 6))
daily_counts.plot(kind='bar', ax=ax, title='Publication Frequency Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
plt.show()

# Time Series Analysis - Publishing Times

In [None]:
# Extract the hour component of each publication timestamp
df['hour'] = df['date'].dt.hour

# Count articles per hour and show the distribution as bars on a 10x5 axes
hourly_counts = df.groupby('hour').size()
fig, ax = plt.subplots(figsize=(10, 5))
hourly_counts.plot(kind='bar', ax=ax, title='Articles Published by Hour')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Articles')
plt.show()

# Publisher Analysis - Contribution

In [None]:
# Article counts per publisher, most prolific first; keep the ten largest
publisher_totals = df['publisher'].value_counts()
top_publishers = publisher_totals.head(10)
print("Top Publishers:\n", top_publishers)

# Raw counts alone do not show contribution, so also express each top
# publisher's count as a percentage of all articles in the dataset.
top_publisher_share = (top_publishers / publisher_totals.sum() * 100).round(2)
print("Share of All Articles (%):\n", top_publisher_share)