# Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Loading the Data & Basic Structure

In [None]:
# Read the analyst-ratings CSV from a relative path and report the
# resulting (rows, columns) tuple.
DATA_PATH = "../data/raw_analyst_ratings.csv"
raw_analyst_ratings = pd.read_csv(DATA_PATH)
print(raw_analyst_ratings.shape)

In [None]:
# Preview the top of the table; the row count is spelled out to match the label.
print("The first 5 rows")
raw_analyst_ratings.head(n=5)

In [None]:
# Preview the bottom of the table; the row count is spelled out to match the label.
print("The last 5 rows")
raw_analyst_ratings.tail(n=5)

In [None]:
# Draw five rows at random (no seed is set, so the selection differs per run).
print("Random 5 rows")
raw_analyst_ratings.sample(n=5)

In [None]:
# DataFrame.info() prints the index range, every column with its non-null
# count and dtype, and the memory footprint -- not just the shape, so the
# label says what is actually shown.
print("Dataset info (columns, non-null counts, dtypes):")
raw_analyst_ratings.info()

In [None]:
# Count the missing (NaN/None) entries in each column.
missing_per_column = raw_analyst_ratings.isna().sum()
print("Check for missing values:")
print(missing_per_column)

In [None]:
# Summarise only the object-dtype columns (count / unique / top / freq).
categorical_summary = raw_analyst_ratings.describe(include=['object'])
print("Categorical Column Summary:")
print(categorical_summary)

In [None]:
# Default describe(): summary statistics for the numeric columns.
numeric_summary = raw_analyst_ratings.describe()
print("Numeric Column Summary:")
print(numeric_summary)

# EDA  

## 1. Descriptive Statistics 

### A. Headline Length

In [None]:
# Character length of every headline.
# astype(str) turns a missing headline into the literal string "nan", which
# would be counted as a 3-character headline. Mask those rows back to NaN so
# they are left out of the summary statistics instead of skewing them.
# When no headline is missing the values are identical to a plain len().
headline_text = raw_analyst_ratings['headline']
raw_analyst_ratings['headline_length'] = (
    headline_text.astype(str).str.len().where(headline_text.notna())
)

print("Headline Length Summary:")
print(raw_analyst_ratings['headline_length'].describe())

### B. Publisher Count

In [None]:
# Articles per publisher, most prolific first (value_counts sorts descending).
publisher_counts = raw_analyst_ratings['publisher'].value_counts()

# Keep the ten largest entries for display and for the bar chart that follows.
top10 = publisher_counts.iloc[:10]
print("Top 10 Publishers by Article Count:")
top10

In [None]:
# Horizontal bar chart of the ten most active publishers.
# The counts are passed as x and the publisher names as y, so the x axis is
# the article count and the y axis is the publisher (the labels were
# previously swapped). Wording matches the top-15 chart later in the notebook.
plt.figure(figsize=(10,4))
sns.barplot(x=top10.values, y=top10.index)
plt.title("Top 10 Most Active Publishers")
plt.xlabel("Number of Articles")
plt.ylabel("Publisher")
plt.show()

### C. Publication Date

In [None]:
# Convert the raw 'date' column to datetimes; errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising.
# NOTE(review): if the column mixes timestamp formats or UTC offsets, a large
# share of rows may be coerced to NaT (or the result may not be a datetime
# dtype) -- confirm the count printed below is small before relying on .dt.
parsed_dates = pd.to_datetime(raw_analyst_ratings['date'], errors='coerce')
raw_analyst_ratings['date'] = parsed_dates

# This count includes both originally-missing and unparseable values.
n_missing_dates = parsed_dates.isna().sum()
print("Missing Dates After Conversion:")
print(n_missing_dates)

## 2. Text Analysis

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# membership tests during keyword filtering are O(1).
nltk.download('stopwords')

english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_LETTER_OR_SPACE = re.compile(r'[^a-z\s]')


def clean_text(t):
    """Return *t* lowercased with everything but a-z and whitespace removed.

    Digits, punctuation and non-ASCII letters are deleted (not replaced by a
    space), so tokens joined by punctuation are merged, e.g. "S&P" -> "sp".
    Whitespace is kept exactly as it appears in the input.
    """
    return _NON_LETTER_OR_SPACE.sub('', t.lower())

# Normalised headline text used by the keyword and phrase counts below.
# Missing headlines are replaced by "" first: astype(str) alone would turn
# them into the literal string "nan", which would then be counted as a word.
# Rows that already hold text are unaffected.
raw_analyst_ratings["clean_headline"] = (
    raw_analyst_ratings["headline"].fillna("").astype(str).apply(clean_text)
)

### 1. Keyword Frequency

In [None]:
# Tokenise every cleaned headline on whitespace (equivalent to joining all
# headlines with spaces and splitting once, without building one giant string).
all_words = [
    token
    for headline in raw_analyst_ratings["clean_headline"]
    for token in headline.split()
]

# Drop NLTK English stop words, then keep the 20 most frequent tokens.
filtered_words = [token for token in all_words if token not in stop_words]
word_counts = pd.Series(filtered_words).value_counts().head(20)

word_counts

In [None]:
# Horizontal bars: keyword on the y axis, occurrence count on the x axis.
fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(x=word_counts.values, y=word_counts.index, ax=ax)
ax.set_title("Top 20 Keywords in Headlines")
ax.set_xlabel("Count")
ax.set_ylabel("Word")
plt.show()


### 2. Common Phrases

In [None]:
# Count two-word phrases (bigrams) across the cleaned headlines, using
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english')
X2 = vectorizer.fit_transform(raw_analyst_ratings["clean_headline"])

# Column sums of the document-term matrix = total occurrences per bigram.
bigram_totals = np.asarray(X2.sum(axis=0)).ravel()
bigram_labels = vectorizer.get_feature_names_out()
phrases = pd.Series(bigram_totals, index=bigram_labels)

phrases.sort_values(ascending=False).head(20)


In [None]:
# The 20 most frequent bigrams, largest first.
phrases_desc = phrases.sort_values(ascending=False)
top_phrases = phrases_desc.head(20)

# Horizontal bars: phrase on the y axis, occurrence count on the x axis.
fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(x=top_phrases.values, y=top_phrases.index, ax=ax)
ax.set_title("Top 20 Phrases")
ax.set_xlabel("Count")
ax.set_ylabel("Phrases")
plt.show()


## 3. Time Series Analysis

### B. Extract Useful Time Features

In [None]:
# Break the parsed publication timestamp into calendar components.
# Rows whose date is NaT yield missing values in every derived column.
publish_dt = raw_analyst_ratings["date"].dt

raw_analyst_ratings["year"] = publish_dt.year
raw_analyst_ratings["month"] = publish_dt.month
raw_analyst_ratings["day"] = publish_dt.day
raw_analyst_ratings["weekday"] = publish_dt.day_name()  # e.g. "Monday"
raw_analyst_ratings["hour"] = publish_dt.hour


### C. Frequency per Day

In [None]:
# Number of articles per calendar day (the time-of-day part is dropped).
publish_day = raw_analyst_ratings["date"].dt.date
daily_counts = raw_analyst_ratings.groupby(publish_day).size()

# Line chart of the daily article volume.
fig, ax = plt.subplots(figsize=(10,4))
ax.plot(daily_counts.index, daily_counts.values)
ax.set_title("Articles Published Per Day")
ax.set_xlabel("Date")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


### D. Frequency per Month

In [None]:
# Number of articles per calendar month (year-month periods).
publish_month = raw_analyst_ratings["date"].dt.to_period("M")
monthly_counts = raw_analyst_ratings.groupby([publish_month]).size()

# One bar per month, in chronological order.
fig, ax = plt.subplots(figsize=(10,4))
monthly_counts.plot(kind="bar", ax=ax)
ax.set_title("Monthly Article Frequency")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Articles")
plt.show()


### E. Frequency by Day of the Week

In [None]:
# Fix the bar order to the calendar week instead of order of appearance.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
]

fig, ax = plt.subplots(figsize=(7,4))
sns.countplot(data=raw_analyst_ratings, x="weekday", order=weekday_order, ax=ax)
ax.set_title("News Count by Day of Week")
ax.set_xlabel("Day")
ax.set_ylabel("Articles")
plt.xticks(rotation=45)  # tilt the day names so they do not overlap
plt.show()


### F. Publication Time of Day

In [None]:
# Distribution of articles over the hour of day they were published.
# 'hour' holds integer hours, so use discrete=True: seaborn then draws one
# unit-wide bin centred on each integer. The previous bins=24 split the
# *observed* min..max range into 24 equal parts, which does not line up with
# whole hours (and leaves gaps when fewer than 24 distinct hours occur).
plt.figure(figsize=(8,4))
sns.histplot(raw_analyst_ratings["hour"], discrete=True, kde=False)
plt.title("Article Publication Time (Hour of Day)")
# The en dash in this label was previously mis-encoded ("â€“").
plt.xlabel("Hour (0–23)")
plt.ylabel("Frequency")
plt.show()


### G. Spikes

In [None]:
# Flag days whose article count exceeds the mean by more than two standard
# deviations of the daily counts.
mean_daily = daily_counts.mean()
std_daily = daily_counts.std()
threshold = mean_daily + 2 * std_daily

spikes = daily_counts.loc[daily_counts.gt(threshold)]

spikes


## 4. Publisher Analysis 

### A. Active Publisher

In [None]:
# publisher_counts is already sorted descending, so the first 15 entries are
# the 15 most active publishers.
top15 = publisher_counts.iloc[:15]
print("Top 15 Publishers by Article Count:")
top15

In [None]:
# Horizontal bars: publisher on the y axis, article count on the x axis.
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x=top15.values, y=top15.index, ax=ax)
ax.set_title("Top 15 Most Active Publishers")
ax.set_xlabel("Number of Articles")
ax.set_ylabel("Publisher")
plt.show()


### B. Distribution of Publisher

In [None]:
# Number of distinct publisher values (nunique() ignores missing entries).
n_unique_publishers = raw_analyst_ratings["publisher"].nunique()
print("Unique publishers:", n_unique_publishers)

### C. Publisher with Email  

In [None]:
# Mark publishers whose name contains an "@"; na=False makes a missing
# publisher count as "not an e-mail" instead of propagating NaN.
has_at_sign = raw_analyst_ratings["publisher"].str.contains("@", na=False)
raw_analyst_ratings["is_email"] = has_at_sign

# How many rows fall on each side of the flag.
raw_analyst_ratings["is_email"].value_counts()


In [None]:
# extracting domain for publisher with email
raw_analyst_ratings["domain"] = raw_analyst_ratings["publisher"].str.extract(r'@(.+)$')
domain_counts = raw_analyst_ratings["domain"].value_counts().head(10)
domain_counts


In [None]:
# Horizontal bars: e-mail domain on the y axis, article count on the x axis.
fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(x=domain_counts.values, y=domain_counts.index, ax=ax)
ax.set_title("Top Email Domains")
ax.set_xlabel("Articles")
ax.set_ylabel("Domain")
plt.show()
