In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px

# IPython magic: render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Let pandas show up to 50 columns when displaying wide frames.
pd.set_option('display.max_columns', 50)

In [None]:
# Load the cleaned video dataset and derive the per-video metrics used by the
# cells below: engagement_rate, upload_hour, days_since_upload, views_per_day.
df = pd.read_csv('../data/cleaned_data.csv')

# Parse as timezone-aware UTC. Without utc=True, timestamps that carry an
# offset (e.g. "...Z" or "+00:00") become tz-aware and subtracting them from a
# tz-naive "now" raises TypeError. Naive timestamps are interpreted as UTC.
df['publishedAt'] = pd.to_datetime(df['publishedAt'], utc=True)

# (likes + comments) per view; zero-view rows divide by 1 instead of 0 so the
# ratio stays finite.
df['engagement_rate'] = (df['likes'] + df['comments']) / df['views'].replace(0, 1)

# Hour of day of the upload, in UTC (publishedAt is UTC after the parse above).
df['upload_hour'] = df['publishedAt'].dt.hour

# Whole days elapsed since upload, floored at 1: same-day uploads would divide
# by zero, and future-dated rows would otherwise yield negative views_per_day.
# NOTE: depends on the wall clock, so values shift between runs.
now_utc = pd.Timestamp.now(tz='UTC')
df['days_since_upload'] = (now_utc - df['publishedAt']).dt.days.clip(lower=1)
df['views_per_day'] = df['views'] / df['days_since_upload']

In [None]:
# Quick structural overview of the loaded frame.
# info() prints its report directly, so it needs no wrapper.
df.info()

# A notebook only auto-renders the LAST expression of a cell; head() and
# describe() in the middle of a cell are evaluated and silently discarded.
# Wrap them in display() so both tables actually appear.
display(df.head())
display(df.describe())

# Per-column count of null entries.
missing = df.isnull().sum()
print("Missing values:\n", missing)

In [None]:
# Leaderboards: the ten highest-ranked videos under each raw metric,
# all shown with the same set of columns.
display_cols = ['title', 'views', 'likes', 'comments', 'engagement_rate']
leaderboards = (
    ("Top 10 Most Viewed Videos:", "views"),
    ("Top 10 Most Liked Videos:", "likes"),
    ("Top 10 Most Commented Videos:", "comments"),
)
for heading, metric in leaderboards:
    print(heading)
    ranked = df.sort_values(metric, ascending=False)
    display(ranked[display_cols].head(10))

In [None]:
# Histogram (20 bins) of engagement_rate with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(data=df, x='engagement_rate', bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Engagement Rate')
ax.set_xlabel('Engagement Rate')
ax.set_ylabel('Video Count')
plt.show()

In [None]:
# Scatter of views against duration; colour encodes engagement_rate and
# marker size encodes likes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='duration_sec',
    y='views',
    hue='engagement_rate',
    size='likes',
    sizes=(20, 100),
    palette='viridis',
    ax=ax,
)
ax.set_title("Views vs Duration vs Engagement Rate")
ax.set_xlabel("Duration (seconds)")
ax.set_ylabel("Views")
ax.legend()
plt.show()

In [None]:
# Number of videos per upload_hour value.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='upload_hour', ax=ax)
ax.set_title("Upload Frequency by Hour")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Videos Uploaded")
plt.show()

In [None]:
# Ten videos with the highest views_per_day, shown with their publish date.
trending_cols = ['title', 'views', 'views_per_day', 'publishedAt']
trending = df.sort_values('views_per_day', ascending=False)
print("Top Trending Videos (Views per Day):")
display(trending[trending_cols].head(10))

In [None]:
# Histogram (20 bins) of avg_sentiment, with missing scores dropped.
sentiment_scores = df['avg_sentiment'].dropna()
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(x=sentiment_scores, bins=20, color='salmon', ax=ax)
ax.set_title("Distribution of Average Comment Sentiment")
ax.set_xlabel("Average Sentiment (polarity)")
ax.set_ylabel("Video Count")
plt.show()

# Five videos with the highest avg_sentiment.
by_sentiment = df.sort_values('avg_sentiment', ascending=False)
print("Most Positive Videos:")
display(by_sentiment[['title', 'avg_sentiment', 'views']].head(5))

In [None]:
# Bar chart of the 15 most frequent values in the top_keyword column.
keyword_counts = df['top_keyword'].value_counts().head(15)
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    x=keyword_counts.index,
    y=keyword_counts.values,
    palette='Blues_d',
    ax=ax,
)
ax.set_title("Top Keywords/Topics")
ax.set_xlabel("Keyword/Topic")
ax.set_ylabel("Count")
# Tilt the tick labels so longer keywords do not overlap.
plt.xticks(rotation=45)
plt.show()

# Word cloud built from every video title and description.
# Missing values are dropped BEFORE the string cast: astype(str) turns NaN into
# the literal token "nan", which would otherwise show up as a word in the cloud.
title_text = ' '.join(df['title'].dropna().astype(str))
description_text = ' '.join(df['description'].dropna().astype(str))
text = title_text + ' ' + description_text

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image has no meaningful axes
plt.title("Word Cloud of Video Titles & Descriptions")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the listed columns.
corr_cols = ['views', 'likes', 'comments', 'duration_sec', 'engagement_rate', 'avg_sentiment']
corr_matrix = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='crest', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Pairwise scatter matrix of the selected features plus views, with KDE plots
# on the diagonal. Rows with a missing value in any plotted column are dropped.
features = ['duration_sec', 'likes', 'comments', 'avg_sentiment']
pair_cols = features + ['views']
pair_data = df[pair_cols].dropna()

sns.pairplot(pair_data, diag_kind='kde')
# y > 1 places the title just above the grid of subplots.
plt.suptitle("Pairwise Feature Relationships", y=1.02)
plt.show()