# SocialProphet - Exploratory Data Analysis

This notebook performs initial exploration of social media engagement data.

## Contents
1. Data Loading
2. Basic Statistics
3. Temporal Analysis
4. Engagement Distribution
5. Correlation Analysis
6. Data Quality Check
7. Feature Engineering Preview
8. Save Processed Data

In [None]:
# Import required libraries (third-party analysis/plotting stack + stdlib helpers)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the `src` package importable by appending the PARENT of the current
# working directory to sys.path.
# NOTE(review): this presumably expects the kernel to run from the notebook's
# own folder, one level below the project root — confirm before relying on it.
sys.path.append(str(Path.cwd().parent))

# Import project modules (resolved through the sys.path entry added above).
# NOTE(review): `Config` is not referenced by any cell visible in this notebook.
from src.data_processing.collector import DataCollector
from src.data_processing.preprocessor import DataPreprocessor
from src.data_processing.features import FeatureEngineer
from src.utils.config import Config

# Settings
# 'seaborn-v0_8-whitegrid' is the style-sheet name used by matplotlib >= 3.6;
# older releases shipped it as 'seaborn-whitegrid'.
plt.style.use('seaborn-v0_8-whitegrid')
# None removes the column limit so wide frames are displayed in full.
pd.set_option('display.max_columns', None)

print("Libraries loaded successfully!")

## 1. Data Loading

In [None]:
# Initialize the project's data collector (used by the loading options below
# and by the save step at the end of the notebook).
collector = DataCollector()

# Load your dataset - update the path as needed
# Option 1: Load from CSV
# df = collector.load_csv('../data/raw/social_media_data.csv')

# Option 2: Load Kaggle dataset (uncomment to use)
# df = collector.load_kaggle_dataset('subashmaster0411/social-media-engagement-dataset')

# No dataset is loaded by default: this cell only prints a reminder.
# The later cells expect a DataFrame named `df`, so uncomment one of the
# options above before running them.
print("Please load your dataset by uncommenting the appropriate line above")

In [None]:
# Quick data overview
# Uncomment after loading data
# print(f"Dataset Shape: {df.shape}")
# print(f"\nColumns: {list(df.columns)}")
# print(f"\nData Types:\n{df.dtypes}")
# df.head()

## 2. Basic Statistics

In [None]:
# Numerical summary
# Uncomment after loading data
# df.describe()

In [None]:
# Missing values analysis
# Uncomment after loading data
# missing = df.isnull().sum()
# missing_pct = (missing / len(df)) * 100
# pd.DataFrame({'Missing': missing, 'Percentage': missing_pct}).sort_values('Missing', ascending=False)

## 3. Temporal Analysis

In [None]:
# Convert timestamp and analyze date range
# Uncomment after loading data
# df['timestamp'] = pd.to_datetime(df['timestamp'])
# print(f"Date Range: {df['timestamp'].min()} to {df['timestamp'].max()}")
# print(f"Total Days: {(df['timestamp'].max() - df['timestamp'].min()).days}")

In [None]:
# Engagement over time plot
def plot_engagement_over_time(df, date_col='timestamp', value_col='engagement'):
    """Draw the daily mean of ``value_col`` and its 7-day rolling mean.

    Args:
        df: DataFrame holding at least ``date_col`` and ``value_col``.
            It is not modified.
        date_col: Column used as the time axis. It must hold datetime-like
            values, because the series is resampled per calendar day.
        value_col: Numeric column that is averaged.

    Returns:
        The matplotlib figure created for the plot.
    """
    figure, axis = plt.subplots(figsize=(14, 6))

    # Index by date so the values can be bucketed into calendar days.
    per_day = df.set_index(date_col)[value_col].resample('D').mean()
    # Fixed 7-row window: the first six points have no full window and are NaN.
    smoothed = per_day.rolling(window=7).mean()

    curves = (
        (per_day, {'label': 'Daily Average'}),
        (smoothed, {'linestyle': '--', 'label': '7-day Moving Avg'}),
    )
    for series, style in curves:
        axis.plot(series.index, series.values, linewidth=2, **style)

    axis.set_xlabel('Date', fontsize=12)
    axis.set_ylabel('Engagement', fontsize=12)
    axis.set_title('Engagement Over Time', fontsize=14, fontweight='bold')
    axis.legend()
    axis.grid(True, alpha=0.3)

    plt.tight_layout()
    return figure

# Uncomment to use:
# plot_engagement_over_time(df)

In [None]:
# Hourly engagement pattern
def plot_hourly_pattern(df, date_col='timestamp', value_col='engagement'):
    """Plot the mean of ``value_col`` for each hour of the day as a bar chart.

    The input frame is left untouched: the hour is derived into a temporary
    grouping key rather than being written back as an ``'hour'`` column
    (which would also overwrite an existing column of that name).

    Args:
        df: DataFrame holding at least ``date_col`` and ``value_col``.
        date_col: Column with datetime dtype (the ``.dt`` accessor is used).
        value_col: Numeric column that is averaged per hour.

    Returns:
        The matplotlib figure created for the plot.
    """
    # Derived grouping key; naming it 'hour' keeps the result's index name.
    hour_of_day = df[date_col].dt.hour.rename('hour')
    hourly = df.groupby(hour_of_day)[value_col].mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(hourly.index, hourly.values, color='steelblue')
    ax.set_xlabel('Hour of Day', fontsize=12)
    ax.set_ylabel('Average Engagement', fontsize=12)
    ax.set_title('Engagement by Hour of Day', fontsize=14, fontweight='bold')
    # Always show the full 0-23 axis, even for hours without data.
    ax.set_xticks(range(24))

    plt.tight_layout()
    return fig

# Uncomment to use:
# plot_hourly_pattern(df)

In [None]:
# Day of week pattern
def plot_weekly_pattern(df, date_col='timestamp', value_col='engagement'):
    """Plot the mean of ``value_col`` for each day of the week as a bar chart.

    The input frame is left untouched (no ``'day_of_week'`` column is added),
    and the chart always has seven bar slots: weekdays that do not occur in
    the data get a NaN height, so the remaining bars stay under the correct
    label instead of the bar call failing on mismatched lengths.

    Args:
        df: DataFrame holding at least ``date_col`` and ``value_col``.
        date_col: Column with datetime dtype (the ``.dt`` accessor is used).
        value_col: Numeric column that is averaged per weekday.

    Returns:
        The matplotlib figure created for the plot.
    """
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    # Derived grouping key (0 = Monday ... 6 = Sunday) instead of mutating df.
    weekday = df[date_col].dt.dayofweek.rename('day_of_week')
    # Reindex so every weekday has a slot; absent weekdays become NaN.
    daily = df.groupby(weekday)[value_col].mean().reindex(range(len(days)))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(range(7), daily.values, color='coral')
    ax.set_xlabel('Day of Week', fontsize=12)
    ax.set_ylabel('Average Engagement', fontsize=12)
    ax.set_title('Engagement by Day of Week', fontsize=14, fontweight='bold')
    ax.set_xticks(range(7))
    ax.set_xticklabels(days)

    plt.tight_layout()
    return fig

# Uncomment to use:
# plot_weekly_pattern(df)

## 4. Engagement Distribution

In [None]:
# Distribution of engagement metrics
def plot_engagement_distribution(df, columns=['likes', 'comments', 'shares']):
    """Plot one histogram per engagement metric, side by side.

    Only the entries of ``columns`` that exist in ``df`` are plotted. Each
    panel also marks the column mean with a dashed red vertical line.

    Args:
        df: DataFrame containing the metric columns.
        columns: Candidate column names; missing ones are skipped. The
            default list is only read, never modified.

    Returns:
        The matplotlib figure holding all histogram panels.

    Raises:
        ValueError: If none of ``columns`` is present in ``df``. This is
            raised before any figure is created, instead of asking
            matplotlib for a grid with zero columns.
    """
    available_cols = [c for c in columns if c in df.columns]
    if not available_cols:
        raise ValueError(
            f"None of the requested columns {list(columns)} are present in the DataFrame"
        )

    # squeeze=False always yields a 2-D grid of axes, so a single panel needs
    # no special casing.
    fig, axes = plt.subplots(1, len(available_cols), figsize=(15, 5), squeeze=False)

    for ax, col in zip(axes[0], available_cols):
        values = df[col]
        mean_value = values.mean()  # computed once, used for line and label

        ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
        ax.set_xlabel(col.capitalize(), fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title(f'Distribution of {col.capitalize()}', fontsize=12)
        ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.1f}')
        ax.legend()

    plt.tight_layout()
    return fig

# Uncomment to use:
# plot_engagement_distribution(df)

In [None]:
# Box plots for outlier detection
def plot_boxplots(df, columns=['likes', 'comments', 'shares']):
    """Draw box plots of the engagement metrics on one shared axis.

    Entries of ``columns`` that are not present in ``df`` are skipped.

    Returns:
        The matplotlib figure containing the box plots.
    """
    present = [name for name in columns if name in df.columns]
    figure, axis = plt.subplots(figsize=(10, 6))

    # pandas draws one box per selected column onto the supplied axis.
    metrics = df[present]
    metrics.boxplot(ax=axis)

    axis.set_ylabel('Count', fontsize=12)
    axis.set_title(
        'Engagement Metrics - Outlier Detection',
        fontsize=14,
        fontweight='bold',
    )

    plt.tight_layout()
    return figure

# Uncomment to use:
# plot_boxplots(df)

## 5. Correlation Analysis

In [None]:
# Correlation heatmap
def plot_correlation_matrix(df):
    """Render an annotated heatmap of the correlations between numeric columns.

    Non-numeric columns are ignored; the correlation is whatever
    ``DataFrame.corr()`` computes with its default settings.

    Returns:
        The matplotlib figure containing the heatmap.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    correlations = df[numeric_names].corr()

    figure, axis = plt.subplots(figsize=(12, 10))

    heatmap_options = {
        'annot': True,        # print the coefficient inside every cell
        'fmt': '.2f',
        'cmap': 'coolwarm',
        'center': 0,          # centre the diverging colormap on zero
        'ax': axis,
        'square': True,
    }
    sns.heatmap(correlations, **heatmap_options)
    axis.set_title('Correlation Matrix', fontsize=14, fontweight='bold')

    plt.tight_layout()
    return figure

# Uncomment to use:
# plot_correlation_matrix(df)

## 6. Data Quality Check

In [None]:
# Run data validation
# preprocessor = DataPreprocessor()
# validation = preprocessor.validate_data(df)
# print("Data Validation Results:")
# print(f"Is Valid: {validation['is_valid']}")
# print(f"Issues: {validation['issues']}")
# print(f"Stats: {validation['stats']}")

## 7. Feature Engineering Preview

In [None]:
# Preview feature engineering
# feature_engineer = FeatureEngineer()
# df_features = feature_engineer.create_all_features(df)
# print(f"Original columns: {len(df.columns)}")
# print(f"After feature engineering: {len(df_features.columns)}")
# df_features.head()

## 8. Save Processed Data

In [None]:
# Clean and save data
# preprocessor = DataPreprocessor()
# df_clean = preprocessor.clean_data(df)
# df_clean = preprocessor.handle_missing_values(df_clean)
# collector.save_data(df_clean, 'cleaned_data.csv', data_type='processed')
# print("Data saved successfully!")

---

## Summary

This EDA notebook provides:
- Data loading and basic inspection
- Temporal pattern analysis
- Engagement distribution visualization
- Correlation analysis
- Data quality validation
- Feature engineering preview
- Cleaning and saving of the processed data

**Next Steps:**
1. Load your actual dataset
2. Run all analysis cells
3. Save processed data
4. Proceed to forecasting (02_Forecasting.ipynb)