# AI vs Human Content Detection - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the AI vs Human content detection dataset.

## Dataset Overview
- **File**: `data/raw/ai_human_content_detection_dataset.csv`
- **Task**: Binary classification (AI-generated vs Human-written)
- **Features**: Text content + multiple linguistic/stylometric features

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys

# Extend the import search path (relative path) so custom modules under src resolve
sys.path.append('../../src')

# Plot look-and-feel first, then silence warnings for cleaner cell output
plt.style.use('default')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Pandas display: never truncate the column list, cap cell text at 100 chars
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Read the raw CSV into `df`, the DataFrame the rest of the analysis uses.
# The path is relative, so it resolves against the current working directory.
data_path = '../../data/raw/ai_human_content_detection_dataset.csv'
df = pd.read_csv(data_path)

# Plain string literal: the message has no placeholders, so no f-prefix
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

## 3. Basic Dataset Information

In [None]:
# Headline size figures: rows, columns, and deep memory footprint in MiB
n_samples, n_columns = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print("=== DATASET OVERVIEW ===")
print(f"Number of samples: {n_samples:,}")
print(f"Number of features: {n_columns}")
print(f"Memory usage: {memory_mb:.2f} MB")

# info() writes its per-column report straight to stdout
print("\n=== COLUMN INFORMATION ===")
df.info()

In [None]:
# Peek at the leading rows; the trailing bare expression is what the cell displays
print("=== FIRST 5 ROWS ===")
preview_rows = df.head(5)
preview_rows

In [None]:
# Descriptive statistics via describe(); the trailing bare expression is what the cell displays
print("=== STATISTICAL SUMMARY ===")
summary_table = df.describe()
summary_table

## 4. Missing Values Analysis

In [None]:
# Count missing entries per column and express them as a percentage of all rows
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
}).sort_values('Missing Count', ascending=False)

# Filter once: only columns that actually contain gaps are reported and plotted
missing_cols = missing_df[missing_df['Missing Count'] > 0]

print("=== MISSING VALUES ===")
if missing_cols.empty:
    # Nothing to tabulate or plot; avoid printing an empty-frame dump
    print("✅ No missing values found in the dataset!")
else:
    print(missing_cols)

    # Bar chart of the missing share per affected column
    plt.figure(figsize=(10, 6))
    plt.bar(missing_cols.index, missing_cols['Missing Percentage'])
    plt.title('Missing Values by Column')
    plt.xlabel('Columns')
    plt.ylabel('Missing Percentage (%)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 5. Target Variable Analysis

In [None]:
# Analyze the target variable (label): counts, percentages, plots, balance check
print("=== TARGET VARIABLE ANALYSIS ===")


def _label_name(label):
    """Return the display name for a label value: 0 is Human, anything else AI-Generated."""
    return "Human" if label == 0 else "AI-Generated"


print("\nLabel distribution:")
label_counts = df['label'].value_counts().sort_index()
print(label_counts)

print("\nLabel percentages:")
label_percentages = df['label'].value_counts(normalize=True).sort_index() * 100
for label, pct in label_percentages.items():
    print(f"{_label_name(label)} ({label}): {pct:.2f}%")

# Visualize label distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Derive tick labels and colours from the classes actually present, so the
# plots still work when a class is missing or extra label values appear.
labels = [f"{_label_name(label)} ({label})" for label in label_counts.index]
palette = ['skyblue', 'lightcoral']
colors = [palette[i % len(palette)] for i in range(len(labels))]

# Bar plot
ax1.bar(labels, label_counts.values, color=colors)
ax1.set_title('Distribution of Labels')
ax1.set_ylabel('Count')
for i, v in enumerate(label_counts.values):
    # Offset in points (not data units) so the gap is the same at any scale
    ax1.annotate(str(v), (i, v), xytext=(0, 3), textcoords='offset points',
                 ha='center', va='bottom')

# Pie chart
ax2.pie(label_counts.values, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
ax2.set_title('Label Distribution')

plt.tight_layout()
plt.show()

# Check if dataset is balanced: ratio of the rarest to the most common class
balance_ratio = label_counts.min() / label_counts.max()
print(f"\nDataset balance ratio: {balance_ratio:.3f}")
if balance_ratio > 0.8:
    print("✅ Dataset is well balanced")
elif balance_ratio > 0.5:
    print("⚠️ Dataset is moderately imbalanced")
else:
    print("❌ Dataset is highly imbalanced")

## 6. Text Content Analysis

In [None]:
# Analyze text content lengths and how they differ between the two labels
print("=== TEXT CONTENT ANALYSIS ===")

# Character length of each text; stored on df because later cells read it
df['text_length'] = df['text_content'].str.len()

print("\nText length statistics:")
print(df.groupby('label')['text_length'].describe())

# Visualize text length distributions on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Overall text length distribution
axes[0, 0].hist(df['text_length'], bins=50, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Overall Text Length Distribution')
axes[0, 0].set_xlabel('Text Length (characters)')
axes[0, 0].set_ylabel('Frequency')

# Text length by label; sorted so legend order does not depend on row order
for label in sorted(df['label'].unique()):
    label_name = "Human" if label == 0 else "AI-Generated"
    data = df.loc[df['label'] == label, 'text_length']
    axes[0, 1].hist(data, bins=30, alpha=0.6, label=label_name)
axes[0, 1].set_title('Text Length Distribution by Label')
axes[0, 1].set_xlabel('Text Length (characters)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].legend()

# Box plot comparison
df.boxplot(column='text_length', by='label', ax=axes[1, 0])
# boxplot(by=...) adds a figure-level "Boxplot grouped by ..." suptitle that
# would caption the entire 2x2 grid; clear it and keep the per-axes title.
fig.suptitle('')
axes[1, 0].set_title('Text Length by Label (Box Plot)')
axes[1, 0].set_xlabel('Label')
axes[1, 0].set_ylabel('Text Length')

# Word count vs character count, coloured by label
scatter = axes[1, 1].scatter(df['word_count'], df['character_count'],
                             c=df['label'], alpha=0.6, cmap='coolwarm')
axes[1, 1].set_title('Word Count vs Character Count')
axes[1, 1].set_xlabel('Word Count')
axes[1, 1].set_ylabel('Character Count')
plt.colorbar(scatter, ax=axes[1, 1], label='Label')

plt.tight_layout()
plt.show()

## 7. Linguistic Features Analysis

In [None]:
# Analyze linguistic features: per-label means, differences, and histograms
print("=== LINGUISTIC FEATURES ANALYSIS ===")

# Numeric feature columns to analyze; later cells reuse this list
linguistic_features = [
    'word_count', 'character_count', 'sentence_count', 'lexical_diversity',
    'avg_sentence_length', 'avg_word_length', 'punctuation_ratio',
    'flesch_reading_ease', 'gunning_fog_index', 'grammar_errors',
    'passive_voice_ratio', 'predictability_score', 'burstiness', 'sentiment_score'
]

# Statistical comparison by label
print("\nFeature comparison by label (mean values):")
feature_comparison = df.groupby('label')[linguistic_features].mean()
print(feature_comparison.round(3))

# Differences need both classes; print the header only when they are computed
if 1 in feature_comparison.index and 0 in feature_comparison.index:
    print("\nDifference (AI - Human):")
    differences = feature_comparison.loc[1] - feature_comparison.loc[0]
    print(differences.round(3))
else:
    print("\nDifference (AI - Human): skipped, both labels 0 and 1 are required")

# Grid size: three columns, as many rows as needed (ceiling division)
n_features = len(linguistic_features)
n_cols = 3
n_rows = (n_features + n_cols - 1) // n_cols

# squeeze=False always returns a 2-D array, so ravel() gives a flat list of
# axes for any grid shape without special-casing single rows or columns.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
axes = axes.ravel()

# Sorted so every subplot lists the labels in the same order
label_values = sorted(df['label'].unique())

for ax, feature in zip(axes, linguistic_features):
    for label in label_values:
        label_name = "Human" if label == 0 else "AI-Generated"
        data = df.loc[df['label'] == label, feature]
        ax.hist(data, bins=30, alpha=0.6, label=label_name)

    ax.set_title(f'{feature.replace("_", " ").title()}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Hide the unused trailing subplots of the grid
for ax in axes[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Feature Correlation Analysis

In [None]:
# Correlation analysis between the linguistic features and the label
print("=== FEATURE CORRELATION ANALYSIS ===")

# Calculate correlation matrix over the features plus the target
corr_features = linguistic_features + ['label']
correlation_matrix = df[corr_features].corr()

# Heatmap; the mask hides the diagonal and the redundant upper triangle
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": .8})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank features by absolute correlation with the label (label itself excluded)
print("\nFeatures most correlated with label:")
label_correlations = correlation_matrix['label'].abs().sort_values(ascending=False)
feature_correlations = label_correlations.drop('label')
print(feature_correlations)

# Plot top correlated features; bar length is |r|, colour encodes the sign
# (red = negative correlation with the label, blue = non-negative)
top_features = feature_correlations.head(8)
plt.figure(figsize=(10, 6))
signed_values = correlation_matrix.loc[top_features.index, 'label']
colors = ['red' if x < 0 else 'blue' for x in signed_values]
plt.barh(range(len(top_features)), top_features.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_features)), [f.replace('_', ' ').title() for f in top_features.index])
plt.xlabel('Absolute Correlation with Label')
plt.title('Top Features Correlated with Label')
# barh draws the first entry at the bottom; flip so the strongest feature is
# on top, matching the ordering used by the feature-importance plot
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 9. Sample Text Comparison

In [None]:
# Compare sample texts between AI and human
print("=== SAMPLE TEXT COMPARISON ===")


def _print_samples(heading, samples, limit=500):
    """Print a banner, then each sample truncated to `limit` characters.

    An ellipsis is appended only when a sample was actually cut.
    """
    print("\n" + "="*60)
    print(heading)
    print("="*60)
    for i, text in enumerate(samples, 1):
        print(f"\nSample {i}:")
        print(text[:limit] + ("..." if len(text) > limit else ""))
        print("-" * 50)


# First three texts of each class; missing texts are dropped first because
# NaN entries are floats and cannot be sliced or measured with len()
human_samples = df.loc[df['label'] == 0, 'text_content'].dropna().head(3)
ai_samples = df.loc[df['label'] == 1, 'text_content'].dropna().head(3)

_print_samples("HUMAN-WRITTEN SAMPLES", human_samples)
_print_samples("AI-GENERATED SAMPLES", ai_samples)

## 10. Feature Importance for Classification

In [None]:
# Rough feature ranking from a Random Forest fitted on the linguistic features
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

print("=== FEATURE IMPORTANCE ANALYSIS ===")

# Feature matrix (missing values replaced with 0) and target vector
y = df['label']
X = df[linguistic_features].fillna(0)

# Standardize the features before fitting
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a 100-tree forest with a fixed seed
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_scaled, y)

# Pair each feature with its importance and order from most to least important
feature_importance = pd.DataFrame(
    {'feature': linguistic_features, 'importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nFeature Importance (Random Forest):")
print(feature_importance)

# Horizontal bar chart with the most important feature at the top
bar_positions = list(range(len(feature_importance)))
tick_labels = [name.replace('_', ' ').title() for name in feature_importance['feature']]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, feature_importance['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(tick_labels)
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance for AI vs Human Classification')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

## 11. Summary and Insights

In [None]:
# Final recap: print headline numbers and collect them in `summary_stats`
print("=== DATASET SUMMARY AND INSIGHTS ===")

# Phrase the balance insight from the measured ratio instead of assuming the
# dataset is balanced; thresholds mirror the balance check in the target analysis
if balance_ratio > 0.8:
    balance_insight = "Dataset appears well-balanced for binary classification"
elif balance_ratio > 0.5:
    balance_insight = "Dataset is moderately imbalanced; consider stratified splits"
else:
    balance_insight = "Dataset is highly imbalanced; consider resampling or class weights"

# .get() keeps the summary printable even if one class is absent from the data
human_pct = label_percentages.get(0, 0.0)
ai_pct = label_percentages.get(1, 0.0)

print(f"""    
📊 DATASET OVERVIEW:
• Total samples: {len(df):,}
• Features: {len(linguistic_features)} linguistic features + text content
• Target: Binary classification (Human=0, AI=1)
• Balance: {human_pct:.1f}% Human, {ai_pct:.1f}% AI

📈 KEY FINDINGS:
• Most discriminative features: {', '.join(top_features.head(3).index)}
• Average text length: {df['text_length'].mean():.0f} characters
• Text length range: {df['text_length'].min()} - {df['text_length'].max()} characters

🔍 INSIGHTS FOR MODELING:
• {balance_insight}
• Multiple linguistic features show correlation with AI/Human labels
• Text length and complexity metrics may be strong predictors
• Ready for feature engineering and model training

✅ NEXT STEPS:
1. Feature engineering and selection
2. Train baseline models (Logistic Regression, Random Forest)
3. Experiment with advanced models (BERT, transformers)
4. Cross-validation and hyperparameter tuning
5. Model interpretation and error analysis
""")

# Gather key statistics in one dict (held in memory only; nothing is written to disk)
summary_stats = {
    'total_samples': len(df),
    'human_samples': len(df[df['label'] == 0]),
    'ai_samples': len(df[df['label'] == 1]),
    'balance_ratio': balance_ratio,
    'avg_text_length': df['text_length'].mean(),
    'top_correlated_features': top_features.head(5).index.tolist(),
    'most_important_features': feature_importance.head(5)['feature'].tolist()
}

# Say what actually happened: the stats are collected, not persisted
print("\n📁 Summary statistics collected in `summary_stats` for future reference.")