# SEO Content Quality & Duplicate Detector - Enhanced Analysis

This notebook demonstrates advanced NLP analysis including sentiment analysis, named entity recognition, topic modeling, and comprehensive visualizations.

## 1. Setup and Imports

In [None]:
import sys
import os

# Add streamlit_app/utils to path
sys.path.insert(0, os.path.join('..', 'streamlit_app', 'utils'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import ne_chunk, pos_tag, word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import re

# Download NLTK data.
# nltk.data.find() needs the category-qualified path of a resource, and only
# 'punkt' lives under 'tokenizers/'. Each download id is therefore paired with
# its own lookup path; looking everything up under 'tokenizers/' misses every
# other resource and re-fetches it on each run.
NLTK_RESOURCES = {
    'vader_lexicon': 'sentiment/vader_lexicon.zip',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'maxent_ne_chunker': 'chunkers/maxent_ne_chunker',
    'words': 'corpora/words',
    'punkt': 'tokenizers/punkt',
}
# NOTE(review): newer NLTK releases may expect differently named resources
# (e.g. 'punkt_tab') — confirm against the installed NLTK version.

for resource, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Still best effort: a failed download must not stop the notebook, but
        # it is reported so that empty results further down can be traced back.
        try:
            if not nltk.download(resource, quiet=True):
                print(f"Warning: NLTK resource '{resource}' could not be downloaded")
        except Exception as exc:
            print(f"Warning: NLTK resource '{resource}' could not be downloaded: {exc}")

from parser import parse_dataset, extract_text_from_html
from features import extract_features_from_dataset, extract_features
from similarity import detect_duplicates, parse_embedding
from scorer import train_quality_model, predict_quality

# Plot defaults shared by the figures in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Setup complete!")

## 2. Load and Parse Data

In [None]:
# Clean data first: load the extracted pages and blank out missing body text
content_csv = '../data/extracted_content.csv'

df = pd.read_csv(content_csv).assign(
    body_text=lambda frame: frame['body_text'].fillna('')
)

# The cleaned table is written back over the same file, which the feature
# extraction cell reads next.
# NOTE(review): pandas reads empty CSV fields back as NaN by default, so this
# round trip may not change what a later read_csv sees — confirm in features.py.
df.to_csv(content_csv, index=False)

print(f"Loaded {len(df)} pages")
df.head()

## 3. Extract NLP Features

In [None]:
# Extract features
# Calls the project helper with the cleaned CSV as the first path. The second
# path is presumably where the feature table is written (later cells read
# '../data/features.csv' back with pd.read_csv) — confirm against features.py.
features_df, embeddings = extract_features_from_dataset(
    '../data/extracted_content.csv',
    '../data/features.csv'
)

# `embeddings` is not referenced again in the cells shown in this notebook.
print(f"\nFeatures extracted for {len(features_df)} pages")
features_df.head()

## 4. Advanced NLP Analysis

### 4.1 Sentiment Analysis

In [None]:
# Perform sentiment analysis using VADER
print("Performing sentiment analysis...")
sia = SentimentIntensityAnalyzer()


def _vader_scores(text):
    """Return VADER polarity scores for ``text``, or all zeros when it is empty or 10 characters or fewer."""
    if text and len(text) > 10:
        return sia.polarity_scores(text[:5000])  # Limit text length
    return {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}


def _sentiment_label(compound):
    """Map a compound score to 'Positive' (> 0.05), 'Negative' (< -0.05) or 'Neutral'."""
    if compound > 0.05:
        return 'Positive'
    if compound < -0.05:
        return 'Negative'
    return 'Neutral'


# Sample first 100 for speed
sentiments = [_vader_scores(text) for text in df['body_text'][:100]]
sentiment_df = pd.DataFrame(sentiments)

# Classify sentiment
sentiment_df['sentiment'] = sentiment_df['compound'].apply(_sentiment_label)

print("\nSentiment Distribution:")
print(sentiment_df['sentiment'].value_counts())
sentiment_df.head()

In [None]:
# Visualize sentiment distribution
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# One fixed colour per label. value_counts() orders its index by frequency, so
# a positional colour list would paint whichever label is most common green.
sentiment_colors = {'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}

# Sentiment counts
sentiment_counts = sentiment_df['sentiment'].value_counts()
bar_colors = [sentiment_colors.get(label, 'gray') for label in sentiment_counts.index]
axes[0].bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)
axes[0].set_title('Sentiment Distribution')
axes[0].set_ylabel('Count')

# Compound score distribution
axes[1].hist(sentiment_df['compound'], bins=30, edgecolor='black', alpha=0.7)
axes[1].axvline(0, color='red', linestyle='--', label='Neutral')
axes[1].set_xlabel('Compound Score')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Compound Sentiment Score Distribution')
axes[1].legend()

# Positive/Negative/Neutral proportions
sentiment_props = sentiment_df[['pos', 'neu', 'neg']].mean()
if sentiment_props.sum() > 0:
    axes[2].pie(
        sentiment_props,
        labels=['Positive', 'Neutral', 'Negative'],
        autopct='%1.1f%%',
        colors=[sentiment_colors['Positive'], sentiment_colors['Neutral'], sentiment_colors['Negative']],
    )
else:
    # Every sampled text received the all-zero placeholder scores, so there is
    # nothing for a pie chart to normalise.
    axes[2].text(0.5, 0.5, 'No sentiment scores available', ha='center', va='center')
    axes[2].axis('off')
axes[2].set_title('Average Sentiment Proportions')

plt.tight_layout()
plt.show()

### 4.2 Named Entity Recognition (NER)

In [None]:
# Extract named entities from sample texts
print("Extracting named entities...")

def extract_entities(text, max_length=1000):
    """Extract named entities from the beginning of ``text``.

    Args:
        text: Document text. Anything that is not a string of at least 10
            characters (``None``, ``NaN``, ``''``, numbers, ...) yields no
            entities instead of raising.
        max_length: Only the first ``max_length`` characters are analysed,
            to keep the call fast.

    Returns:
        A list of ``(entity, label)`` tuples, one per labelled chunk produced
        by ``ne_chunk``; the entity is the chunk's tokens joined by spaces.
        The list is empty when the text is unusable or NLTK processing fails.
    """
    # Type check first: len() on a float such as NaN would raise TypeError.
    if not isinstance(text, str) or len(text) < 10:
        return []

    try:
        # Limit text length for performance
        text_sample = text[:max_length]
        tokens = word_tokenize(text_sample)
        pos_tags = pos_tag(tokens)
        chunks = ne_chunk(pos_tags, binary=False)

        # Only labelled subtrees are entities; plain (token, tag) leaves are skipped.
        return [
            (' '.join(c[0] for c in chunk), chunk.label())
            for chunk in chunks
            if hasattr(chunk, 'label')
        ]
    except Exception:
        # Best effort: a document NLTK cannot process contributes no entities.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # through so the kernel can still be interrupted.
        return []

# Extract entities from first 20 documents
all_entities = [
    entity
    for text in df['body_text'][:20]
    for entity in extract_entities(text)
]

# Count entity types (second tuple item) and entity names (first tuple item)
entity_types = Counter(label for _, label in all_entities)
entity_names = Counter(name for name, _ in all_entities)

print(f"\nTotal entities found: {len(all_entities)}")

# Same report twice: heading, then the N most common entries of a counter
for heading, counter, limit in (
    ("\nTop entity types:", entity_types, 5),
    ("\nTop entities:", entity_names, 10),
):
    print(heading)
    for label, count in counter.most_common(limit):
        print(f"  {label}: {count}")

In [None]:
# Visualize entity types
if entity_types:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: 10 most common entity types; right panel: 15 most common entities
    entity_type_df = pd.DataFrame(entity_types.most_common(10), columns=['Type', 'Count'])
    entity_name_df = pd.DataFrame(entity_names.most_common(15), columns=['Entity', 'Count'])

    panels = (
        (axes[0], entity_type_df, 'Type', 'Top Named Entity Types'),
        (axes[1], entity_name_df, 'Entity', 'Top Named Entities'),
    )
    for ax, frame, label_column, title in panels:
        ax.barh(frame[label_column], frame['Count'])
        ax.set_xlabel('Count')
        ax.set_title(title)
        ax.invert_yaxis()  # most frequent entry at the top

    plt.tight_layout()
    plt.show()

### 4.3 Topic Modeling with LDA

In [None]:
# Perform topic modeling using LDA
print("Performing topic modeling...")

n_topics = 5

# Prepare texts (sample first 100 for speed), keeping only those longer than 50 characters
texts_sample = list(
    filter(lambda text: text and len(text) > 50, df['body_text'][:100])
)

# Create document-term matrix
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.8,
)
doc_term_matrix = vectorizer.fit_transform(texts_sample)
feature_names = vectorizer.get_feature_names_out()

# Fit LDA model (fixed random_state so repeated runs use the same seed)
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20, random_state=42)
lda.fit(doc_term_matrix)
# Display topics
def display_topics(model, feature_names, n_top_words=10):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(top_words)
        print(f"\nTopic {topic_idx + 1}:")
        print(", ".join(top_words))
    return topics

# Print the top 10 words of each fitted LDA topic and keep the word lists in `topics`.
topics = display_topics(lda, feature_names, n_top_words=10)

In [None]:
# Visualize topic word weights
# The grid is sized from n_topics rather than assuming five topics in a 2x3
# layout, so changing n_topics neither overflows the axes nor leaves more than
# the unused panels blank. With n_topics = 5 this is the same 2x3, 16x10 figure.
n_cols = 3
n_rows = -(-n_topics // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for topic_idx in range(n_topics):
    topic_weights = lda.components_[topic_idx]
    # argsort is ascending: take the last 10 and reverse for strongest-first
    top_words_idx = topic_weights.argsort()[-10:][::-1]
    top_words = [feature_names[i] for i in top_words_idx]
    weights = [topic_weights[i] for i in top_words_idx]

    axes[topic_idx].barh(top_words, weights)
    axes[topic_idx].set_title(f'Topic {topic_idx + 1}')
    axes[topic_idx].set_xlabel('Weight')
    axes[topic_idx].invert_yaxis()

# Hide every subplot that has no topic
for unused_ax in axes[n_topics:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

### 4.4 Improved Keyword Extraction

In [None]:
# Extract keywords using TF-IDF
print("Extracting keywords with TF-IDF...")

tfidf = TfidfVectorizer(
    max_features=50,
    ngram_range=(1, 2),  # Include bigrams
    stop_words='english',
    max_df=0.8,
    min_df=2
)

tfidf_matrix = tfidf.fit_transform(texts_sample)
# Kept under its own name: rebinding `feature_names` here would replace the
# LDA vocabulary that the topic visualisation cell indexes, so re-running that
# cell afterwards would label topics with the wrong words or fail.
keyword_names = tfidf.get_feature_names_out()

# Get average TF-IDF scores (mean over documents, one value per term)
avg_tfidf = np.mean(tfidf_matrix.toarray(), axis=0)
# argsort is ascending: take the last 20 and reverse for highest-first
top_keywords_idx = avg_tfidf.argsort()[-20:][::-1]
top_keywords = [(keyword_names[i], avg_tfidf[i]) for i in top_keywords_idx]

print("\nTop 20 Keywords (by TF-IDF):")
for keyword, score in top_keywords:
    print(f"  {keyword}: {score:.4f}")

In [None]:
# Visualize top keywords
keyword_df = pd.DataFrame(top_keywords, columns=['Keyword', 'TF-IDF Score'])

# Horizontal bars, highest-scoring keyword at the top
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(keyword_df['Keyword'], keyword_df['TF-IDF Score'])
ax.set_xlabel('Average TF-IDF Score')
ax.set_title('Top 20 Keywords by TF-IDF')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 5. Comprehensive Visualizations

### 5.1 Word Clouds

In [None]:
# Generate one word cloud from the body text of the first 100 pages
print("Generating word clouds...")

# Combine all text
all_text = ' '.join(df['body_text'][:100].dropna())

# Create word cloud
wordcloud = WordCloud(
    width=1200,
    height=600,
    background_color='white',
    # `stopwords` takes a set of words, not a language name: the string
    # 'english' is iterated character by character, so only single letters
    # would be filtered. None selects wordcloud's built-in STOPWORDS.
    stopwords=None,
    max_words=100,
    colormap='viridis'
).generate(all_text)

plt.figure(figsize=(14, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud - All Content', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

### 5.2 Similarity Heatmap

In [None]:
# Load features and compute similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

features_df = pd.read_csv('../data/features.csv')

# Parse embeddings for sample (first 30 for visualization)
sample_size = min(30, len(features_df))
embeddings_sample = []

for idx in range(sample_size):
    emb = parse_embedding(features_df.iloc[idx]['embedding'])
    if emb is not None:
        embeddings_sample.append(emb)

embeddings_sample = np.array(embeddings_sample)

# Rows whose embedding could not be parsed are dropped above, so the number of
# pages actually plotted can be smaller than sample_size. Heatmap indices are
# positions within the parsed sample.
n_plotted = len(embeddings_sample)

if n_plotted == 0:
    # cosine_similarity cannot run on an empty array
    print("No embeddings could be parsed; skipping similarity heatmap")
else:
    # Compute similarity matrix
    similarity_matrix = cosine_similarity(embeddings_sample)

    # Visualize similarity heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        cmap='YlOrRd',
        vmin=0,
        vmax=1,
        square=True,
        cbar_kws={'label': 'Cosine Similarity'}
    )
    plt.title(f'Content Similarity Heatmap (Sample of {n_plotted} Pages)', fontsize=14, pad=20)
    plt.xlabel('Page Index')
    plt.ylabel('Page Index')
    plt.tight_layout()
    plt.show()

    # Upper triangle without the diagonal: each distinct page pair exactly once
    pairwise_similarity = similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)]
    if pairwise_similarity.size > 0:
        print(f"\nAverage similarity: {pairwise_similarity.mean():.4f}")
        print(f"Max similarity (excluding diagonal): {np.max(pairwise_similarity):.4f}")
    else:
        # A single page has no pairs; np.max on an empty array would raise
        print("\nAt least two pages are needed to report pairwise similarity")

### 5.3 Feature Importance and Model Performance

In [None]:
# Train model
# Of the returned values, only `metrics` is read by the later cells shown in
# this notebook. The returned feature list is bound to `model_feature_names`
# so it does not overwrite `feature_names`, the LDA vocabulary that the topic
# visualisation cell indexes when it is re-run.
model, X_test, y_test, y_pred, model_feature_names, metrics = train_quality_model(
    '../data/features.csv',
    '../streamlit_app/models/quality_model.pkl',
    test_size=0.3,
    random_state=42
)

In [None]:
# Comprehensive model visualizations: a 3x2 grid whose middle row is one wide panel
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

quality_labels = ['Low', 'Medium', 'High']

# --- Row 0, left: feature importance ---
ax1 = fig.add_subplot(gs[0, 0])
feature_importance = metrics['feature_importance']
ax1.barh(
    feature_importance['feature'],
    feature_importance['importance'],
    color='steelblue',
)
ax1.set_xlabel('Importance')
ax1.set_title('Feature Importance')
ax1.invert_yaxis()

# --- Row 0, right: confusion matrix ---
# NOTE(review): the tick labels assume the matrix rows/columns are ordered
# Low, Medium, High — confirm against train_quality_model.
ax2 = fig.add_subplot(gs[0, 1])
cm = metrics['confusion_matrix']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=quality_labels,
    yticklabels=quality_labels,
    ax=ax2,
)
ax2.set_xlabel('Predicted')
ax2.set_ylabel('Actual')
ax2.set_title('Confusion Matrix')

# --- Row 1: model vs. baseline, accuracy and F1 side by side ---
ax3 = fig.add_subplot(gs[1, :])
models = ['RandomForest', 'Baseline']
accuracy_scores = [metrics['accuracy'], metrics['baseline_accuracy']]
f1_scores = [metrics['f1_score'], metrics['baseline_f1']]

x = np.arange(len(models))
width = 0.35

# (horizontal offset, values, legend label, bar colour) per metric
bar_series = (
    (-width / 2, accuracy_scores, 'Accuracy', 'skyblue'),
    (width / 2, f1_scores, 'F1 Score', 'lightcoral'),
)
for offset, scores, series_label, series_color in bar_series:
    ax3.bar(x + offset, scores, width, label=series_label, color=series_color)

ax3.set_ylabel('Score')
ax3.set_title('Model Performance Comparison')
ax3.set_xticks(x)
ax3.set_xticklabels(models)
ax3.legend()
ax3.set_ylim([0, 1])

# Add value labels on bars
for offset, scores, _series_label, _series_color in bar_series:
    for i, v in enumerate(scores):
        ax3.text(i + offset, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

# --- Row 2: only drawn when the feature table carries predictions ---
features_df = pd.read_csv('../data/features.csv')
if 'predicted_quality' in features_df.columns:
    # Left: word count against readability, one scatter series per quality level
    ax4 = fig.add_subplot(gs[2, 0])
    for quality in quality_labels:
        subset = features_df[features_df['predicted_quality'] == quality]
        if len(subset) == 0:
            continue
        ax4.scatter(
            subset['word_count'],
            subset['flesch_reading_ease'],
            alpha=0.6,
            label=quality,
            s=50,
        )
    ax4.set_xlabel('Word Count')
    ax4.set_ylabel('Flesch Reading Ease')
    ax4.set_title('Quality Distribution by Features')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    # Right: number of pages per predicted quality level
    ax5 = fig.add_subplot(gs[2, 1])
    quality_counts = features_df['predicted_quality'].value_counts()
    quality_colors = {'High': 'green', 'Medium': 'orange', 'Low': 'red'}
    bar_colors = [quality_colors.get(level, 'gray') for level in quality_counts.index]
    ax5.bar(quality_counts.index, quality_counts.values, color=bar_colors, alpha=0.7)
    ax5.set_xlabel('Quality Level')
    ax5.set_ylabel('Count')
    ax5.set_title('Predicted Quality Distribution')

plt.show()

### 5.4 Distribution Charts

In [None]:
# Comprehensive distribution analysis: histograms on the top row, box plots below
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# (column, axis label, histogram title, box plot title, summary heading, histogram colour)
# A colour of None keeps matplotlib's default.
metric_specs = [
    ('word_count', 'Word Count',
     'Word Count Distribution', 'Word Count Box Plot', "\nWord Count:", None),
    ('sentence_count', 'Sentence Count',
     'Sentence Count Distribution', 'Sentence Count Box Plot', "\nSentence Count:", 'orange'),
    ('flesch_reading_ease', 'Flesch Reading Ease',
     'Readability Distribution', 'Readability Box Plot', "\nReadability:", 'green'),
]

for col_idx, (column, axis_label, hist_title, box_title, _heading, hist_color) in enumerate(metric_specs):
    values = features_df[column]

    # Histogram
    hist_ax = axes[0, col_idx]
    hist_ax.hist(values, bins=50, edgecolor='black', alpha=0.7, color=hist_color)
    if column == 'word_count':
        # Mark the centre of the distribution
        hist_ax.axvline(values.mean(), color='red', linestyle='--', label='Mean')
        hist_ax.axvline(values.median(), color='green', linestyle='--', label='Median')
    elif column == 'flesch_reading_ease':
        # Mark the 50-70 band labelled as optimal
        hist_ax.axvline(50, color='red', linestyle='--', alpha=0.5, label='Min Optimal')
        hist_ax.axvline(70, color='red', linestyle='--', alpha=0.5, label='Max Optimal')
    hist_ax.set_xlabel(axis_label)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(hist_title)
    if column != 'sentence_count':
        # Only the panels with reference lines have legend entries
        hist_ax.legend()

    # Box plot
    box_ax = axes[1, col_idx]
    box_ax.boxplot([values])
    box_ax.set_ylabel(axis_label)
    box_ax.set_title(box_title)
    box_ax.set_xticklabels([''])

plt.tight_layout()
plt.show()

# Print statistics
print("\n=== Statistical Summary ===")
for column, _axis_label, _hist_title, _box_title, heading, _hist_color in metric_specs:
    print(heading)
    print(features_df[column].describe())

## 6. Duplicate Detection Visualization

In [None]:
# Detect duplicates
# NOTE: this rebinds `similarity_matrix`, replacing the sample matrix built
# for the heatmap section.
duplicates_df, similarity_matrix, summary_stats = detect_duplicates(
    '../data/features.csv',
    '../data/duplicates.csv',
    threshold=0.80,
    thin_threshold=500
)

print(f"\n{summary_stats['duplicate_pairs']} duplicate pairs found")

# Keep the preview as the cell's final top-level expression: Jupyter renders a
# bare expression only when it is the last statement of the cell, so a head()
# call nested inside an `if` is never displayed. An empty frame simply shows
# an empty table.
duplicates_df.head(10)

In [None]:
# Visualize duplicate analysis
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
similarity_ax, word_count_ax = axes[0]
thin_ax, summary_ax = axes[1]

# The 0.80 and 500 literals below repeat the threshold / thin_threshold values
# passed to detect_duplicates; keep them in sync with that call.
if len(duplicates_df) == 0:
    # Nothing to plot: label both top panels instead of leaving them empty
    similarity_ax.text(0.5, 0.5, 'No duplicates found', ha='center', va='center')
    similarity_ax.set_title('Duplicate Similarity Distribution')
    word_count_ax.text(0.5, 0.5, 'No duplicates found', ha='center', va='center')
    word_count_ax.set_title('Word Count Comparison')
else:
    # Similarity score distribution
    similarity_ax.hist(duplicates_df['similarity'], bins=20, edgecolor='black', alpha=0.7)
    similarity_ax.axvline(0.80, color='red', linestyle='--', label='Threshold')
    similarity_ax.set_xlabel('Similarity Score')
    similarity_ax.set_ylabel('Frequency')
    similarity_ax.set_title('Duplicate Similarity Distribution')
    similarity_ax.legend()

    # Word count comparison for duplicates, with a y = x reference diagonal
    word_count_ax.scatter(duplicates_df['word_count_1'], duplicates_df['word_count_2'], alpha=0.6)
    largest_word_count = duplicates_df[['word_count_1', 'word_count_2']].max().max()
    word_count_ax.plot(
        [0, largest_word_count],
        [0, largest_word_count],
        'r--',
        alpha=0.5,
    )
    word_count_ax.set_xlabel('Word Count (URL 1)')
    word_count_ax.set_ylabel('Word Count (URL 2)')
    word_count_ax.set_title('Word Count Comparison for Duplicates')

# Thin content analysis
word_counts = features_df['word_count']
thin_content = features_df[word_counts < 500]
normal_content = features_df[word_counts >= 500]

thin_ax.bar(
    ['Thin (<500)', 'Normal (>=500)'],
    [len(thin_content), len(normal_content)],
    color=['red', 'green'],
    alpha=0.7,
)
thin_ax.set_ylabel('Count')
thin_ax.set_title('Thin Content Detection')

# Summary statistics, rendered as text in an axis-less panel
summary_text = f"""
Total Pages: {summary_stats['total_pages']}
Duplicate Pairs: {summary_stats['duplicate_pairs']}
Thin Pages: {summary_stats['thin_pages']}
Similarity Threshold: {summary_stats['similarity_threshold']}
Thin Threshold: {summary_stats['thin_threshold']} words
"""
summary_ax.text(0.1, 0.5, summary_text, fontsize=12, verticalalignment='center')
summary_ax.axis('off')
summary_ax.set_title('Summary Statistics')

plt.tight_layout()
plt.show()

## 7. Summary

This enhanced analysis includes:
- **Sentiment Analysis**: VADER-based sentiment scoring
- **Named Entity Recognition**: Extraction and categorization of entities
- **Topic Modeling**: LDA-based topic discovery
- **Advanced Keywords**: TF-IDF with n-grams
- **Comprehensive Visualizations**: Heatmaps, word clouds, distributions

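All results demonstrate the power of NLP for SEO content analysis. Note that several steps work on a sample rather than the full dataset: sentiment analysis, topic modeling, keyword extraction, and the word cloud use the first 100 pages, named entity recognition uses the first 20, and the similarity heatmap uses the first 30.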