In [20]:
# Imports & Directory Setup
import os
import re
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import textstat

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import joblib

# Optional: sentence transformers for better embeddings
try:
    from sentence_transformers import SentenceTransformer
    SENTE = True
except ImportError:
    SENTE = False
    print("‚ö†Ô∏è sentence-transformers not installed. Using TF-IDF embeddings instead.")

# Download NLTK sentence-tokenizer data.
# Older NLTK releases load 'punkt'; NLTK >= 3.8.2 makes sent_tokenize look for
# 'punkt_tab' instead. Fetch both so the notebook runs on a fresh environment
# regardless of the installed NLTK version (already-present packages are skipped).
nltk.download('punkt')
nltk.download('punkt_tab')


# Directory Structure
# When launched from a "notebooks" sub-folder, the project root is its parent;
# otherwise the current working directory is taken as the root.
ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_DIR = ROOT / "data"
MODELS_DIR = ROOT / "models"

# File paths
INPUT_CSV = DATA_DIR / "data.csv"                  # raw input, read by the parsing cell
EXTRACTED_CSV = DATA_DIR / "extracted_content.csv"  # parsed title/body/word count
FEATURES_CSV = DATA_DIR / "features.csv"            # engineered features + embeddings
DUPLICATES_CSV = DATA_DIR / "duplicates.csv"        # near-duplicate URL pairs
MODEL_PATH = MODELS_DIR / "quality_model.pkl"       # persisted quality classifier

print("‚úÖ Setup complete")
print("Root Directory:", ROOT)


‚ö†Ô∏è sentence-transformers not installed. Using TF-IDF embeddings instead.
‚úÖ Setup complete
Root Directory: C:\Users\aishw\seo-content-detector


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aishw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### HTML Parsing

In [21]:
# HTML Parsing Functions
def extract_from_html(html: str):
    """Extract the title, main body text, and word count from raw HTML.

    Args:
        html: Raw HTML markup. Values that are not ``str``/``bytes`` (for
            example NaN from an empty CSV cell) and blank strings are treated
            as "no content".

    Returns:
        dict with keys ``title`` (str), ``body_text`` (str with whitespace
        runs collapsed to single spaces) and ``word_count`` (int, number of
        whitespace-separated tokens in ``body_text``). All three are empty/zero
        when the input is missing or parsing fails.
    """
    empty = {"title": "", "body_text": "", "word_count": 0}

    # Nothing to parse: skip the parser instead of relying on it to raise.
    if not isinstance(html, (str, bytes)) or not html.strip():
        return dict(empty)

    try:
        soup = BeautifulSoup(html, "lxml")

        # `.string` is None when <title> is empty or contains nested tags, and
        # calling .strip() on it used to throw away the whole page via the
        # except below. get_text() copes with both cases.
        title = soup.title.get_text(" ", strip=True) if soup.title else ""

        # Prefer <article> or <main> for main content
        main_section = soup.find("article") or soup.find("main")
        if main_section:
            text = main_section.get_text(separator=" ", strip=True)
        else:
            # Fallback: use all paragraphs and headings
            tags = soup.find_all(["p", "h1", "h2", "h3"])
            text = " ".join([t.get_text(" ", strip=True) for t in tags])

        text = re.sub(r"\s+", " ", text).strip()
        return {
            "title": title,
            "body_text": text,
            "word_count": len(text.split())
        }
    except Exception:
        # Best-effort by design: one unparseable page must not abort the batch.
        return dict(empty)


In [22]:
# Read dataset & parse HTML
df = pd.read_csv(INPUT_CSV)
print("Rows in dataset:", len(df))

# Fail fast, before any parsing work, if a required column is missing.
if 'html_content' not in df.columns:
    raise ValueError("Expected 'html_content' column in data.csv.")
if 'url' not in df.columns:
    raise ValueError("Expected 'url' column in data.csv.")

# Iterate the two needed columns directly; iterrows() would build a Series per row.
records = []
for page_url, page_html in tqdm(
    zip(df['url'], df['html_content']), total=len(df), desc="Parsing HTML"
):
    parsed = extract_from_html(page_html)
    records.append({
        "url": page_url,
        "title": parsed['title'],
        "body_text": parsed['body_text'],
        "word_count": parsed['word_count']
    })

# Explicit columns keep the schema intact even when the dataset has zero rows,
# so later cells can still select 'body_text' / 'word_count'.
extracted_df = pd.DataFrame(records, columns=["url", "title", "body_text", "word_count"])
extracted_df.to_csv(EXTRACTED_CSV, index=False)
print(f"‚úÖ Extracted content saved to {EXTRACTED_CSV}")
extracted_df.head()

Rows in dataset: 81


Parsing HTML:   0%|          | 0/81 [00:00<?, ?it/s]

‚úÖ Extracted content saved to C:\Users\aishw\seo-content-detector\data\extracted_content.csv


Unnamed: 0,url,title,body_text,word_count
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Cyber Crisis Tabletop Exercise Cyber Security ...,337
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,Cybersecurity is gaining more importance globa...,1700
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,Home Insights Blog Posts 11 Cyber Defense Tips...,1058
3,https://www.cisa.gov/topics/cybersecurity-best...,Cybersecurity Best Practices | Cybersecurity a...,Cybersecurity Best Practices CISA provides inf...,826
4,https://www.qnbtrust.bank/Resources/Learning-C...,,,0


### Text Preprocessing & Feature Engineering

In [23]:
# Text Cleaning & Feature Engineering
def clean_text(text):
    """Return ``text`` coerced to ``str``, lower-cased, with whitespace runs
    collapsed to one space and leading/trailing whitespace removed."""
    lowered = str(text).lower()
    return re.sub(r"\s+", " ", lowered).strip()

def sentence_count(text):
    """Count the sentences ``sent_tokenize`` finds in ``text``; falsy input counts as 0."""
    if not text:
        return 0
    return len(sent_tokenize(text))

# Per-page text features, all derived from the normalised body text.
extracted_df['clean_text'] = extracted_df['body_text'].map(clean_text)
extracted_df['sentence_count'] = extracted_df['clean_text'].map(sentence_count)
extracted_df['flesch_reading_ease'] = extracted_df['clean_text'].map(
    # Empty pages get a readability of 0 rather than being scored.
    lambda doc: textstat.flesch_reading_ease(doc) if doc else 0
)
# Pages under 500 words are flagged as thin content.
extracted_df['is_thin'] = extracted_df['word_count'].lt(500)

# TF-IDF keywords: fit one vocabulary (English stop words removed, capped at
# 5000 terms) over every page and keep the term list for keyword lookup.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(extracted_df['clean_text'])
features = vectorizer.get_feature_names_out()

def get_top_keywords(row, n=5):
    """Return the highest-weighted terms of one TF-IDF row, joined by ``|``.

    Args:
        row: Object exposing ``toarray()`` (here, a single row sliced from
            ``tfidf_matrix``).
        n: Maximum number of terms to return.

    Returns:
        Terms looked up in the module-level ``features`` array, ordered from
        highest weight down; terms whose weight is not positive are skipped.
    """
    weights = row.toarray().flatten()
    ranked = weights.argsort()[-n:][::-1]  # last n of ascending order, reversed
    keywords = []
    for idx in ranked:
        if weights[idx] > 0:
            keywords.append(features[idx])
    return "|".join(keywords)

# One keyword string per page, in row order.
top_keywords = [
    get_top_keywords(tfidf_matrix[row_idx])
    for row_idx in range(tfidf_matrix.shape[0])
]
extracted_df['top_keywords'] = top_keywords

# Embeddings: the dense TF-IDF matrix unless sentence-transformers is available.
if not SENTE:
    print("‚öôÔ∏è Using TF-IDF as embeddings.")
    embeddings = tfidf_matrix.toarray()
else:
    print("üîπ Using SentenceTransformer embeddings...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(extracted_df['clean_text'], show_progress_bar=True)

# Store each vector as a JSON list of plain floats so it fits in a CSV cell.
extracted_df['embedding'] = [
    json.dumps([float(value) for value in vector]) for vector in embeddings
]

# Save features
feature_columns = [
    'url',
    'word_count',
    'sentence_count',
    'flesch_reading_ease',
    'top_keywords',
    'is_thin',
    'embedding',
]
features_df = extracted_df[feature_columns]
features_df.to_csv(FEATURES_CSV, index=False)
print(f"‚úÖ Features saved to {FEATURES_CSV}")
features_df.head()

‚öôÔ∏è Using TF-IDF as embeddings.
‚úÖ Features saved to C:\Users\aishw\seo-content-detector\data\features.csv


Unnamed: 0,url,word_count,sentence_count,flesch_reading_ease,top_keywords,is_thin,embedding
0,https://www.cm-alliance.com/cybersecurity-blog,337,6,-11.155568,cyber|cybersecurity|alliance|training|consultancy,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,https://www.varonis.com/blog/cybersecurity-tips,1700,92,41.465,varonis|data|access|security|app,False,"[0.0, 0.017555493119617405, 0.0, 0.0, 0.0, 0.0..."
2,https://www.cisecurity.org/insights/blog/11-cy...,1058,62,53.262918,password|passphrase|don|authentication|cyber,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,https://www.cisa.gov/topics/cybersecurity-best...,826,27,-2.538002,cisa|cybersecurity|cyber|practices|resilience,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,https://www.qnbtrust.bank/Resources/Learning-C...,0,0,0.0,,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Duplicate Detection

In [24]:
#  Duplicate Detection
print("üîç Calculating cosine similarities...")
cos_sim = cosine_similarity(embeddings)

# Pairs scoring at or above this cosine similarity are reported as duplicates.
SIM_THRESHOLD = 0.8

# cos_sim rows follow row *position*, so look URLs up positionally rather than
# by index label (the two only coincide for a default RangeIndex).
page_urls = extracted_df['url'].to_numpy()

# The strict upper triangle (k=1) visits every unordered pair exactly once and
# skips self-pairs, in the same order a nested i < j loop would.
left_idx, right_idx = np.triu_indices(len(page_urls), k=1)
pair_scores = cos_sim[left_idx, right_idx]
is_duplicate = pair_scores >= SIM_THRESHOLD

pairs = [
    {"url1": page_urls[a], "url2": page_urls[b], "similarity": float(score)}
    for a, b, score in zip(
        left_idx[is_duplicate], right_idx[is_duplicate], pair_scores[is_duplicate]
    )
]

# Explicit columns keep the CSV header when no pairs are found; an empty
# DataFrame with no columns would otherwise write a file that cannot be read back.
duplicates_df = pd.DataFrame(pairs, columns=["url1", "url2", "similarity"])
duplicates_df.to_csv(DUPLICATES_CSV, index=False)
print(f"‚úÖ Duplicates saved to {DUPLICATES_CSV}")
print(f"Total duplicate pairs found: {len(duplicates_df)}")

print("Total pages:", len(extracted_df))
print("Thin content pages:", extracted_df['is_thin'].sum())
duplicates_df.head()


üîç Calculating cosine similarities...
‚úÖ Duplicates saved to C:\Users\aishw\seo-content-detector\data\duplicates.csv
Total duplicate pairs found: 3
Total pages: 81
Thin content pages: 32


Unnamed: 0,url1,url2,similarity
0,https://en.wikipedia.org/wiki/SD-WAN,https://www.cisco.com/site/us/en/learn/topics/...,0.814697
1,https://nytlicensing.com/latest/trends/content...,https://copyblogger.com/content-marketing/,0.801423
2,https://nytlicensing.com/latest/trends/content...,https://www.coursera.org/articles/content-stra...,0.803941


### Content Quality Scoring

In [25]:
#  Quality Scoring Model

# Step 1: Label assignment (synthetic rule-based)
def quality_label(row):
    """Assign the synthetic, rule-based quality label for one page.

    ``row`` is indexed by ``'word_count'`` and ``'flesch_reading_ease'``.
    Returns "High Quality" for more than 1500 words with readability in
    [50, 70]; otherwise "Low Quality" for fewer than 500 words or readability
    below 30; otherwise "Medium Quality".
    """
    words = row['word_count']
    ease = row['flesch_reading_ease']

    if words > 1500 and 50 <= ease <= 70:
        return "High Quality"
    if words < 500 or ease < 30:
        return "Low Quality"
    return "Medium Quality"

# Label every page with the rule above; these synthetic labels are the target.
extracted_df['label'] = extracted_df.apply(quality_label, axis=1)

# Features and target for the classifier
X = extracted_df[['word_count', 'sentence_count', 'flesch_reading_ease']]
y = extracted_df['label']

# 70/30 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the random forest and predict the held-out pages
model = RandomForestClassifier(random_state=42, n_estimators=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report held-out performance
print("Model Performance:")
print(classification_report(y_test, y_pred, digits=2))
overall_accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Overall Accuracy:", round(overall_accuracy, 2))
print("F1 (weighted):", round(weighted_f1, 2))

# Step 6: Baseline model (simple rule-based on word count)
def baseline_rule(wc):
    """Baseline label from word count alone: over 1500 is high, under 500 is
    low, anything in between is medium."""
    if wc > 1500:
        return "High Quality"
    if wc < 500:
        return "Low Quality"
    return "Medium Quality"

# Score the word-count-only baseline on the same held-out pages.
baseline_pred = X_test['word_count'].map(baseline_rule)
baseline_acc = accuracy_score(y_test, baseline_pred)
print("Baseline Accuracy:", round(baseline_acc, 2))

# Feature importances, largest first
importances = pd.Series(
    model.feature_importances_, index=X.columns
).sort_values(ascending=False)
print("\nTop Features:")
rank = 0
for feature, imp in importances.items():
    rank += 1
    print(f"{rank}. {feature} (importance: {imp:.2f})")

# Persist the trained model, creating the models directory if needed.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_PATH)
print(f"\n‚úÖ Model saved to {MODEL_PATH}")


Model Performance:
                precision    recall  f1-score   support

  High Quality       0.50      0.50      0.50         2
   Low Quality       1.00      0.93      0.97        15
Medium Quality       0.78      0.88      0.82         8

      accuracy                           0.88        25
     macro avg       0.76      0.77      0.76        25
  weighted avg       0.89      0.88      0.88        25

Overall Accuracy: 0.88
F1 (weighted): 0.88
Baseline Accuracy: 0.52

Top Features:
1. flesch_reading_ease (importance: 0.38)
2. word_count (importance: 0.35)
3. sentence_count (importance: 0.27)

‚úÖ Model saved to C:\Users\aishw\seo-content-detector\models\quality_model.pkl


### Real-Time Analysis Demo

In [26]:
#  Real-Time URL Analysis
import requests
from sklearn.metrics.pairwise import cosine_similarity

# Request headers sent with every fetch (identifies this tool's user agent).
HEADERS = {"User-Agent": "seo-content-detector-bot/1.0"}

def scrape_html(url):
    """Fetch ``url`` with a 10-second timeout and return the response body.

    Returns an empty string when the status code is not 200 or when anything
    raises during the request.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        return response.text if response.status_code == 200 else ""
    except Exception:
        return ""

def compute_embedding(text):
    """Embed one text in the same vector space as the corpus ``embeddings``.

    With sentence-transformers available (``SENTE`` is true) the text is
    encoded with the 'all-MiniLM-L6-v2' model; otherwise it is projected with
    the fitted TF-IDF ``vectorizer``.

    The global name ``model`` cannot be used for encoding here: the
    quality-scoring cell rebinds it to the RandomForestClassifier, which has no
    ``encode`` method. The encoder is therefore loaded once on first use and
    cached on this function.

    Args:
        text: The (already cleaned) text to embed.

    Returns:
        A 1-D embedding vector for ``text``.
    """
    if SENTE:
        encoder = getattr(compute_embedding, "_encoder", None)
        if encoder is None:
            # Must match the model name used to build the corpus embeddings.
            encoder = SentenceTransformer('all-MiniLM-L6-v2')
            compute_embedding._encoder = encoder
        return encoder.encode([text])[0]
    return vectorizer.transform([text]).toarray()[0]

def analyze_url(url, sim_threshold=0.75):
    """Scrape one URL and report its content features, quality label and near matches.

    Args:
        url: Page to fetch and analyse.
        sim_threshold: Corpus pages whose cosine similarity to this page is
            strictly greater than this value are listed in ``similar_to``.
            Defaults to 0.75, the cutoff this function has always used.

    Returns:
        dict with keys ``url``, ``word_count``, ``sentence_count``,
        ``readability``, ``is_thin``, ``quality_label`` and ``similar_to``
        (a list of ``{"url", "similarity"}`` dicts).

    A page that cannot be fetched or parsed is analysed as empty text, because
    ``scrape_html`` and ``extract_from_html`` return empty values on failure.
    """
    html = scrape_html(url)
    parsed = extract_from_html(html)
    clean = clean_text(parsed['body_text'])
    wc = parsed['word_count']
    sc = sentence_count(clean)
    # Match the feature-engineering cell, which records 0 for empty text;
    # the classifier was trained on features built that way.
    fr = textstat.flesch_reading_ease(clean) if clean else 0
    is_thin = wc < 500

    emb = compute_embedding(clean)
    sims = cosine_similarity([emb], embeddings)[0]
    # Indices from np.where are positions into `embeddings`, so look the URL up
    # positionally rather than by index label.
    corpus_urls = extracted_df['url']
    similar_pages = [
        {"url": corpus_urls.iloc[i], "similarity": float(sims[i])}
        for i in np.where(sims > sim_threshold)[0]
    ]

    # Column names must match the ones the classifier was fitted on.
    label = model.predict(pd.DataFrame([{
        "word_count": wc, "sentence_count": sc, "flesch_reading_ease": fr
    }]))[0]

    return {
        "url": url,
        "word_count": wc,
        "sentence_count": sc,
        "readability": fr,
        "is_thin": is_thin,
        "quality_label": label,
        "similar_to": similar_pages
    }

# Demo: analyse one live page and pretty-print the report as JSON.
result = analyze_url("https://en.wikipedia.org/wiki/Main_Page")
report_json = json.dumps(result, indent=2)
print(report_json)

{
  "url": "https://en.wikipedia.org/wiki/Main_Page",
  "word_count": 1971,
  "sentence_count": 53,
  "readability": 35.344451570758395,
  "is_thin": false,
  "quality_label": "Medium Quality",
  "similar_to": []
}
