In [1]:
#1
# ----------------------------------------------
# SEO Content Quality & Duplicate Detector
# Author: [Your Name]
# Company: LeadWalnut Screening Assignment
# ----------------------------------------------

import os
import re
import json
import time
import joblib
import textstat
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer

# Ensure necessary folders exist
os.makedirs("../data", exist_ok=True)
os.makedirs("../models", exist_ok=True)

print("✅ Environment ready!")


✅ Environment ready!


In [2]:
# Load the dataset
data_path = "../data/data.csv"

df = pd.read_csv(data_path)
print("Data loaded successfully!")
df.head()


Data loaded successfully!


Unnamed: 0,url,html_content
0,https://www.cm-alliance.com/cybersecurity-blog,"<!doctype html><!--[if lt IE 7]> <html class=""..."
1,https://www.varonis.com/blog/cybersecurity-tips,"<!doctype html><html lang=""en""><head>\n <me..."
2,https://www.cisecurity.org/insights/blog/11-cy...,<!DOCTYPE html><html data-unhead-vue-server-re...
3,https://www.cisa.gov/topics/cybersecurity-best...,"\n\n<!DOCTYPE html>\n<html lang=""en"" dir=""ltr""..."
4,https://www.qnbtrust.bank/Resources/Learning-C...,


In [3]:
#3
def extract_text_from_html(html):
    """Extract clean text and title from HTML content."""
    try:
        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.string.strip() if soup.title else "Untitled"
        for script in soup(["script", "style", "noscript"]):
            script.decompose()
        text = " ".join(p.get_text() for p in soup.find_all(["p", "article", "main"]))
        text = re.sub(r"\s+", " ", text).strip()
        return title, text
    except Exception as e:
        return "ParseError", ""

def scrape_url(url):
    """Fetch HTML content from a URL with basic error handling."""
    headers = {"User-Agent": "Mozilla/5.0 (LeadWalnut Assignment)"}
    try:
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            return res.text
        else:
            return ""
    except Exception:
        return ""

# If html_content not present, scrape
if "html_content" not in df.columns:
    print("No html_content found — scraping webpages...")
    df["html_content"] = [scrape_url(u) for u in tqdm(df["url"])]

# Extract text and titles
titles, bodies, word_counts = [], [], []
for html in tqdm(df["html_content"], desc="Parsing HTML"):
    title, text = extract_text_from_html(html)
    titles.append(title)
    bodies.append(text)
    word_counts.append(len(text.split()))

df["title"] = titles
df["body_text"] = bodies
df["word_count"] = word_counts

# Save extracted content
extracted_path = "../data/extracted_content.csv"
df[["url", "title", "body_text", "word_count"]].to_csv(extracted_path, index=False)
print(f"✅ Extracted content saved to {extracted_path}")
df.head()


Parsing HTML: 100%|████████████████████████████████████████████████████████████████████| 81/81 [00:08<00:00,  9.05it/s]

✅ Extracted content saved to ../data/extracted_content.csv





Unnamed: 0,url,html_content,title,body_text,word_count
0,https://www.cm-alliance.com/cybersecurity-blog,"<!doctype html><!--[if lt IE 7]> <html class=""...",Cyber Security Blog,Cyber Crisis Tabletop Exercise Cyber Security ...,326
1,https://www.varonis.com/blog/cybersecurity-tips,"<!doctype html><html lang=""en""><head>\n <me...",Top 10 Cybersecurity Awareness Tips: How to St...,The #1 Data Security Platform WHERE TO BUY CAP...,5436
2,https://www.cisecurity.org/insights/blog/11-cy...,<!DOCTYPE html><html data-unhead-vue-server-re...,11 Cyber Defense Tips to Stay Secure at Work a...,HomeInsightsBlog Posts11 Cyber Defense Tips to...,2007
3,https://www.cisa.gov/topics/cybersecurity-best...,"\n\n<!DOCTYPE html>\n<html lang=""en"" dir=""ltr""...",Cybersecurity Best Practices | Cybersecurity a...,An official website of the United States gover...,1426
4,https://www.qnbtrust.bank/Resources/Learning-C...,,ParseError,,0


In [4]:
#4
from nltk.tokenize import sent_tokenize
import nltk
nltk.download("punkt")

def compute_features(text):
    """Compute sentence count and readability for a given text."""
    if not text.strip():
        return 0, 0
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)
    try:
        readability = textstat.flesch_reading_ease(text)
    except Exception:
        readability = 0
    return sentence_count, readability

df["sentence_count"], df["flesch_reading_ease"] = zip(*df["body_text"].apply(compute_features))

# TF-IDF keyword extraction
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df["body_text"])
feature_names = np.array(vectorizer.get_feature_names_out())

def get_top_keywords(row, top_n=5):
    vector = tfidf_matrix[row].toarray().flatten()
    top_indices = vector.argsort()[-top_n:][::-1]
    return "|".join(feature_names[top_indices])

df["top_keywords"] = [get_top_keywords(i) for i in range(len(df))]

features_path = "../data/features.csv"
df[["url", "word_count", "sentence_count", "flesch_reading_ease", "top_keywords"]].to_csv(features_path, index=False)
print(f"✅ Features saved to {features_path}")
df.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NITHYA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Features saved to ../data/features.csv


Unnamed: 0,url,html_content,title,body_text,word_count,sentence_count,flesch_reading_ease,top_keywords
0,https://www.cm-alliance.com/cybersecurity-blog,"<!doctype html><!--[if lt IE 7]> <html class=""...",Cyber Security Blog,Cyber Crisis Tabletop Exercise Cyber Security ...,326,6,-6.816181,cyber|alliance|cybersecurity|training|consultancy
1,https://www.varonis.com/blog/cybersecurity-tips,"<!doctype html><html lang=""en""><head>\n <me...",Top 10 Cybersecurity Awareness Tips: How to St...,The #1 Data Security Platform WHERE TO BUY CAP...,5436,298,39.226772,varonis|data|security|access|app
2,https://www.cisecurity.org/insights/blog/11-cy...,<!DOCTYPE html><html data-unhead-vue-server-re...,11 Cyber Defense Tips to Stay Secure at Work a...,HomeInsightsBlog Posts11 Cyber Defense Tips to...,2007,134,53.035066,password|passphrase|authentication|don|protect
3,https://www.cisa.gov/topics/cybersecurity-best...,"\n\n<!DOCTYPE html>\n<html lang=""en"" dir=""ltr""...",Cybersecurity Best Practices | Cybersecurity a...,An official website of the United States gover...,1426,52,4.82975,cisa|cybersecurity|cyber|practices|nation
4,https://www.qnbtrust.bank/Resources/Learning-C...,,ParseError,,0,0,0.0,जप|ztna|zscaler|zoom|zishing


In [5]:
#5
try:
    model = SentenceTransformer("all-MiniLM-L6-v2")
    print("✅ Using SentenceTransformer embeddings.")
    embeddings = model.encode(df["body_text"].tolist(), show_progress_bar=True)
except Exception:
    print("⚠️ Falling back to TF-IDF embeddings.")
    embeddings = tfidf_matrix.toarray()

np.save("../data/embeddings.npy", embeddings)

# Compute similarity matrix
similarity_matrix = cosine_similarity(embeddings)
threshold = 0.80

duplicate_pairs = []
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        if similarity_matrix[i, j] > threshold:
            duplicate_pairs.append((df.iloc[i]["url"], df.iloc[j]["url"], round(similarity_matrix[i, j], 2)))

dup_df = pd.DataFrame(duplicate_pairs, columns=["url1", "url2", "similarity"])
dup_df.to_csv("../data/duplicates.csv", index=False)

# Thin content detection
df["is_thin"] = df["word_count"] < 500

print(f"✅ Duplicate pairs found: {len(dup_df)}")
print(f"✅ Thin content pages: {df['is_thin'].sum()}")
dup_df.head()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Using SentenceTransformer embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Duplicate pairs found: 85
✅ Thin content pages: 26


Unnamed: 0,url1,url2,similarity
0,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.connectwise.com/blog/phishing-prev...,1.0
1,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.hpe.com/us/en/what-is/sd-wan.html,1.0
2,https://www.qnbtrust.bank/Resources/Learning-C...,https://remotedesktop.google.com/,1.0
3,https://www.qnbtrust.bank/Resources/Learning-C...,https://support.microsoft.com/en-us/windows/ho...,1.0
4,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.cloudflare.com/learning/access-man...,1.0


In [6]:
#6
def label_quality(row):
    if row["word_count"] > 1500 and 50 <= row["flesch_reading_ease"] <= 70:
        return "High"
    elif row["word_count"] < 500 or row["flesch_reading_ease"] < 30:
        return "Low"
    else:
        return "Medium"

df["quality_label"] = df.apply(label_quality, axis=1)

X = df[["word_count", "sentence_count", "flesch_reading_ease"]]
y = df["quality_label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("✅ Model trained!")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

joblib.dump(clf, "../models/quality_model.pkl")
print("✅ Model saved to ../models/quality_model.pkl")


✅ Model trained!
              precision    recall  f1-score   support

        High       1.00      1.00      1.00         3
         Low       0.86      1.00      0.92        12
      Medium       1.00      0.80      0.89        10

    accuracy                           0.92        25
   macro avg       0.95      0.93      0.94        25
weighted avg       0.93      0.92      0.92        25

Confusion Matrix:
 [[ 3  0  0]
 [ 0 12  0]
 [ 0  2  8]]
✅ Model saved to ../models/quality_model.pkl


In [7]:
#7
def analyze_url(url):
    """Scrape and evaluate any new URL in real-time."""
    html = scrape_url(url)
    title, text = extract_text_from_html(html)
    word_count = len(text.split())
    sentence_count, readability = compute_features(text)
    is_thin = word_count < 500
    features = np.array([[word_count, sentence_count, readability]])
    model = joblib.load("../models/quality_model.pkl")
    quality_label = model.predict(features)[0]

    # Find similar URLs
    if "embeddings" in locals():
        new_emb = model.encode([text]) if hasattr(model, "encode") else vectorizer.transform([text]).toarray()
        sim_scores = cosine_similarity(new_emb, embeddings).flatten()
        similar_indices = np.where(sim_scores > 0.75)[0]
        similar_urls = [
            {"url": df.iloc[i]["url"], "similarity": round(float(sim_scores[i]), 2)}
            for i in similar_indices
        ]
    else:
        similar_urls = []

    return {
        "url": url,
        "word_count": word_count,
        "readability": readability,
        "quality_label": quality_label,
        "is_thin": is_thin,
        "similar_to": similar_urls[:5],
    }

# Example test
result = analyze_url("https://example.com")
print(json.dumps(result, indent=2))


{
  "url": "https://example.com",
  "word_count": 18,
  "readability": 45.64500000000001,
  "quality_label": "Low",
  "is_thin": true,
  "similar_to": []
}




In [11]:
readme = """# SEO Content Quality & Duplicate Detector

## Overview
A complete NLP pipeline to evaluate webpage SEO quality and detect duplicates.

## How to Run
1. Clone the repository:
   git clone https://github.com/yourusername/seo-content-detector
   cd seo-content-detector

2. Install dependencies:
   pip install -r requirements.txt

3. Launch the notebook:
   jupyter notebook notebooks/seo_pipeline.ipynb

## Outputs
- data/extracted_content.csv
- data/features.csv
- data/duplicates.csv
- models/quality_model.pkl

## Key Results
- Duplicate threshold: 0.8
- Thin content threshold: 500 words
- Classifier: RandomForest
"""

with open("../README.md", "w", encoding="utf-8") as f:
    f.write(readme)

print("✅ README.md generated successfully.")


✅ README.md generated successfully.
