<a href="https://colab.research.google.com/github/Yash26-hub/Cybersecurity-NLP-Analysis-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import argparse
import logging
import os
import re
import sys
from typing import List

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Optional: use nltk for stopwords and lemmatization.
# USE_NLTK stays True only if nltk imports and its data can actually be read;
# otherwise the script falls back to scikit-learn stopwords and no lemmatizing.
USE_NLTK = True
if USE_NLTK:
    try:
        import nltk
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer

        # Ensure required data is available. Constructing WordNetLemmatizer
        # does not read the WordNet corpus (it is loaded lazily on first use),
        # so probe with a real lemmatize() call to surface a missing corpus
        # here instead of on the first document.
        try:
            _ = stopwords.words("english")
            _ = WordNetLemmatizer().lemmatize("tests")
        except LookupError:
            nltk.download("punkt", quiet=True)
            nltk.download("wordnet", quiet=True)
            nltk.download("omw-1.4", quiet=True)
            nltk.download("stopwords", quiet=True)
            # Re-check: a failed download leaves the data missing, and the
            # resulting LookupError is handled by the outer except below.
            _ = stopwords.words("english")
            _ = WordNetLemmatizer().lemmatize("tests")
    except Exception as exc:
        # Deliberately broad: nltk is optional, so any failure (not installed,
        # data missing, download blocked) just disables it. Say so once rather
        # than degrading silently.
        logging.getLogger("analysis").warning(
            "NLTK unavailable (%s); continuing without NLTK stopwords/lemmatization", exc
        )
        USE_NLTK = False

# Fallback stopwords
from sklearn.feature_extraction import text as sklearn_text

# Base stopword set comes from scikit-learn; NLTK's English list is merged in
# when NLTK was set up successfully above.
DEFAULT_STOPWORDS = {*sklearn_text.ENGLISH_STOP_WORDS}
if USE_NLTK:
    try:
        DEFAULT_STOPWORDS.update(stopwords.words("english"))
    except Exception:
        # Best effort: keep the scikit-learn list if NLTK's cannot be read.
        pass

# Logger for this script, writing timestamped lines to stdout.
#
# logging.getLogger returns the same object on every call, so re-running this
# block (e.g. re-executing a notebook cell) used to stack one more handler per
# run and print every message several times. The setup below is idempotent.
logger = logging.getLogger("analysis")
logger.setLevel(logging.INFO)

# Remove handlers left over from a previous execution of this block.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)

handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)

# Keep records out of the root logger; when the host environment has its own
# root handler, each message would otherwise be emitted a second time.
logger.propagate = False


def clean_text(text: str, lemmatize: bool = True) -> str:
    """Normalize raw text into a space-separated string of content tokens.

    Non-string input yields ``""``. Otherwise the text is lowercased, URLs and
    e-mail-like tokens are blanked out, every character other than a-z or
    whitespace becomes a space, and whitespace runs are collapsed. Tokens of
    length 1 and tokens found in ``DEFAULT_STOPWORDS`` are dropped.

    When ``lemmatize`` is true and ``USE_NLTK`` is set, the remaining tokens
    are passed through ``WordNetLemmatizer``; if that fails for any reason the
    unlemmatized tokens are returned instead.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()
    # Order matters: URLs and e-mails must go before punctuation is stripped,
    # otherwise their fragments would survive as ordinary words.
    for pattern in (r"http\S+|www\.\S+", r"\S+@\S+", r"[^a-z\s]"):
        cleaned = re.sub(pattern, " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    kept = []
    for word in cleaned.split():
        if len(word) > 1 and word not in DEFAULT_STOPWORDS:
            kept.append(word)

    if lemmatize and USE_NLTK:
        try:
            wnl = WordNetLemmatizer()
            kept = list(map(wnl.lemmatize, kept))
        except Exception:
            # Best effort: fall through with the unlemmatized tokens.
            pass

    return " ".join(kept)


def build_corpus(df: pd.DataFrame, text_columns: List[str]) -> pd.Series:
    """Join the requested text columns of ``df`` into one string per row.

    Args:
        df: Source table. It is not modified.
        text_columns: Column names to combine. Names absent from ``df`` are
            ignored. A single column name given as a plain string is also
            accepted, and ``None`` is treated as an empty list.

    Returns:
        A Series aligned with ``df``'s index in which each value is the
        space-joined string form of the selected columns, with missing
        values replaced by empty strings.

    Raises:
        ValueError: If none of the requested columns exist in ``df``.
    """
    # A bare string would otherwise be iterated character by character, and
    # None would raise TypeError instead of the documented ValueError.
    if text_columns is None:
        text_columns = []
    elif isinstance(text_columns, str):
        text_columns = [text_columns]

    present_cols = [c for c in text_columns if c in df.columns]
    if not present_cols:
        raise ValueError(f"None of the specified text columns are present. Available columns: {df.columns.tolist()}")

    # fillna returns a new frame, so only the selected columns are copied and
    # the caller's DataFrame is left untouched.
    selected = df[present_cols].fillna("")
    return selected.astype(str).agg(" ".join, axis=1)


def main(args):
    """Run the pipeline: load the CSV, clean text, fit LDA and KMeans, save output.

    ``args`` is the argparse namespace built in the ``__main__`` block. The
    attributes read here are ``input``, ``text_columns``, ``lemmatize``,
    ``count_features``, ``tfidf_features``, ``ngram_range``, ``lda_topics``,
    ``top_words``, ``clusters``, ``output`` and ``save_models``.

    The process exits with status 1 when the input file does not exist, when
    none of the text columns are present, or when every row is empty after
    cleaning. On success the filtered rows, plus ``clean_corpus`` and
    ``cluster`` columns, are written to ``args.output``.
    """
    source_path = args.input
    if not os.path.isfile(source_path):
        logger.error("Input file not found: %s", source_path)
        sys.exit(1)

    logger.info("Loading CSV: %s", source_path)
    df = pd.read_csv(source_path, low_memory=False)

    logger.info("Building corpus from columns: %s", args.text_columns)
    try:
        corpus_raw = build_corpus(df, args.text_columns)
    except ValueError as err:
        logger.error(str(err))
        sys.exit(1)

    logger.info("Cleaning text (lemmatize=%s)...", args.lemmatize)

    def _clean(doc):
        return clean_text(doc, lemmatize=args.lemmatize)

    cleaned = corpus_raw.apply(_clean)
    keep_rows = cleaned.str.strip().astype(bool)
    if not keep_rows.any():
        logger.error("No non-empty text after cleaning. Exiting.")
        sys.exit(1)

    # Keep only rows that still have text; .values sidesteps index alignment.
    df = df.loc[keep_rows].copy()
    df["clean_corpus"] = cleaned.loc[keep_rows].values

    # --- Topic modelling: raw term counts feed LDA ---
    logger.info("Vectorizing for LDA: max_features=%d, ngram_range=%s", args.count_features, args.ngram_range)
    ngrams = tuple(args.ngram_range)
    count_vect = CountVectorizer(max_features=args.count_features, ngram_range=ngrams)
    counts = count_vect.fit_transform(df["clean_corpus"])

    logger.info("Fitting LDA with %d topics", args.lda_topics)
    lda = LatentDirichletAllocation(
        n_components=args.lda_topics,
        random_state=42,
        learning_method="batch",
        max_iter=10,
    )
    lda.fit(counts)

    # Report the highest-weighted vocabulary entries of each topic.
    vocab = count_vect.get_feature_names_out()
    logger.info("--- TOP WORDS PER LDA TOPIC ---")
    n_top = args.top_words
    for topic_no, weights in enumerate(lda.components_, start=1):
        ranked = weights.argsort()[::-1][:n_top]
        logger.info("Topic %d: %s", topic_no, ", ".join(vocab[i] for i in ranked))

    # --- Clustering: TF-IDF weights feed KMeans ---
    logger.info("Vectorizing for clustering (TF-IDF): max_features=%d, ngram_range=%s", args.tfidf_features, args.ngram_range)
    tfidf = TfidfVectorizer(max_features=args.tfidf_features, ngram_range=ngrams, stop_words=None)
    tfidf_matrix = tfidf.fit_transform(df["clean_corpus"])

    logger.info("Fitting KMeans with %d clusters", args.clusters)
    kmeans = KMeans(n_clusters=args.clusters, random_state=42, n_init=10)
    df["cluster"] = kmeans.fit_predict(tfidf_matrix)

    logger.info("\n--- K-MEANS CLUSTER SIZES ---")
    for label, size in df["cluster"].value_counts().sort_index().items():
        logger.info("Cluster %s: %d rows", label, size)

    # --- Persist results ---
    out_path = args.output
    logger.info("Saving results to %s", out_path)
    df.to_csv(out_path, index=False)

    if args.save_models:
        # Model files sit next to the output CSV, sharing its name minus extension.
        base, _ext = os.path.splitext(out_path)
        logger.info("Saving vectorizers and models to %s_*.joblib", base)
        artifacts = (
            ("_count_vectorizer.joblib", count_vect),
            ("_tfidf_vectorizer.joblib", tfidf),
            ("_lda_model.joblib", lda),
            ("_kmeans_model.joblib", kmeans),
        )
        for suffix, fitted in artifacts:
            joblib.dump(fitted, base + suffix)

    logger.info("Success! Results saved to %s", out_path)


# Script entry point: declare the command-line options and run the pipeline.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="NLP analysis for cybersecurity dataset")
    parser.add_argument("--input", "-i", required=True, help="Input CSV file path")
    parser.add_argument(
        "--text-columns",
        "-c",
        # "+" collects one or more names into a list, matching the list
        # default. With "?" a supplied value arrived as a single str and a
        # bare "-c" produced None, neither of which is a list of columns.
        nargs="+",
        default=["Attack Type", "Target Industry", "Attack Source", "Security Vulnerability Type"],
        help="Columns to combine into the corpus (default: 4 columns used previously)",
    )
    parser.add_argument("--output", "-o", default="Analysis_Results.csv", help="Output CSV path")
    parser.add_argument("--tfidf-features", type=int, default=1000, help="Max features for TF-IDF vectorizer")
    parser.add_argument("--count-features", type=int, default=1000, help="Max features for CountVectorizer (LDA)")
    parser.add_argument("--lda-topics", type=int, default=6, help="Number of LDA topics")
    parser.add_argument("--clusters", type=int, default=6, help="Number of KMeans clusters")
    parser.add_argument("--top-words", type=int, default=10, help="Top words to display per topic")
    parser.add_argument("--save-models", action="store_true", help="Save vectorizers and models as joblib files")
    parser.add_argument("--lemmatize", action="store_true", help="Apply lemmatization (requires NLTK)")
    parser.add_argument("--ngram-range", nargs=2, type=int, default=[1, 2], help="Ngram range (two ints, e.g. 1 2)")

    # For Colab execution, provide mock arguments to argparse.
    # NOTE: because an explicit list is passed, the real command line is
    # ignored; every option other than --input takes its default.
    args = parser.parse_args(["--input", "Global_Cybersecurity_Threats_2015-2024 (1).csv"])

    main(args)


2026-01-27 15:57:44,114 INFO Loading CSV: Global_Cybersecurity_Threats_2015-2024 (1).csv
2026-01-27 15:57:44,114 INFO Loading CSV: Global_Cybersecurity_Threats_2015-2024 (1).csv
2026-01-27 15:57:44,114 INFO Loading CSV: Global_Cybersecurity_Threats_2015-2024 (1).csv
2026-01-27 15:57:44,114 INFO Loading CSV: Global_Cybersecurity_Threats_2015-2024 (1).csv


INFO:analysis:Loading CSV: Global_Cybersecurity_Threats_2015-2024 (1).csv


2026-01-27 15:57:44,132 INFO Building corpus from columns: ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']
2026-01-27 15:57:44,132 INFO Building corpus from columns: ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']
2026-01-27 15:57:44,132 INFO Building corpus from columns: ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']
2026-01-27 15:57:44,132 INFO Building corpus from columns: ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']


INFO:analysis:Building corpus from columns: ['Attack Type', 'Target Industry', 'Attack Source', 'Security Vulnerability Type']


2026-01-27 15:57:44,161 INFO Cleaning text (lemmatize=False)...
2026-01-27 15:57:44,161 INFO Cleaning text (lemmatize=False)...
2026-01-27 15:57:44,161 INFO Cleaning text (lemmatize=False)...
2026-01-27 15:57:44,161 INFO Cleaning text (lemmatize=False)...


INFO:analysis:Cleaning text (lemmatize=False)...


2026-01-27 15:57:44,204 INFO Vectorizing for LDA: max_features=1000, ngram_range=[1, 2]
2026-01-27 15:57:44,204 INFO Vectorizing for LDA: max_features=1000, ngram_range=[1, 2]
2026-01-27 15:57:44,204 INFO Vectorizing for LDA: max_features=1000, ngram_range=[1, 2]
2026-01-27 15:57:44,204 INFO Vectorizing for LDA: max_features=1000, ngram_range=[1, 2]


INFO:analysis:Vectorizing for LDA: max_features=1000, ngram_range=[1, 2]


2026-01-27 15:57:44,239 INFO Fitting LDA with 6 topics
2026-01-27 15:57:44,239 INFO Fitting LDA with 6 topics
2026-01-27 15:57:44,239 INFO Fitting LDA with 6 topics
2026-01-27 15:57:44,239 INFO Fitting LDA with 6 topics


INFO:analysis:Fitting LDA with 6 topics


2026-01-27 15:57:51,789 INFO --- TOP WORDS PER LDA TOPIC ---
2026-01-27 15:57:51,789 INFO --- TOP WORDS PER LDA TOPIC ---
2026-01-27 15:57:51,789 INFO --- TOP WORDS PER LDA TOPIC ---
2026-01-27 15:57:51,789 INFO --- TOP WORDS PER LDA TOPIC ---


INFO:analysis:--- TOP WORDS PER LDA TOPIC ---


2026-01-27 15:57:51,794 INFO Topic 1: unknown, zero, zero day, day, hacker group, hacker, group, middle, man middle, man
2026-01-27 15:57:51,794 INFO Topic 1: unknown, zero, zero day, day, hacker group, hacker, group, middle, man middle, man
2026-01-27 15:57:51,794 INFO Topic 1: unknown, zero, zero day, day, hacker group, hacker, group, middle, man middle, man
2026-01-27 15:57:51,794 INFO Topic 1: unknown, zero, zero day, day, hacker group, hacker, group, middle, man middle, man


INFO:analysis:Topic 1: unknown, zero, zero day, day, hacker group, hacker, group, middle, man middle, man


2026-01-27 15:57:51,810 INFO Topic 2: state, nation state, nation, state zero, state weak, sql, sql injection, injection, zero, zero day
2026-01-27 15:57:51,810 INFO Topic 2: state, nation state, nation, state zero, state weak, sql, sql injection, injection, zero, zero day
2026-01-27 15:57:51,810 INFO Topic 2: state, nation state, nation, state zero, state weak, sql, sql injection, injection, zero, zero day
2026-01-27 15:57:51,810 INFO Topic 2: state, nation state, nation, state zero, state weak, sql, sql injection, injection, zero, zero day


INFO:analysis:Topic 2: state, nation state, nation, state zero, state weak, sql, sql injection, injection, zero, zero day


2026-01-27 15:57:51,825 INFO Topic 3: engineering, social, social engineering, group social, state social, insider social, unknown social, group, hacker, hacker group
2026-01-27 15:57:51,825 INFO Topic 3: engineering, social, social engineering, group social, state social, insider social, unknown social, group, hacker, hacker group
2026-01-27 15:57:51,825 INFO Topic 3: engineering, social, social engineering, group social, state social, insider social, unknown social, group, hacker, hacker group
2026-01-27 15:57:51,825 INFO Topic 3: engineering, social, social engineering, group social, state social, insider social, unknown social, group, hacker, hacker group


INFO:analysis:Topic 3: engineering, social, social engineering, group social, state social, insider social, unknown social, group, hacker, hacker group


2026-01-27 15:57:51,840 INFO Topic 4: retail, day, zero day, zero, insider, insider zero, middle, man middle, man, retail unknown
2026-01-27 15:57:51,840 INFO Topic 4: retail, day, zero day, zero, insider, insider zero, middle, man middle, man, retail unknown
2026-01-27 15:57:51,840 INFO Topic 4: retail, day, zero day, zero, insider, insider zero, middle, man middle, man, retail unknown
2026-01-27 15:57:51,840 INFO Topic 4: retail, day, zero day, zero, insider, insider zero, middle, man middle, man, retail unknown


INFO:analysis:Topic 4: retail, day, zero day, zero, insider, insider zero, middle, man middle, man, retail unknown


2026-01-27 15:57:51,852 INFO Topic 5: software, unpatched, unpatched software, telecommunications, insider unpatched, unknown, unknown unpatched, insider, group unpatched, ransomware
2026-01-27 15:57:51,852 INFO Topic 5: software, unpatched, unpatched software, telecommunications, insider unpatched, unknown, unknown unpatched, insider, group unpatched, ransomware
2026-01-27 15:57:51,852 INFO Topic 5: software, unpatched, unpatched software, telecommunications, insider unpatched, unknown, unknown unpatched, insider, group unpatched, ransomware
2026-01-27 15:57:51,852 INFO Topic 5: software, unpatched, unpatched software, telecommunications, insider unpatched, unknown, unknown unpatched, insider, group unpatched, ransomware


INFO:analysis:Topic 5: software, unpatched, unpatched software, telecommunications, insider unpatched, unknown, unknown unpatched, insider, group unpatched, ransomware


2026-01-27 15:57:51,871 INFO Topic 6: weak passwords, weak, passwords, insider, unknown weak, insider weak, government, unknown, group weak, ddos
2026-01-27 15:57:51,871 INFO Topic 6: weak passwords, weak, passwords, insider, unknown weak, insider weak, government, unknown, group weak, ddos
2026-01-27 15:57:51,871 INFO Topic 6: weak passwords, weak, passwords, insider, unknown weak, insider weak, government, unknown, group weak, ddos
2026-01-27 15:57:51,871 INFO Topic 6: weak passwords, weak, passwords, insider, unknown weak, insider weak, government, unknown, group weak, ddos


INFO:analysis:Topic 6: weak passwords, weak, passwords, insider, unknown weak, insider weak, government, unknown, group weak, ddos


2026-01-27 15:57:51,878 INFO Vectorizing for clustering (TF-IDF): max_features=1000, ngram_range=[1, 2]
2026-01-27 15:57:51,878 INFO Vectorizing for clustering (TF-IDF): max_features=1000, ngram_range=[1, 2]
2026-01-27 15:57:51,878 INFO Vectorizing for clustering (TF-IDF): max_features=1000, ngram_range=[1, 2]
2026-01-27 15:57:51,878 INFO Vectorizing for clustering (TF-IDF): max_features=1000, ngram_range=[1, 2]


INFO:analysis:Vectorizing for clustering (TF-IDF): max_features=1000, ngram_range=[1, 2]


2026-01-27 15:57:51,977 INFO Fitting KMeans with 6 clusters
2026-01-27 15:57:51,977 INFO Fitting KMeans with 6 clusters
2026-01-27 15:57:51,977 INFO Fitting KMeans with 6 clusters
2026-01-27 15:57:51,977 INFO Fitting KMeans with 6 clusters


INFO:analysis:Fitting KMeans with 6 clusters


2026-01-27 15:57:52,159 INFO 
--- K-MEANS CLUSTER SIZES ---
2026-01-27 15:57:52,159 INFO 
--- K-MEANS CLUSTER SIZES ---
2026-01-27 15:57:52,159 INFO 
--- K-MEANS CLUSTER SIZES ---
2026-01-27 15:57:52,159 INFO 
--- K-MEANS CLUSTER SIZES ---


INFO:analysis:
--- K-MEANS CLUSTER SIZES ---


2026-01-27 15:57:52,169 INFO Cluster 0: 566 rows
2026-01-27 15:57:52,169 INFO Cluster 0: 566 rows
2026-01-27 15:57:52,169 INFO Cluster 0: 566 rows
2026-01-27 15:57:52,169 INFO Cluster 0: 566 rows


INFO:analysis:Cluster 0: 566 rows


2026-01-27 15:57:52,173 INFO Cluster 1: 173 rows
2026-01-27 15:57:52,173 INFO Cluster 1: 173 rows
2026-01-27 15:57:52,173 INFO Cluster 1: 173 rows
2026-01-27 15:57:52,173 INFO Cluster 1: 173 rows


INFO:analysis:Cluster 1: 173 rows


2026-01-27 15:57:52,176 INFO Cluster 2: 513 rows
2026-01-27 15:57:52,176 INFO Cluster 2: 513 rows
2026-01-27 15:57:52,176 INFO Cluster 2: 513 rows
2026-01-27 15:57:52,176 INFO Cluster 2: 513 rows


INFO:analysis:Cluster 2: 513 rows


2026-01-27 15:57:52,180 INFO Cluster 3: 579 rows
2026-01-27 15:57:52,180 INFO Cluster 3: 579 rows
2026-01-27 15:57:52,180 INFO Cluster 3: 579 rows
2026-01-27 15:57:52,180 INFO Cluster 3: 579 rows


INFO:analysis:Cluster 3: 579 rows


2026-01-27 15:57:52,196 INFO Cluster 4: 557 rows
2026-01-27 15:57:52,196 INFO Cluster 4: 557 rows
2026-01-27 15:57:52,196 INFO Cluster 4: 557 rows
2026-01-27 15:57:52,196 INFO Cluster 4: 557 rows


INFO:analysis:Cluster 4: 557 rows


2026-01-27 15:57:52,202 INFO Cluster 5: 612 rows
2026-01-27 15:57:52,202 INFO Cluster 5: 612 rows
2026-01-27 15:57:52,202 INFO Cluster 5: 612 rows
2026-01-27 15:57:52,202 INFO Cluster 5: 612 rows


INFO:analysis:Cluster 5: 612 rows


2026-01-27 15:57:52,208 INFO Saving results to Analysis_Results.csv
2026-01-27 15:57:52,208 INFO Saving results to Analysis_Results.csv
2026-01-27 15:57:52,208 INFO Saving results to Analysis_Results.csv
2026-01-27 15:57:52,208 INFO Saving results to Analysis_Results.csv


INFO:analysis:Saving results to Analysis_Results.csv


2026-01-27 15:57:52,281 INFO Success! Results saved to Analysis_Results.csv
2026-01-27 15:57:52,281 INFO Success! Results saved to Analysis_Results.csv
2026-01-27 15:57:52,281 INFO Success! Results saved to Analysis_Results.csv
2026-01-27 15:57:52,281 INFO Success! Results saved to Analysis_Results.csv


INFO:analysis:Success! Results saved to Analysis_Results.csv


In [6]:
import pandas as pd

# Reload the CSV written by the analysis cell and show its first five rows.
analysis_df = pd.read_csv('Analysis_Results.csv')
preview = analysis_df.head(n=5)
display(preview)

Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours),clean_corpus,cluster
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63,phishing education hacker group unpatched soft...,2
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71,ransomware retail hacker group unpatched software,2
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20,man middle hacker group weak passwords,2
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7,ransomware telecommunications nation state soc...,4
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68,man middle insider social engineering,4
