# SKARBOT: Full Social Media Text Analysis Pipeline by Sasha Vujisic and Ashish Kalam
This notebook processes social media datasets (Twitter and Reddit) to:
- Preprocess and clean the text
- Extract keyphrases using KeyBERT
- Detect paraphrase matches using Sentence-BERT
- Analyze internal repetition (bot detection)
- Save a final pipeline summary report

In [None]:
!pip install pandas numpy sentence-transformers keybert tqdm swifter contractions

# --- Import Libraries ---

In [None]:
import os
import shutil
import pandas as pd
import numpy as np
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util
from data_preprocessing import preprocess_dataframe
from keyphrase_extraction import extract_from_dataframe
from paraphrase_detection import compute_similarity_fast, load_model
from repetition_analysis import compute_repetition_statistics, get_match_repetitions
from save_pipeline_summary import save_pipeline_summary

# --- Helper Functions ---
Utilities that create the `data/processed` output folder and move finished output files into a named archive folder under `data/archives`.

In [None]:
def ensure_directories():
    """Make sure the ``data/processed`` output folder exists.

    Safe to call repeatedly: an already-existing folder is left untouched.
    """
    processed_dir = "data/processed"
    os.makedirs(processed_dir, exist_ok=True)

def archive_processed_files(archive_name):
    """Move every file in ``data/processed`` into ``data/archives/<archive_name>``.

    Only regular files directly inside ``data/processed`` are moved;
    sub-directories are left where they are. If ``data/processed`` does not
    exist there is nothing to archive, so the function reports that and
    returns instead of failing with ``FileNotFoundError``.

    Args:
        archive_name: Name of the sub-folder under ``data/archives`` that
            receives the files. It is created if it does not exist yet.
    """
    processed_dir = "data/processed"
    archive_dir = f"data/archives/{archive_name}"

    # Check first so a missing source folder neither crashes os.listdir nor
    # leaves an empty archive folder behind.
    if not os.path.isdir(processed_dir):
        print(f"No processed directory found at: {processed_dir} (nothing to archive)\n")
        return

    os.makedirs(archive_dir, exist_ok=True)
    for filename in os.listdir(processed_dir):
        file_path = os.path.join(processed_dir, filename)
        # Skip sub-directories: only plain files are archived.
        if os.path.isfile(file_path):
            shutil.move(file_path, os.path.join(archive_dir, filename))
    print(f"Archived all processed files to: {archive_dir}\n")

# --- Main Pipeline Function ---
This function executes the full SKARBOT pipeline.

In [None]:
def run_pipeline(
    CANDIDATE,
    REFERENCE,
    CANDIDATE_COL,
    REFERENCE_COL,
    CANDIDATE_DIRECTORY,
    REFERENCE_DIRECTORY,
    PARAPHRASE_SIMILARITY_THRESHOLD=0.80,
    REPETITION_THRESHOLD=0.80
):
    """Run the full SKARBOT pipeline on one candidate/reference dataset pair.

    Steps, in order: load both CSVs, preprocess the text columns, extract
    keyphrases, match paraphrases between the two datasets, detect repetition
    inside each dataset, relate the matches to those repetitions, and save a
    pipeline summary. Intermediate results are written as CSV files under
    ``data/processed`` (created if missing).

    Args:
        CANDIDATE: Base name (without ``.csv``) of the candidate file, read
            from ``data/<CANDIDATE_DIRECTORY>/<CANDIDATE>.csv``.
        REFERENCE: Base name (without ``.csv``) of the reference file, read
            from ``data/<REFERENCE_DIRECTORY>/<REFERENCE>.csv``.
        CANDIDATE_COL: Name of the text column in the candidate CSV. If it
            equals ``REFERENCE_COL`` the candidate column is renamed by
            appending ``"2"`` so the two stay distinguishable.
        REFERENCE_COL: Name of the text column in the reference CSV.
        CANDIDATE_DIRECTORY: Folder under ``data/`` holding the candidate CSV.
        REFERENCE_DIRECTORY: Folder under ``data/`` holding the reference CSV.
        PARAPHRASE_SIMILARITY_THRESHOLD: Passed as ``threshold`` to
            ``compute_similarity_fast`` (presumably the minimum similarity for
            a paraphrase match; confirm against that helper).
        REPETITION_THRESHOLD: Passed as ``threshold`` to
            ``compute_repetition_statistics`` (presumably the minimum
            similarity for two texts to count as repeats; confirm).

    Returns:
        None. Results are delivered through the written CSV files and the
        summary produced by ``save_pipeline_summary``.
    """
    print("\n==== PIPELINE START ====")
    ensure_directories()

    model = load_model("all-MiniLM-L6-v2")

    print("\nStep 1: Load CSVs")
    candidate_df = pd.read_csv(f"data/{CANDIDATE_DIRECTORY}/{CANDIDATE}.csv")
    reference_df = pd.read_csv(f"data/{REFERENCE_DIRECTORY}/{REFERENCE}.csv")

    # Both text columns later appear side by side in one matches table, so
    # they must have different names.
    if CANDIDATE_COL == REFERENCE_COL:
        new_candidate_col = CANDIDATE_COL + "2"
        candidate_df = candidate_df.rename(columns={CANDIDATE_COL: new_candidate_col})
        CANDIDATE_COL = new_candidate_col

    print("CSV Loading Complete.")

    print("\nStep 2: Preprocessing Text")
    clean_candidate_df = preprocess_dataframe(candidate_df, text_column=CANDIDATE_COL)
    clean_reference_df = preprocess_dataframe(reference_df, text_column=REFERENCE_COL)

    print("\nStep 3: Keyphrase Extraction")
    keyphrased_candidate_df = extract_from_dataframe(clean_candidate_df, text_column=CANDIDATE_COL, model=model)
    keyphrased_reference_df = extract_from_dataframe(clean_reference_df, text_column=REFERENCE_COL, model=model)

    # If both datasets share a base name (same file name in different folders,
    # or one file compared on two columns) the two keyphrase files would
    # resolve to the same path and the reference write would overwrite the
    # candidate output. The column names are guaranteed to differ at this
    # point, so append them in that case only; otherwise keep the usual names.
    if CANDIDATE == REFERENCE:
        candidate_keyphrase_path = f"data/processed/keyphrases_{CANDIDATE}_{CANDIDATE_COL}.csv"
        reference_keyphrase_path = f"data/processed/keyphrases_{REFERENCE}_{REFERENCE_COL}.csv"
    else:
        candidate_keyphrase_path = f"data/processed/keyphrases_{CANDIDATE}.csv"
        reference_keyphrase_path = f"data/processed/keyphrases_{REFERENCE}.csv"

    keyphrased_candidate_df.to_csv(candidate_keyphrase_path, index=False)
    keyphrased_reference_df.to_csv(reference_keyphrase_path, index=False)

    print("\nStep 4: Paraphrase Similarity Matching")
    matches = compute_similarity_fast(
        keyphrased_candidate_df,
        keyphrased_reference_df,
        model,
        text_column=CANDIDATE_COL,
        ref_column=REFERENCE_COL,
        keyphrase_column="keyphrases",
        threshold=PARAPHRASE_SIMILARITY_THRESHOLD,
        batch_size=32
    )
    # NOTE(review): assumes each match is ordered (reference text, candidate
    # text, similarity) - confirm against compute_similarity_fast.
    matches_df = pd.DataFrame(matches, columns=[REFERENCE_COL, CANDIDATE_COL, "similarity"])
    matches_df.to_csv(f"data/processed/matches_{REFERENCE}_{REFERENCE_COL}_{CANDIDATE}_{CANDIDATE_COL}.csv", index=False)

    print("\nStep 5: Internal Repetition Detection")
    # Only the first element of each returned pair is used here.
    similar_candidates, _ = compute_repetition_statistics(
        keyphrased_candidate_df[CANDIDATE_COL].tolist(), model, threshold=REPETITION_THRESHOLD
    )
    similar_references, _ = compute_repetition_statistics(
        keyphrased_reference_df[REFERENCE_COL].tolist(), model, threshold=REPETITION_THRESHOLD
    )

    similar_candidates_df = pd.DataFrame(similar_candidates, columns=["index1", "index2", "similarity", CANDIDATE_COL, "duplicate"])
    similar_references_df = pd.DataFrame(similar_references, columns=["index1", "index2", "similarity", REFERENCE_COL, "duplicate"])

    similar_candidates_df.to_csv(f"data/processed/repetitive_{CANDIDATE_COL}.csv", index=False)
    similar_references_df.to_csv(f"data/processed/repetitive_{REFERENCE_COL}.csv", index=False)

    print("\nStep 6: Paraphrased Text Repetition Analysis")
    match_repetitions = get_match_repetitions(matches_df, similar_candidates_df, CANDIDATE_COL, "duplicate")
    match_ref_reps = get_match_repetitions(matches_df, similar_references_df, REFERENCE_COL, "duplicate")

    pd.DataFrame(match_repetitions, columns=[f"paraphrased_{CANDIDATE_COL}", "repetitions"]).to_csv(
        f"data/processed/repetitive_matches_{CANDIDATE}_{CANDIDATE_COL}.csv", index=False
    )
    pd.DataFrame(match_ref_reps, columns=[f"paraphrased_{REFERENCE_COL}", "repetitions"]).to_csv(
        f"data/processed/repetitive_matches_{REFERENCE}_{REFERENCE_COL}.csv", index=False
    )

    print("\nStep 7: Saving Pipeline Summary")
    save_pipeline_summary(
        CANDIDATE,
        REFERENCE,
        CANDIDATE_COL,
        REFERENCE_COL,
        matches_df,
        clean_candidate_df,
        clean_reference_df,
        similar_candidates,
        similar_references,
        match_repetitions,
        match_ref_reps,
        PARAPHRASE_SIMILARITY_THRESHOLD,
        REPETITION_THRESHOLD
    )

    print("\n==== PIPELINE COMPLETE ====")

# --- Run Example Pipeline ---

In [None]:
# Example run: compare data/trivial/twitter.csv (text column "tweet")
# against data/trivial/reddit.csv (text column "title").
example_settings = {
    "CANDIDATE": "twitter",
    "REFERENCE": "reddit",
    "CANDIDATE_COL": "tweet",
    "REFERENCE_COL": "title",
    "CANDIDATE_DIRECTORY": "trivial",
    "REFERENCE_DIRECTORY": "trivial",
    "PARAPHRASE_SIMILARITY_THRESHOLD": 0.80,
    "REPETITION_THRESHOLD": 0.80,
}
run_pipeline(**example_settings)

This project notebook was formatted with the help of ChatGPT.