In [None]:

# Imports

import html
import re
from pathlib import Path

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus so stopwords.words() below can find it
# (original note: this only needs to run once).
nltk.download("stopwords")  # run only once
# English stopwords as a set; clean_text_pipeline filters tokens against it.
stop_words = set(stopwords.words("english"))

In [None]:

# Raw input CSVs (relative paths, resolved against the current working directory).
QUESTIONS_PATH = "../data/raw/Questions.csv"
ANSWERS_PATH = "../data/raw/Answers.csv"
TAGS_PATH = "../data/raw/Tags.csv"

# Destinations for the cleaned questions and answers.
QUESTIONS_OUTPUT_PATH = "../data/processed/questions_clean.csv"
ANSWERS_OUTPUT_PATH = "../data/processed/answers_clean.csv"

# Used as the read_csv chunksize by the two full-dataset cleaning loops.
CHUNK_SIZE = 50000  # rows processed at a time (the files are large, so they are read chunk-wise)


Counting the rows in the questions, answers, and tags files

In [None]:

# Tally the question rows by reading the CSV in 100,000-row chunks
# and adding up the length of each chunk.
question_count = sum(
    map(len, pd.read_csv(QUESTIONS_PATH, encoding="latin1", chunksize=100000))
)

question_count


In [None]:
# Tally the answer rows by reading the CSV in 100,000-row chunks
# and adding up the length of each chunk.
answer_count = sum(
    map(len, pd.read_csv(ANSWERS_PATH, encoding="latin1", chunksize=100000))
)

answer_count


In [None]:
# Tally the tag rows by reading the CSV in 100,000-row chunks
# and adding up the length of each chunk.
tag_count = sum(
    map(len, pd.read_csv(TAGS_PATH, encoding="latin1", chunksize=100000))
)

tag_count


In [None]:
# Collect the three row counts into one small table for display.
dataset_summary = pd.DataFrame({
    "Type": ["Questions", "Answers", "Tags"],
    "Count": [question_count, answer_count, tag_count]
})

dataset_summary


In [None]:
# Load just the first five questions to see what the raw Title/Body text looks like.
sample_df_ques = pd.read_csv(QUESTIONS_PATH, encoding="latin1", nrows=5)

sample_df_ques.loc[:, ["Title", "Body"]].head()


In [None]:
# Shape of the five-row question sample loaded in the previous cell.
# Re-reading the same five rows from disk here would yield an identical frame,
# so the existing one is reused.
sample_df_ques.shape


In [None]:
import warnings

from bs4 import XMLParsedAsHTMLWarning

# Register a single "ignore" filter for bs4's XMLParsedAsHTMLWarning so it is
# silenced for the rest of the session.
warnings.simplefilter("ignore", category=XMLParsedAsHTMLWarning)

In [None]:
def clean_text_pipeline(text):
    """Turn a raw HTML snippet into a lowercase, stopword-free string of words.

    Steps: strip markup, decode HTML entities, lowercase, replace every
    character outside ``a-z``/whitespace with a space, drop English stopwords
    (the module-level ``stop_words`` set) and join the surviving tokens with
    single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw text, possibly containing HTML tags and entities.

    Returns
    -------
    str
        The cleaned text; ``""`` when ``text`` is missing (``pd.isna``) or
        when nothing survives cleaning.
    """
    if pd.isna(text):
        return ""

    try:
        # Strip tags BEFORE decoding entities. Decoding first would turn escaped
        # literals such as "&lt;vector&gt;" into real tags, which the parser
        # would then delete, silently losing the word inside them.
        # The space separator keeps words in adjacent elements
        # ("<p>foo</p><p>bar</p>") from being fused into one token.
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    except Exception:
        # Best effort: if the parser rejects the markup, fall back to a crude
        # regex tag strip instead of discarding the whole document.
        text = re.sub(r"<[^>]*>", " ", text)

    # Decode entities that are still escaped at this point (e.g., &lt; → <);
    # the resulting symbols are removed by the character filter below.
    text = html.unescape(text)

    # Lowercase, then blank out everything that is not a-z or whitespace.
    text = re.sub(r"[^a-z\s]", " ", text.lower())

    # str.split() with no argument already collapses runs of whitespace, so no
    # separate space-normalization pass is needed before tokenizing.
    tokens = [word for word in text.split() if word not in stop_words]

    return " ".join(tokens)


In [None]:
# Smoke test: run the cleaning pipeline on the first 10,000 questions only
test_df = pd.read_csv(QUESTIONS_PATH, encoding="latin1", nrows=10000)

# Title and body are cleaned together as one document; missing parts count as "".
test_df["raw_text"] = test_df["Title"].fillna("") + " " + test_df["Body"].fillna("")
test_df["clean_text"] = test_df["raw_text"].apply(clean_text_pipeline)

# Show the first row before and after cleaning, truncated to 500 characters.
sample = test_df.iloc[0][["raw_text", "clean_text"]]

print("RAW TEXT:\n", sample["raw_text"][:500])
print("\nCLEAN TEXT:\n", sample["clean_text"][:500])


In [None]:
# Full Dataset Cleaning (Chunk Processing)

# Create the output folder up front; without it the first to_csv call fails
# when the folder does not exist yet, after a whole chunk has already been cleaned.
Path(QUESTIONS_OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

first_chunk = True

for chunk in pd.read_csv(
    QUESTIONS_PATH,
    encoding="latin1",
    chunksize=CHUNK_SIZE
):

    # Combine title and body into one document; missing values become "".
    chunk["raw_text"] = (
        chunk["Title"].fillna("") + " " +
        chunk["Body"].fillna("")
    )

    # Apply cleaning pipeline
    chunk["clean_text"] = chunk["raw_text"].apply(clean_text_pipeline)

    # Keep only relevant columns
    processed_chunk = chunk[["Id", "Score", "clean_text"]]

    # Write incrementally: the first chunk overwrites any previous output and
    # emits the header, later chunks append without one, so re-running the
    # cell does not duplicate rows.
    processed_chunk.to_csv(
        QUESTIONS_OUTPUT_PATH,
        mode="w" if first_chunk else "a",
        header=first_chunk,
        index=False
    )

    first_chunk = False

print(" Full preprocessing complete.")


In [None]:
# Sanity check: read back the first five rows of the cleaned questions file
clean_df_sample = pd.read_csv(QUESTIONS_OUTPUT_PATH, nrows=5)
clean_df_sample.head(n=5)


In [None]:
# Compare the number of rows in the raw questions file with the number of rows
# in the cleaned output, counting both in 100,000-row chunks.
original_count = sum(
    map(len, pd.read_csv(QUESTIONS_PATH, encoding="latin1", chunksize=100000))
)
clean_count = sum(map(len, pd.read_csv(QUESTIONS_OUTPUT_PATH, chunksize=100000)))

print("Original rows:", original_count)
print("Cleaned rows:", clean_count)


Cleaning the answers

In [None]:
# Load just the first five answers to see what the raw Body text looks like.
sample_df_ans = pd.read_csv(
    ANSWERS_PATH,
    encoding="latin1",
    nrows=5
)

# Display the answers sample that was just loaded (not the questions sample).
sample_df_ans[["Body"]].head()


In [None]:
# Smoke test: run the cleaning pipeline on the first 10,000 answers only
test_df_ans = pd.read_csv(ANSWERS_PATH, encoding="latin1", nrows=10000)

# Only the Body column is used for answers; missing bodies count as "".
test_df_ans["raw_text"] = test_df_ans["Body"].fillna("")
test_df_ans["clean_text"] = test_df_ans["raw_text"].apply(clean_text_pipeline)

# Show the first row before and after cleaning, truncated to 500 characters.
sample = test_df_ans.iloc[0][["raw_text", "clean_text"]]

print("RAW TEXT:\n", sample["raw_text"][:500])
print("\nCLEAN TEXT:\n", sample["clean_text"][:500])


In [None]:
# Full Dataset Cleaning (Chunk Processing) - answers

# Create the output folder up front; without it the first to_csv call fails
# when the folder does not exist yet, after a whole chunk has already been cleaned.
Path(ANSWERS_OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

first_chunk = True

for chunk in pd.read_csv(
    ANSWERS_PATH,
    encoding="latin1",
    chunksize=CHUNK_SIZE
):

    # Only the Body column is used for answers; missing bodies become "".
    chunk["raw_text"] = (
        chunk["Body"].fillna("")
    )

    # Apply cleaning pipeline
    chunk["clean_text"] = chunk["raw_text"].apply(clean_text_pipeline)

    # Keep only relevant columns
    # NOTE(review): only Id, Score and clean_text are kept - if Answers.csv
    # carries a question-reference column (e.g. ParentId), it is dropped here;
    # confirm that is intended.
    processed_chunk = chunk[["Id", "Score", "clean_text"]]

    # Write incrementally: the first chunk overwrites any previous output and
    # emits the header, later chunks append without one, so re-running the
    # cell does not duplicate rows.
    processed_chunk.to_csv(
        ANSWERS_OUTPUT_PATH,
        mode="w" if first_chunk else "a",
        header=first_chunk,
        index=False
    )

    first_chunk = False

print(" Full preprocessing complete.")


In [None]:
# Verify the answers output: read back the first five rows of the cleaned
# answers file written by the answers cleaning loop.
clean_df_sample = pd.read_csv(ANSWERS_OUTPUT_PATH, nrows=5)

clean_df_sample.head()


In [None]:
# TF-IDF VECTORIZATION

import joblib
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import TfidfVectorizer

# Load cleaned questions from the same constant the cleaning step wrote to,
# so the two cannot drift apart if the output path is ever changed.
clean_df = pd.read_csv(QUESTIONS_OUTPUT_PATH)

vectorizer = TfidfVectorizer(
    max_features=10000,   # cap the vocabulary at 10,000 terms
    min_df=5,             # drop terms appearing in fewer than 5 documents
    max_df=0.8,           # drop terms appearing in more than 80% of documents
    ngram_range=(1,2),    # unigrams and bigrams
    sublinear_tf=True     # use 1 + log(tf) instead of raw term frequency
)

# Empty clean_text fields are read back from the CSV as NaN, hence fillna("").
tfidf_questions = vectorizer.fit_transform(clean_df["clean_text"].fillna(""))

# Persist the fitted vectorizer so other text can be transformed with the
# same vocabulary and IDF weights without refitting.
joblib.dump(vectorizer, "../data/processed/tfidf_vectorizer.pkl")

print("Questions TF-IDF Shape:", tfidf_questions.shape)

# Save sparse matrix
save_npz("../data/processed/questions_tfidf.npz", tfidf_questions)

print("Questions TF-IDF saved successfully.")

In [None]:
# This cell reloads the fitted vectorizer from disk, so it imports everything
# it calls itself instead of relying on imports made by the previous cell.
import joblib
from scipy.sparse import load_npz, save_npz

# Load cleaned answers from the same constant the cleaning step wrote to.
clean_answers = pd.read_csv(ANSWERS_OUTPUT_PATH)

# Load saved vectorizer
# NOTE: joblib.load unpickles the file - only load a pickle this notebook produced.
vectorizer = joblib.load("../data/processed/tfidf_vectorizer.pkl")

# Transform answers (DO NOT FIT AGAIN): reusing the vocabulary fitted on the
# questions keeps both matrices in the same feature space.
# Empty clean_text fields are read back from the CSV as NaN, hence fillna("").
tfidf_answers = vectorizer.transform(clean_answers["clean_text"].fillna(""))

print("Answers TF-IDF Shape:", tfidf_answers.shape)

# Save sparse matrix
save_npz("../data/processed/answers_tfidf.npz", tfidf_answers)

print("Answers TF-IDF saved successfully.")
