In [None]:
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import pandas as pd
import string
import re
import contractions  # For expanding contractions
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.utils.helpers import download_statistics
# Fetch the Ekphrasis word-statistics files before any TextPreProcessor is
# built (the captured output of this cell shows them being downloaded and
# unpacked when they were not found locally).
download_statistics()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Word statistics files not found!
Downloading... done!
Unpacking... done!


In [None]:
# Shared Ekphrasis pre-processor used by normalize_slang().
# The tokenizer argument must be a callable (here str.split), not a string.
# NOTE(review): none of the options below obviously switches on slang
# replacement -- confirm that this configuration really rewrites slang terms.
slang_processor = TextPreProcessor(
    tokenizer=str.split,         # plain whitespace tokenization
    unpack_contractions=False,   # contractions are expanded in a separate step
    annotate=set(),              # do not add any annotations
    normalize=[],                # do not normalize any entity types
)


def normalize_slang(text):
    """Run ``text`` through ``slang_processor`` and return the result as one string.

    The pieces returned by ``slang_processor.pre_process_doc`` are joined
    with single spaces.
    """
    processed_pieces = slang_processor.pre_process_doc(text)
    return " ".join(processed_pieces)


# Read the raw CSV, keep only columns v1 and v2, and give them readable
# names: v1 -> label, v2 -> text.
df = (
    pd.read_csv('/content/data_spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)




Reading english - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/english/counts_1grams.txt


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [None]:
# NLTK resources shared by every preprocess_text() call; filled on first use.
_PREPROCESS_CACHE = {}

# Two-letter Penn Treebank tag prefix -> WordNet part-of-speech code.
_WORDNET_POS_BY_PREFIX = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}

# Tags whose words keep their case until the end and bypass stop-word removal.
_PROPER_NOUN_TAGS = ('NNP', 'NNPS')


def _preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building them on the first call.

    Built lazily rather than at definition time so the NLTK corpora only
    need to be downloaded before the first call, not before this cell runs.
    """
    if not _PREPROCESS_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both together so a failure above leaves the cache empty.
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean one message and return its lemmas as a lowercase, space-joined string.

    Parameters
    ----------
    text : Any
        The raw message. Anything that is not a ``str`` (e.g. ``None`` or a
        NaN float from pandas) yields ``""``.

    Returns
    -------
    str
        Lowercase lemmas separated by single spaces; ``""`` when nothing
        survives cleaning.

    Notes
    -----
    Pipeline: strip URLs, expand contractions, run ``normalize_slang``,
    reduce the text to ASCII letters, tokenize, squeeze repeated characters,
    POS-tag, drop stop words (proper nouns are always kept), lemmatize.
    """
    if not isinstance(text, str):
        return ""

    # 1. Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)

    # 2. Expand contractions
    text = contractions.fix(text)

    # 3. Pass the text through the Ekphrasis-based normalize_slang() helper
    text = normalize_slang(text)

    # 4. Reduce to letters and single spaces. Apostrophes are deleted so a
    #    word such as "John's" stays one token; every other run of
    #    non-letters (punctuation, digits, whitespace) becomes ONE space so
    #    that neighbours like "win,now" or "FREE!Call" are not fused together.
    text = text.replace("'", "")
    text = re.sub(r'[^A-Za-z]+', ' ', text).strip()
    if not text:
        return ""

    # 5. Tokenization
    tokens = word_tokenize(text)

    # 6. Squeeze runs of 3+ identical characters down to two
    #    (e.g., "soooo" -> "soo")
    tokens = [re.sub(r'(.)\1{2,}', r'\1\1', word) for word in tokens]

    stop_words, lemmatizer = _preprocess_resources()

    # 7. POS-tag on the original casing, then fold case, drop stop words and
    #    lemmatize in a single pass.
    lemmas = []
    for word, tag in pos_tag(tokens):
        if tag not in _PROPER_NOUN_TAGS:
            word = word.lower()
            if word in stop_words:
                continue
        # Tags outside the map (and proper nouns, via 'NN') are treated as nouns.
        pos = _WORDNET_POS_BY_PREFIX.get(tag[:2], 'n')
        lemmas.append(lemmatizer.lemmatize(word, pos=pos).lower())

    return ' '.join(lemmas)


In [None]:
# Fetch the NLTK data used by preprocess_text(): tokenizer models, the
# stop-word list, the POS tagger and WordNet.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
# that NLTK >= 3.9 looks up for word_tokenize() / pos_tag(); the older names
# are kept for earlier releases.
# Each resource is requested on its own so an id that is unknown to the
# installed index cannot stop the remaining downloads.
for resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder

# Load the preprocessed data, drop rows whose 'processed' text is missing,
# and renumber the remaining rows from 0.
df = (
    pd.read_csv('/content/processed_data_spam.csv', encoding='latin-1')
    .dropna(subset=['processed'])
    .reset_index(drop=True)
)

# Encode the string labels as integers (done once).
# LabelEncoder numbers classes in sorted order, so ham = 0 and spam = 1
# provided those are the only two label values -- check label_encoder.classes_.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
y = df['label_encoded']

# --- Bag of Words ---
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(df['processed'])

# --- TF-IDF ---
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['processed'])

# --- Dimensionality Reduction (Optional) ---
# This reduces features to 100 components (you can tweak this number).
# Each feature matrix gets its OWN fitted reducer: refitting one shared
# TruncatedSVD would leave it matching only the last matrix, so the other
# projection could not be reproduced or applied to new data.
svd_bow = TruncatedSVD(n_components=100, random_state=42)
X_bow_reduced = svd_bow.fit_transform(X_bow)

svd_tfidf = TruncatedSVD(n_components=100, random_state=42)
X_tfidf_reduced = svd_tfidf.fit_transform(X_tfidf)

# Backward-compatible alias: `svd` previously ended up fitted on TF-IDF.
svd = svd_tfidf

# --- Output shapes ---
print("Bag of Words shape:", X_bow.shape)
print("TF-IDF shape:", X_tfidf.shape)
print("Reduced BoW shape:", X_bow_reduced.shape)
print("Reduced TF-IDF shape:", X_tfidf_reduced.shape)

# At this point you have:
# X_bow           -> Full Bag of Words
# X_bow_reduced   -> Dimensionality-reduced BoW   (reducer: svd_bow)
# X_tfidf         -> Full TF-IDF
# X_tfidf_reduced -> Dimensionality-reduced TF-IDF (reducer: svd_tfidf / svd)
# y               -> Labels


Bag of Words shape: (5562, 7105)
TF-IDF shape: (5562, 7105)
Reduced BoW shape: (5562, 100)
Reduced TF-IDF shape: (5562, 100)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# --- Load and clean data ---
# Rows without 'processed' text are dropped and the index is renumbered.
df = (
    pd.read_csv('processed_data_spam.csv', encoding='latin-1')
    .dropna(subset=['processed'])
    .reset_index(drop=True)
)

# --- Encode labels ---
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
y = df['label_encoded']

# --- Hold out the test set BEFORE vectorizing ---
# The split is done once on the raw text so that every vectorizer learns its
# vocabulary / IDF weights from the training rows only; fitting on the full
# corpus first would leak test-set statistics into the features.
# test_size and random_state are the same values used previously.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed'], y, test_size=0.2, random_state=42
)

# --- Define vectorizers ---
vectorizers = {
    "TF-IDF": TfidfVectorizer(),
    "Bag of Words": CountVectorizer()
}

# --- Classifier models ---
# The same estimator objects are refit for each vectorizer.
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "SVM (Linear)": LinearSVC()
}

# --- Run for each vectorizer ---
for vec_name, vectorizer in vectorizers.items():
    print(f"\n====== Using {vec_name} ======")
    # Fit on training text only, then apply the fitted transform to the test text.
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print(f"\n🔹 Model: {model_name}")
        print("Accuracy :", accuracy_score(y_test, y_pred))
        print("Precision:", precision_score(y_test, y_pred))
        print("Recall   :", recall_score(y_test, y_pred))
        print("F1 Score :", f1_score(y_test, y_pred))




🔹 Model: Logistic Regression
Accuracy : 0.954177897574124
Precision: 1.0
Recall   : 0.6709677419354839
F1 Score : 0.803088803088803

🔹 Model: Naive Bayes
Accuracy : 0.9595687331536388
Precision: 1.0
Recall   : 0.7096774193548387
F1 Score : 0.8301886792452831

🔹 Model: SVM (Linear)
Accuracy : 0.9883198562443846
Precision: 1.0
Recall   : 0.9161290322580645
F1 Score : 0.9562289562289562


🔹 Model: Logistic Regression
Accuracy : 0.9829290206648698
Precision: 1.0
Recall   : 0.8774193548387097
F1 Score : 0.9347079037800687

🔹 Model: Naive Bayes
Accuracy : 0.9784366576819407
Precision: 0.8922155688622755
Recall   : 0.9612903225806452
F1 Score : 0.9254658385093167

🔹 Model: SVM (Linear)
Accuracy : 0.9856244384546271
Precision: 0.986013986013986
Recall   : 0.9096774193548387
F1 Score : 0.9463087248322147
