<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/NLP_email_classify_NOsql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**NLP**: **(2 methods: (1) fine-tuning LLM and (2) model training from scratch)**

# **email detection (spam)**

✅ Uses TF-IDF (no need for large word embeddings like BERT).
✅ Restricts features (max_features=5000 in TF-IDF) to reduce RAM usage.
✅ Uses LightGBM, which is optimized for fast, memory-efficient training.
✅ Avoids deep learning (which requires high GPU/TPU memory).

In [10]:
!pip install datasets



In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [12]:
from datasets import load_dataset
ds = load_dataset("talby/spamassassin", "text")
print(ds['train'][0])



In [13]:
# Convert dataset to Pandas DataFrame for easier handling
df = pd.DataFrame(ds["train"])  # Convert 'train' split to DataFrame
df.head()

Unnamed: 0,label,group,text
0,1,easy_ham,"""\n> From: fork-admin@xent.com [mailto:fork-ad..."
1,1,easy_ham,"""Hi,\n\nOn Sun, 01 Sep 2002 00:05:03 MDT Reg C..."
2,1,easy_ham,"""On Fri Sep 13 2002 at 02:03, Robert Elz wrote..."
3,1,easy_ham,"""On Thu, 2002-08-29 at 01:06, Matthias Saou wr..."
4,1,easy_ham,"""Hi, I'm building an rpm for the resin webserv..."


In [14]:
# Preprocessing function
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\S+@\S+', ' ', text)  # Remove emails
    text = re.sub(r'https?://\S+', ' ', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', ' ', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply cleaning
df["clean_text"] = df["text"].apply(clean_text)

# Encode labels (1 = spam, 0 = ham)
df["label"] = df["label"].astype(int)

# Check dataset
print(df.head())

   label     group                                               text  \
0      1  easy_ham  "\n> From: fork-admin@xent.com [mailto:fork-ad...   
1      1  easy_ham  "Hi,\n\nOn Sun, 01 Sep 2002 00:05:03 MDT Reg C...   
2      1  easy_ham  "On Fri Sep 13 2002 at 02:03, Robert Elz wrote...   
3      1  easy_ham  "On Thu, 2002-08-29 at 01:06, Matthias Saou wr...   
4      1  easy_ham  "Hi, I'm building an rpm for the resin webserv...   

                                          clean_text  
0  n from on behalf of njames n rogers n n subjec...  
1  hi n non sun sep mdt reg clemens wrote n n n i...  
2  on fri sep at robert elz wrote n n date wed se...  
3  on thu at matthias saou wrote n n thanks a lot...  
4  hi i m building an rpm for the resin webserver...  


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"]).toarray()
y = df["label"]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train LightGBM model
lgbm = lgb.LGBMClassifier(n_estimators=100)
lgbm.fit(X_train, y_train)

# Predict
y_pred = lgbm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"LightGBM Accuracy: {accuracy:.4f}")










# Encode categories
label_encoder = LabelEncoder()
df["group"] = label_encoder.fit_transform(df["group"])  # Encode labels as numbers



KeyError: 'category'

In [None]:
# Convert text to TF-IDF features (low memory friendly)
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for low memory
X = vectorizer.fit_transform(df["text"]).toarray()
y = df["group"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM model
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)


In [None]:
params = {
    "objective": "multiclass",
    "num_class": len(np.unique(y)),
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "max_depth": -1,
    "learning_rate": 0.1,
    "verbose": -1,
    "num_leaves": 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5
}

# Train LightGBM
model = lgb.train(params, lgb_train, valid_sets=[lgb_test], early_stopping_rounds=20, verbose_eval=False)



In [None]:
# Predict and evaluate
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")


#**using DistilBERT (more expensive)**

In [None]:
pip install transformers datasets torch scikit-learn


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

# Load dataset (ensure 'text' and 'category' columns exist)
df = pd.read_csv("email_dataset.csv")
df.dropna(inplace=True)

# Encode labels
label_encoder = LabelEncoder()
df["category"] = label_encoder.fit_transform(df["category"])
num_classes = len(label_encoder.classes_)

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["category"].tolist(), test_size=0.2, random_state=42
)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert to Dataset format
train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"], "labels": train_labels})
test_dataset = Dataset.from_dict({"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"], "labels": test_labels})

# Load DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_classes)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,  # Reduce batch size for low memory
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train model
trainer.train()

# Evaluate
preds = trainer.predict(test_dataset).predictions
pred_labels = np.argmax(preds, axis=1)
accuracy = (pred_labels == np.array(test_labels)).mean()

print(f"Test Accuracy: {accuracy:.4f}")
