In [10]:
import os

print("Current directory files:")
print(os.listdir())


Current directory files:
['.config', 'sample_data']


In [11]:
from google.colab import files
files.upload()

Saving spam_or_not_spam.csv to spam_or_not_spam.csv




In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# =========================
# 1. LOAD DATASET
# =========================
data = pd.read_csv("spam_or_not_spam.csv")

print("Dataset loaded successfully!")
print(data.head())

# =========================
# 2. CLEAN DATA (FIX NaN)
# =========================
data["email"] = data["email"].fillna("").astype(str)

# =========================
# 3. SPLIT FEATURES & LABEL
# =========================
X_text = data["email"]
y = data["label"]

# =========================
# 4. TEXT → NUMERIC (TF-IDF)
# =========================
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=3000
)

X = vectorizer.fit_transform(X_text).toarray()

print("\nVectorized shape:", X.shape)

# =========================
# 5. TRAIN / TEST SPLIT
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =========================
# 6. INITIALIZE PARAMETERS
# =========================
np.random.seed(0)
w = np.zeros(X_train.shape[1])
b = 0.0
lr = 0.1
epochs = 50

# =========================
# 7. SIGMOID FUNCTION
# =========================
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# =========================
# 8. TRAINING LOOP (LINEAR CLASSIFIER)
# =========================
for epoch in range(epochs):
    for i in range(len(X_train)):
        z = np.dot(X_train[i], w) + b
        y_pred = sigmoid(z)

        dz = y_pred - y_train.iloc[i]
        w -= lr * dz * X_train[i]
        b -= lr * dz

    if epoch % 10 == 0:
        print(f"Epoch {epoch} completed")

# =========================
# 9. TESTING
# =========================
correct = 0

for i in range(len(X_test)):
    z = np.dot(X_test[i], w) + b
    pred = 1 if sigmoid(z) >= 0.5 else 0
    if pred == y_test.iloc[i]:
        correct += 1

accuracy = correct / len(X_test)

print("\nFinal Accuracy:", accuracy)

Dataset loaded successfully!
                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0

Vectorized shape: (3000, 3000)
Epoch 0 completed
Epoch 10 completed
Epoch 20 completed
Epoch 30 completed
Epoch 40 completed

Final Accuracy: 0.9816666666666667
