# Training logic for the email classifier

### Data loader

In [2]:
import pandas as pd
import re

def load_data(filepath='email.csv'):
    """Load the labelled email CSV and return a cleaned two-column DataFrame.

    Only the first two columns of the file are kept; they are renamed to
    ``label`` and ``text``.

    Args:
        filepath: Path to the CSV file. It is read with ``latin-1`` encoding.

    Returns:
        A DataFrame with two columns:

        * ``label`` -- 1 for ``'spam'`` and 0 for ``'ham'``. Any other value
          is turned into NaN by ``Series.map``.
        * ``text`` -- lowercased text in which every run of non-word
          characters has been replaced by a single space. Missing text
          becomes the empty string.
    """
    df = pd.read_csv(filepath, encoding='latin-1')

    # Keep the first two columns and give them descriptive names. The copy
    # makes the later column assignments operate on an independent frame
    # instead of a slice of the one returned by read_csv.
    df = df.iloc[:, :2].copy()
    df.columns = ['label', 'text']

    # Convert labels to binary (1 = spam, 0 = ham)
    df['label'] = df['label'].map({'spam': 1, 'ham': 0})

    # Empty CSV cells are read as NaN (a float), which has no .lower();
    # normalise every cell to a string before cleaning it.
    text = df['text'].fillna('').astype(str)

    # Clean text (remove special characters, convert to lowercase)
    df['text'] = text.apply(lambda x: re.sub(r'\W+', ' ', x.lower()))

    return df

### Preprocess

In [None]:
import gensim
from gensim.models import Word2Vec
import numpy as np

def preprocess_text(text):
    """Lowercase *text* and split it on whitespace into a list of tokens."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def train_word2vec(texts, vector_size=100, min_count=1, window=5):
    """Fit a Word2Vec model on an iterable of raw text strings.

    Each text is tokenized with ``preprocess_text`` before training.

    Args:
        texts: Iterable of strings, one document per element.
        vector_size: Passed through to ``Word2Vec`` as ``vector_size``.
        min_count: Passed through to ``Word2Vec`` as ``min_count``.
        window: Passed through to ``Word2Vec`` as ``window``.

    Returns:
        The ``Word2Vec`` instance constructed from the tokenized texts.
    """
    corpus = list(map(preprocess_text, texts))
    return Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        min_count=min_count,
        window=window,
        workers=4,
    )

def text_to_vector(text, word2vec_model, vector_size=100):
    """Embed *text* as the mean of the Word2Vec vectors of its known tokens.

    Args:
        text: Raw string; it is tokenized with ``preprocess_text``.
        word2vec_model: Model whose ``wv`` attribute supports ``in`` and
            item lookup by token.
        vector_size: Length of the zero vector returned when no token of
            *text* is in the vocabulary. It is only used when ``wv`` does
            not expose its own ``vector_size``.

    Returns:
        A 1-D NumPy array: the element-wise mean of the token vectors, or an
        all-zero vector when no token is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv
    words = preprocess_text(text)
    vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis=0)

    # The fallback must have the same length as real embeddings, otherwise
    # stacking the results into a feature matrix yields ragged rows. Prefer
    # the model's own dimensionality over the caller-supplied default.
    fallback_size = getattr(keyed_vectors, 'vector_size', vector_size)
    return np.zeros(fallback_size)

### Model training

In [3]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
import joblib
from classifier.dataset_loader import load_data
from classifier.preprocess import train_word2vec, text_to_vector

# Load dataset
df = load_data('data/email.csv')

# Make sure the output directory exists before anything is written into it;
# saving to a path whose parent directory is missing fails.
import os
os.makedirs('classifier/model', exist_ok=True)

# Train Word2Vec model on the cleaned text column and persist it
word2vec_model = train_word2vec(df['text'])
word2vec_model.save('classifier/model/word2vec.model')

# Convert text to vectors: one averaged embedding per email, stacked into
# a 2-D feature matrix
X = np.array([text_to_vector(text, word2vec_model) for text in df['text']])
y = df['label'].values

# Train Naive Bayes classifier on the embedding features
nb_model = GaussianNB()
nb_model.fit(X, y)

# Save trained model
joblib.dump(nb_model, 'classifier/model/spam_classifier.pkl')
print("Model training complete and saved!")

ModuleNotFoundError: No module named 'classifier'

In [None]:
import joblib
import numpy as np
from gensim.models import Word2Vec
from classifier.preprocess import text_to_vector

# Load trained models
# Both artifacts are loaded once at module level and reused by every call
# to classify_email.
# NOTE(review): the training cell writes these files under
# 'classifier/model/', while they are read here from
# 'backend/classifier/model/' -- presumably the two cells run from different
# working directories; confirm the paths point at the same files.
# NOTE(review): joblib.load unpickles the file, so it should only be pointed
# at trusted artifacts.
word2vec_model = Word2Vec.load('backend/classifier/model/word2vec.model')
spam_classifier = joblib.load('backend/classifier/model/spam_classifier.pkl')

def classify_email(text):
    """Classify a raw email string as ``"Spam"`` or ``"Ham"``.

    Args:
        text: The email text to classify.

    Returns:
        ``"Spam"`` when the classifier predicts label 1, otherwise ``"Ham"``.
    """
    # Local import so the function does not rely on another cell/module
    # having imported ``re``.
    import re

    # Apply the same normalisation that load_data applies to the training
    # text (lowercase, runs of non-word characters -> one space). Without
    # it, tokens such as "free!!!" never match the vocabulary the model was
    # trained on and are silently dropped from the embedding.
    cleaned = re.sub(r'\W+', ' ', text.lower())

    # predict() expects a 2-D array, so present the single embedding as one row.
    vector = text_to_vector(cleaned, word2vec_model).reshape(1, -1)
    prediction = spam_classifier.predict(vector)[0]
    return "Spam" if prediction == 1 else "Ham"