# Dataset

1. load csv file (panda, numpy)
2. split dataset. Example code:()
   ```
   random.shuffle(data) # change if you are using pandas dataframe
   training = data[:int(len(data)*0.8)]
   test = data[int(len(data)*0.8):]

   fold5 = KFold(5) # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
   for train_idx, val_idx in fold5.split(training):
      sub_val = training[val_idx]
      sub_train = training[train_idx]
      clf = model(sub_train, sub_val, ...) # training the model, and evaluate it on validation dataset
      performance(clf, test) # test the model on test dataset
   ```

PREPROCESS *DATASET*

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
data = pd.read_csv('spambase.csv')

# Check for missing values
print("Missing values:")
print(data.isnull().sum())

# Handle missing values (if any)
data.dropna(inplace=True)  # Remove rows with missing values

# Scale the features using Min-Max scaling
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data.drop(columns=['spam']))
scaled_data = pd.DataFrame(scaled_features, columns=data.columns[:-1])
scaled_data['spam'] = data['spam']

# Handle imbalanced data (if necessary)
# You can use techniques like oversampling or undersampling here

# Split the data into training and testing sets
X = scaled_data.drop(columns=['spam'])
y = scaled_data['spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform 5-fold cross-validation
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kfold.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train and evaluate models for each fold
    # Implement your models here

# Save the preprocessed data
scaled_data.to_csv('preprocessed_email_spam_dataset.csv', index=False)

Missing values:
word_freq_make                0
word_freq_address             0
word_freq_all                 0
word_freq_3d                  0
word_freq_our                 0
word_freq_over                0
word_freq_remove              0
word_freq_internet            0
word_freq_order               0
word_freq_mail                0
word_freq_receive             0
word_freq_will                0
word_freq_people              0
word_freq_report              0
word_freq_addresses           0
word_freq_free                0
word_freq_business            0
word_freq_email               0
word_freq_you                 0
word_freq_credit              0
word_freq_your                0
word_freq_font                0
word_freq_000                 0
word_freq_money               0
word_freq_hp                  0
word_freq_hpl                 0
word_freq_george              0
word_freq_650                 0
word_freq_lab                 0
word_freq_labs                0
word_freq_telnet        

CREATE TEST SET, TRAINING SET

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset
preprocessed_data = pd.read_csv('preprocessed_email_spam_dataset.csv')

# Split the dataset into features and target variable
X = preprocessed_data.drop(columns=['spam'])  # Features
y = preprocessed_data['spam']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:")
print("Features:", X_train.shape)
print("Target:", y_train.shape)

print("\nTesting set shape:")
print("Features:", X_test.shape)
print("Target:", y_test.shape)

# Save the training and testing sets to separate CSV files
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

Training set shape:
Features: (3680, 57)
Target: (3680,)

Testing set shape:
Features: (921, 57)
Target: (921,)


#Naive bayes

1. model learning:

   Note:

   features: remove attributes that is not related to word (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negtive

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset -> performance(model, val)):
   
   for each new sample, $\prod{P(a|c)}P(c)$ if word is in the email(freq_word > 0); and find the maximum class
   

   

NAIVE BAYES

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the training data
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv', header=None, names=['spam'], skiprows=1)

# Calculate the prior probabilities
spam_count = y_train['spam'].sum()
not_spam_count = len(y_train) - spam_count
p_spam = spam_count / len(y_train)
p_not_spam = not_spam_count / len(y_train)

# Calculate the conditional probabilities for each feature
spam_feature_probs = {}
not_spam_feature_probs = {}

for column in X_train.columns:
    spam_feature_count = X_train[column][y_train['spam'] == 1].sum()
    not_spam_feature_count = X_train[column][y_train['spam'] == 0].sum()
    spam_feature_probs[column] = (spam_feature_count + 1) / (spam_count + 2)
    not_spam_feature_probs[column] = (not_spam_feature_count + 1) / (not_spam_count + 2)

# Function to predict the class of an email
def predict_email(email_features):
    spam_prob = np.log(p_spam)
    not_spam_prob = np.log(p_not_spam)

    for column in X_train.columns:
        if email_features[column] == 1:
            spam_prob += np.log(spam_feature_probs[column])
            not_spam_prob += np.log(not_spam_feature_probs[column])

    if spam_prob > not_spam_prob:
        return 1
    else:
        return 0

# Load the testing data
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv', header=None, names=['spam'], skiprows=1)

# Make predictions on the testing data
y_pred = X_test.apply(predict_email, axis=1)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.5830618892508144
Precision: 1.0
Recall: 0.015384615384615385
F1-score: 0.030303030303030307


# KNN
1. model learning: None

2. model evaluation(on val dataset): You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.

   ```
   Note:
   parallel programing
   numpy.cos() to calcuate the similarity
   ```

In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the training data
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv', header=None, names=['spam'], skiprows=1)

# Load the testing data
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv', header=None, names=['spam'], skiprows=1)

# Create a KNN classifier class
class KNNClassifier:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        distances = np.sqrt(np.sum((self.X_train.values[:, np.newaxis] - X.values)**2, axis=2))
        indices = np.argsort(distances, axis=0)[:self.n_neighbors]
        nearest_labels = self.y_train.values[indices]
        y_pred = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=0, arr=nearest_labels)
        return y_pred

# Create a KNN classifier instance
k = 5  # Number of neighbors
knn = KNNClassifier(n_neighbors=k)

# Perform 5-fold cross-validation on the training data
num_folds = 5
fold_size = len(X_train) // num_folds
cv_scores = []
for i in range(num_folds):
    start = i * fold_size
    end = start + fold_size
    X_val_fold = X_train.iloc[start:end]
    y_val_fold = y_train.iloc[start:end]
    X_train_fold = pd.concat([X_train.iloc[:start], X_train.iloc[end:]])
    y_train_fold = pd.concat([y_train.iloc[:start], y_train.iloc[end:]])
    knn.fit(X_train_fold, y_train_fold)
    y_pred_fold = knn.predict(X_val_fold)
    cv_scores.append(accuracy_score(y_val_fold, y_pred_fold))

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

# Train the KNN classifier on the entire training set
knn.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = knn.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Cross-validation scores: [0.8953804347826086, 0.8872282608695652, 0.8913043478260869, 0.9171195652173914, 0.9089673913043478]
Mean cross-validation score: 0.9
Accuracy: 0.8838219326818675
Precision: 0.8773333333333333
Recall: 0.8435897435897436
F1-score: 0.8601307189542484


# LR

1. model learning: You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.
    
    $y = sigmoid(MX)$

step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formula for step 3 (Note: n is the size of the TRAINING dataset; $cdot$ is dot production ):

1. $pred_y = sigmoid(M\cdot X')$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm=X'\cdot (pred_y - y)*2/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   best_performace = 0
   for i in range(epoch):
     pred_y = ...
     gm = ...
     _p = performace(model, val)
     if _p > best_performance:
        best_model = M
        best_performance = _p
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the training data
X_train = pd.read_csv('X_train.csv').values
y_train = pd.read_csv('y_train.csv', header=None, names=['spam'], skiprows=1).values.ravel()

# Load the validation data
X_val = pd.read_csv('X_test.csv').values
y_val = pd.read_csv('y_test.csv', header=None, names=['spam'], skiprows=1).values.ravel()

# Add a column of ones to X_train and X_val
X_train = np.c_[np.ones((len(X_train), 1)), X_train]
X_val = np.c_[np.ones((len(X_val), 1)), X_val]

# Initialize the model parameters
M = np.random.randn(X_train.shape[1], 1)

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Performance evaluation function
def performance(model, X, y):
    pred_y = sigmoid(np.dot(X, model))
    pred_labels = (pred_y > 0.5).astype(int)
    accuracy = accuracy_score(y, pred_labels)
    precision = precision_score(y, pred_labels)
    recall = recall_score(y, pred_labels)
    f1 = f1_score(y, pred_labels)
    return accuracy, precision, recall, f1

# Hyperparameters
learning_rate = 0.01
epoch = 100

# Training loop
best_model = M
best_performance = 0
for i in range(epoch):
    # Forward pass
    pred_y = sigmoid(np.dot(X_train, M))

    # Compute loss
    loss = -np.sum(y_train * np.log(pred_y) + (1 - y_train) * np.log(1 - pred_y)) / len(X_train)

    # Compute gradient
    gm = np.dot(X_train.T, (pred_y - y_train.reshape(-1, 1))) * 2 / len(X_train)

    # Evaluate performance on validation set
    _accuracy, _precision, _recall, _f1 = performance(M, X_val, y_val)
    _p = _f1  # You can choose any metric as the performance measure

    if _p > best_performance:
        best_model = M
        best_performance = _p

    # Update model parameters
    M = M - learning_rate * gm

# Evaluate the best model on the validation set
accuracy, precision, recall, f1 = performance(best_model, X_val, y_val)

# Print the performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.6243213897937026
Precision: 0.6078431372549019
Recall: 0.31794871794871793
F1-score: 0.4175084175084175


# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

I have used the scoring metrics as outputs for the above implementations.