# Import Necessary Libraries

In [41]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from tqdm.auto import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Mount Google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define Necessary Functions

In [44]:
# Function to load stored splitted data
def load_split_data(file_path):
    return pd.read_csv(file_path)

# Function to fit models
def fit_model(train_data, y_train, model_name='logistic_regression'):
    if model_name == 'logistic_regression':
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('clf', LogisticRegression(random_state=42))
        ])
    elif model_name == 'random_forest':
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('clf', RandomForestClassifier(random_state=42))
        ])
    elif model_name == 'svm':
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english')),
            ('clf', SVC(gamma='auto'))
        ])
    else:
        raise ValueError("Model name not recognized. Choose 'logistic_regression', 'random_forest', or 'svm'")

    model.fit(train_data, y_train)
    return model

# Function to calculate scores of differet models
def score_model(model, data, y_true):
    y_pred = model.predict(data)
    return accuracy_score(y_true, y_pred)

# Function for model evaluation
def evaluate_model(y_true, y_pred):
    print(classification_report(y_true, y_pred))

# Function for model validation
def validate_model(model, train_data, y_train, validation_data, y_val):
    print("Train score:", score_model(model, train_data, y_train))
    print("Validation score:", score_model(model, validation_data, y_val))

In [48]:
# Model Selection from 3 benchmarked models
model_names = ['logistic_regression', 'random_forest', 'svm']

In [49]:
# Load data
train_data = load_split_data(r'/content/drive/MyDrive/Applied_ML_A1/train.csv')
validation_data = load_split_data(r'/content/drive/MyDrive/Applied_ML_A1/validation.csv')
test_data = load_split_data(r'/content/drive/MyDrive/Applied_ML_A1/test.csv')


In [50]:
X_train = train_data['text']
y_train = train_data['spam']
X_val = validation_data['text']
y_val = validation_data['spam']
X_test = test_data['text']
y_test = test_data['spam']

## Training and evaluation

In [51]:
# Training and evaluation
for model_name in tqdm(model_names, desc="Training Models"):
    print(f"\nTraining with {model_name}:")
    model = fit_model(X_train, y_train, model_name)

    # Score on train and validation
    validate_model(model, X_train, y_train, X_val, y_val)

    # Evaluate on train and validation
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    print("For training.\n")
    evaluate_model(y_train, y_pred_train)
    print("For validation.\n")
    evaluate_model(y_val, y_pred_val)

Training Models:   0%|          | 0/3 [00:00<?, ?it/s]


Training with logistic_regression:
Train score: 0.9949803579223047
Validation score: 0.9825378346915018
For training.

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3504
           1       1.00      0.98      0.99      1078

    accuracy                           0.99      4582
   macro avg       1.00      0.99      0.99      4582
weighted avg       0.99      0.99      0.99      4582

For validation.

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       645
           1       1.00      0.93      0.96       214

    accuracy                           0.98       859
   macro avg       0.99      0.97      0.98       859
weighted avg       0.98      0.98      0.98       859


Training with random_forest:
Train score: 1.0
Validation score: 0.9790454016298021
For training.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Based on Recall and Precission Logistic Regression is the best out of 3 benchmarked models before parameter tuning.

# Tune hyperparameters of Logistic Regression model

In [57]:

# Combine the training and validation sets for final fine-tuning
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

In [58]:
# Define the base models
models_gcv = {
    'logistic_regression': Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression(random_state=42))
    ])
}

# Define the parameter grids for each model
param_grids = {
    'logistic_regression': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__C': [0.1, 1, 10]
    }
}

In [59]:
best_models = {}
for model_name in tqdm(models_gcv.keys(), desc="Grid Searching Models"):
    print(f"Starting grid search for {model_name}...")
    grid_search = GridSearchCV(models_gcv[model_name], param_grids[model_name], cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_val,y_train_val )

    print(f"Best parameters for {model_name}:")
    print(grid_search.best_params_)

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

Grid Searching Models:   0%|          | 0/1 [00:00<?, ?it/s]

Starting grid search for logistic_regression...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters for logistic_regression:
{'clf__C': 10, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}


In [60]:
best_models

{'logistic_regression': Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5, stop_words='english')),
                 ('clf', LogisticRegression(C=10, random_state=42))])}

In [61]:
# Dictionary to store the test accuracy of each model
test_accuracies = {}

for model_name, model in tqdm(best_models.items()):
    # Predicting on the test set
    y_pred = model.predict(X_test)

    # Calculating accuracy
    accuracy = accuracy_score(y_test, y_pred)
    test_accuracies[model_name] = accuracy

    print(f"{model_name} Test Accuracy: {accuracy:.8f}")


  0%|          | 0/1 [00:00<?, ?it/s]

logistic_regression Test Accuracy: 0.98954704


### Logistic_Regression gives the best score after Grid Search CV on test data.

-- This may change depending on random state and data split

In [62]:
# Determine the model with the best test accuracy
best_model_name = max(test_accuracies, key=test_accuracies.get)

## Classification report on Test Data for best model

In [63]:
# Assuming best_model_name is the name of the best model determined from previous steps
best_model = best_models[best_model_name]

# Generate predictions and classification reports for both train and test sets
datasets = {
    'Train': (X_train_val, y_train_val),
    'Test': (X_test, y_test)
}

for phase, (features, labels) in tqdm(datasets.items(), desc="Evaluating Best Model"):
    y_pred = best_model.predict(features)
    report = classification_report(labels, y_pred)
    print(f"{phase} Classification Report for {best_model_name}:\n{report}\n")


Evaluating Best Model:   0%|          | 0/2 [00:00<?, ?it/s]

Train Classification Report for logistic_regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4149
           1       1.00      1.00      1.00      1292

    accuracy                           1.00      5441
   macro avg       1.00      1.00      1.00      5441
weighted avg       1.00      1.00      1.00      5441


Test Classification Report for logistic_regression:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       211
           1       0.99      0.97      0.98        76

    accuracy                           0.99       287
   macro avg       0.99      0.98      0.99       287
weighted avg       0.99      0.99      0.99       287




-------------------------------------------------------------------------------------------------------------------------------