# Import Necessary Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from tqdm.auto import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Mount Google Drive

In [2]:
# Colab-only step: attach Google Drive to this runtime's filesystem so that
# files under the mount point can be read with ordinary paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Define Necessary Functions

In [26]:
# Function to load a stored data split
def load_split_data(file_path):
    """Read one previously saved data split from a CSV file.

    Args:
        file_path: Location of the CSV; handed unchanged to ``pd.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with its
        default parsing options.
    """
    split_frame = pd.read_csv(file_path)
    return split_frame

# Function to fit models
def fit_model(train_data, y_train, model_name='logistic_regression'):
    """Build a TF-IDF + classifier pipeline for ``model_name`` and fit it.

    Every supported model shares the same first step, a ``TfidfVectorizer``
    configured with ``stop_words='english'``; only the final ``'clf'`` step
    differs between models.

    Args:
        train_data: Training inputs, passed unchanged to ``Pipeline.fit``
            (presumably an iterable of raw text documents, since the first
            step is a TF-IDF vectorizer -- confirm against the caller).
        y_train: Training labels, passed unchanged to ``Pipeline.fit``.
        model_name: ``'logistic_regression'`` (default), ``'random_forest'``
            or ``'svm'``.

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'clf'``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names. The
            check runs before any estimator is constructed or fitted.
    """
    supported_names = ('logistic_regression', 'random_forest', 'svm')
    if model_name not in supported_names:
        raise ValueError("Model name not recognized. Choose 'logistic_regression', 'random_forest', or 'svm'")

    # Factories rather than instances, so only the requested classifier is built.
    classifier_factories = {
        'logistic_regression': lambda: LogisticRegression(random_state=42),
        'random_forest': lambda: RandomForestClassifier(random_state=42),
        'svm': lambda: SVC(),
    }

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', classifier_factories[model_name]()),
    ])
    pipeline.fit(train_data, y_train)
    return pipeline

# Function to calculate scores of different models
def score_model(model, data, y_true):
    """Return the accuracy of ``model``'s predictions on ``data``.

    Args:
        model: Object exposing ``predict(data)``.
        data: Inputs forwarded to ``model.predict``.
        y_true: Reference labels the predictions are compared against.

    Returns:
        Whatever ``accuracy_score(y_true, predictions)`` returns.
    """
    return accuracy_score(y_true, model.predict(data))

# Function for model evaluation
def evaluate_model(y_true, y_pred):
    """Print the classification report for a set of predictions.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels to compare against ``y_true``.

    Returns:
        None. The report text from ``classification_report`` is written to
        standard output.
    """
    report_text = classification_report(y_true, y_pred)
    print(report_text)

# Function for model validation
def validate_model(model, train_data, y_train, validation_data, y_val):
    """Print the model's score on the training and validation splits.

    Scores come from ``score_model``. The training score is computed and
    printed first, then the validation score.

    Args:
        model: Fitted model forwarded to ``score_model``.
        train_data: Training inputs.
        y_train: Training labels.
        validation_data: Validation inputs.
        y_val: Validation labels.

    Returns:
        None. Two lines are written to standard output.
    """
    labelled_splits = (
        ("Train score:", train_data, y_train),
        ("Validation score:", validation_data, y_val),
    )
    for label, features, targets in labelled_splits:
        print(label, score_model(model, features, targets))

# Model Selection from 3 benchmarked models

In [27]:
# Candidate models to benchmark; each entry is a name accepted by fit_model.
model_names = [
    'logistic_regression',
    'random_forest',
    'svm',
]

In [28]:
# Load the train / validation / test splits from the mounted Drive folder.
DATA_DIR = r'/content/drive/MyDrive/Applied_ML_A1'

train_data = load_split_data(f'{DATA_DIR}/train.csv')
validation_data = load_split_data(f'{DATA_DIR}/validation.csv')
test_data = load_split_data(f'{DATA_DIR}/test.csv')


In [29]:
# For each split, take the 'text' column as model input and the 'spam'
# column as the target label.
X_train, y_train = train_data['text'], train_data['spam']
X_val, y_val = validation_data['text'], validation_data['spam']
X_test, y_test = test_data['text'], test_data['spam']

## Training and evaluation

In [30]:
# Training and evaluation: fit each benchmarked model, then report accuracy
# and a full classification report on the training and validation splits.
for model_name in tqdm(model_names, desc="Training Models"):
    print(f"\nTraining with {model_name}:")
    model = fit_model(X_train, y_train, model_name)

    # Predict once per split and reuse the predictions for both the accuracy
    # scores and the classification reports. Calling validate_model() here
    # would run predict() on both splits a second time for the same result.
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    # Same two lines validate_model() prints, computed from the cached predictions.
    print("Train score:", accuracy_score(y_train, y_pred_train))
    print("Validation score:", accuracy_score(y_val, y_pred_val))

    # Per-class precision / recall / F1 on each split.
    print("For training.\n")
    evaluate_model(y_train, y_pred_train)
    print("For validation.\n")
    evaluate_model(y_val, y_pred_val)

Training Models:   0%|          | 0/3 [00:00<?, ?it/s]


Training with logistic_regression:
Train score: 0.9949803579223047
Validation score: 0.9825378346915018
For training.

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3504
           1       1.00      0.98      0.99      1078

    accuracy                           0.99      4582
   macro avg       1.00      0.99      0.99      4582
weighted avg       0.99      0.99      0.99      4582

For validation.

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       645
           1       1.00      0.93      0.96       214

    accuracy                           0.98       859
   macro avg       0.99      0.97      0.98       859
weighted avg       0.98      0.98      0.98       859


Training with random_forest:
Train score: 1.0
Validation score: 0.9790454016298021
For training.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3504
 

# Based on recall and precision, Logistic Regression is the best of the 3 benchmarked models before parameter tuning. Also, the SVM and Random Forest models may be overfitting the data, as the accuracy score on the training data is 1 for both models.

# Tune hyperparameters of Logistic Regression model

In [32]:
# Pipeline to tune: TF-IDF features (English stop words removed) feeding a
# logistic-regression classifier.
LR_model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(random_state=42)),
])

# Search space keyed as <pipeline step>__<parameter>:
# 2 max_df values x 2 n-gram ranges x 3 values of C = 12 candidates.
LR_model_params = dict(
    tfidf__max_df=[0.5, 0.75],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.1, 1, 10],
)

In [49]:
# Exhaustive search over LR_model_params with 3-fold cross-validation;
# n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=LR_model,
    param_grid=LR_model_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Heading on one line, the winning parameter dict on the next.
print("Best parameters for LR_model :", grid_search.best_params_, sep="\n")

# Keep the best pipeline found by the search for the final test evaluation.
tuned_LR_model = grid_search.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters for LR_model :
{'clf__C': 10, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}


## Classification report on Test Data for Tuned Logistic Regression Model

In [55]:
# Evaluate the tuned pipeline on the test split, which no earlier cell uses
# for fitting or tuning.
y_pred = tuned_LR_model.predict(X_test)
report = classification_report(y_test, y_pred)

# Heading, then the report, then one empty line.
print("Classification Report for Tuned_LR_Model:", report, "", sep="\n")

Classification Report for Tuned_LR_Model:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       211
           1       0.99      0.96      0.97        76

    accuracy                           0.99       287
   macro avg       0.99      0.98      0.98       287
weighted avg       0.99      0.99      0.99       287


