In [9]:
# Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Scikit-learn model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

# Scikit-learn ensemble methods
from sklearn.ensemble import StackingClassifier

In [5]:
# Build and evaluate multiple classifiers using TF-IDF transformed text data. 
def model_building(df):
    """Fit a fixed panel of classifiers on TF-IDF features and score each one.

    The documents are split into train/test partitions (80/20,
    ``random_state=2``) *before* vectorization, so the TF-IDF vocabulary and
    IDF weights are learned from the training documents only and the held-out
    scores are not inflated by information from the test partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'transformed_text'`` column (documents passed to
        ``TfidfVectorizer``) and a ``'spam'`` column (target labels).
        ``precision_score`` is called with its defaults, so the labels are
        presumably binary -- confirm against the cell that builds ``df``.

    Returns
    -------
    performance_df : pandas.DataFrame
        Columns ``'Algorithm'``, ``'Accuracy'`` and ``'Precision'``; one row
        per classifier, sorted by ``'Precision'`` in descending order.
    clfs : dict
        Short name -> estimator, each fitted on the training partition.
    """
    X = df['transformed_text']
    y = df['spam']

    # Split the raw text first: fitting the vectorizer on the full corpus
    # would leak test-set vocabulary/IDF statistics into training.
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2
    )

    # TF-IDF vectorization: learn on train, only apply to test.
    tfidf = TfidfVectorizer(max_features=3000)
    X_train = tfidf.fit_transform(X_train_text)
    X_test = tfidf.transform(X_test_text)

    # Initialize classifiers
    clfs = {
        'SVC': SVC(kernel='sigmoid', gamma=1.0),
        'KN': KNeighborsClassifier(),
        'NB': MultinomialNB(),
        'DT': DecisionTreeClassifier(max_depth=5),
        'LR': LogisticRegression(solver='liblinear', penalty='l1'),
        'RF': RandomForestClassifier(n_estimators=50, random_state=2),
        'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=2),
        'BgC': BaggingClassifier(n_estimators=50, random_state=2),
        'ETC': ExtraTreesClassifier(n_estimators=50, random_state=2),
        'GBDT': GradientBoostingClassifier(n_estimators=50, random_state=2),
        'xgb': XGBClassifier(n_estimators=50, random_state=2)
    }

    def train_classifier(clf, X_train, y_train, X_test, y_test):
        """Fit ``clf`` and return its (accuracy, precision) on the test data."""
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        return accuracy_score(y_test, y_pred), precision_score(y_test, y_pred)

    accuracy_scores = []
    precision_scores = []

    # Train and evaluate each classifier
    for name, clf in clfs.items():
        current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
        print("For ", name)
        print("Accuracy - ", current_accuracy)
        print("Precision - ", current_precision)
        accuracy_scores.append(current_accuracy)
        precision_scores.append(current_precision)

    # Store the performance metrics, best precision first.
    performance_df = pd.DataFrame({
        'Algorithm': list(clfs),
        'Accuracy': accuracy_scores,
        'Precision': precision_scores,
    }).sort_values('Precision', ascending=False)

    return performance_df, clfs


In [6]:
def validate_models(df, models):
    """Fit each model on a training split and compare train vs. validation metrics.

    The documents are split 80/20 (``random_state=2``) before vectorization so
    that the TF-IDF vocabulary and IDF weights come from the training
    documents only; the validation partition is merely transformed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'transformed_text'`` column (documents) and a
        ``'spam'`` column (target labels).
    models : dict
        Name -> estimator (plain classifier or a search wrapper such as
        ``GridSearchCV``). Every estimator is fitted **in place**.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``'Algorithm'``, ``'Train Score'``,
        ``'Validation Score'``, ``'Train Accuracy'``, ``'Train Precision'``,
        ``'Validation Accuracy'``, ``'Validation Precision'`` and
        ``'Validation Report'`` (the ``classification_report`` output).
    """
    X = df['transformed_text']
    y = df['spam']

    # Split the raw text first so the vectorizer never sees validation data.
    X_train_text, X_val_text, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=2
    )

    # TF-IDF vectorization: learn on train, only apply to validation.
    # NOTE(review): for search wrappers the inner CV folds still share this
    # one vectorizer; a Pipeline would be needed to isolate them as well.
    tfidf = TfidfVectorizer(max_features=3000)
    X_train = tfidf.fit_transform(X_train_text)
    X_val = tfidf.transform(X_val_text)

    results = []

    for name, clf in models.items():
        print(f"Validating {name}...")

        # Fit on train data (for search wrappers this runs the whole search).
        clf.fit(X_train, y_train)

        # Score on train and validation sets
        train_score = clf.score(X_train, y_train)
        val_score = clf.score(X_val, y_val)

        # Predictions on train and validation sets
        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)

        # Evaluate on train and validation sets
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        val_precision = precision_score(y_val, y_val_pred)
        val_report = classification_report(y_val, y_val_pred)

        # A fitted search object exposes best_params_. Its score()/predict()
        # calls above already delegate to the refit best estimator, so there
        # is nothing to re-bind here; only the progress message is emitted.
        if hasattr(clf, 'best_params_'):
            print(f"Fine-tuning {name}...")

        results.append({
            'Algorithm': name,
            'Train Score': train_score,
            'Validation Score': val_score,
            'Train Accuracy': train_accuracy,
            'Train Precision': train_precision,
            'Validation Accuracy': val_accuracy,
            'Validation Precision': val_precision,
            'Validation Report': val_report
        })

    return pd.DataFrame(results)

In [7]:
def build_and_validate_models(df):
    """Run the baseline benchmark, then validate tuned variants of the models.

    ``model_building`` produces the baseline scores and the classifier
    dictionary. Every classifier that has an entry in the tuning grid below
    is wrapped in a 3-fold ``GridSearchCV``; the rest are passed through
    unchanged. The resulting dictionary is handed to ``validate_models``.

    Parameters
    ----------
    df : pandas.DataFrame
        Forwarded as-is to ``model_building`` and ``validate_models``.

    Returns
    -------
    tuple
        ``(performance_df, validation_results)`` -- the baseline table from
        ``model_building`` and the table returned by ``validate_models``.
    """
    performance_df, baseline_clfs = model_building(df)

    # Hyper-parameter grids, keyed by the same short names model_building uses.
    boosting_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
    tuning_grids = {
        'SVC': {'kernel': ['sigmoid', 'rbf'], 'gamma': [1.0, 0.1, 0.01]},
        'DT': {'max_depth': [3, 5, 7]},
        'RF': {'n_estimators': [50, 100, 200]},
        'AdaBoost': dict(boosting_grid),
        'GBDT': dict(boosting_grid),
        'xgb': dict(boosting_grid),
    }

    # New dict (the one returned by model_building is left untouched) in which
    # tunable entries are replaced by a grid search over their estimator.
    candidates = {
        label: (
            GridSearchCV(estimator, tuning_grids[label], cv=3)
            if label in tuning_grids
            else estimator
        )
        for label, estimator in baseline_clfs.items()
    }

    validation_results = validate_models(df, candidates)

    return performance_df, validation_results

In [8]:
def best_model(df, benchmark_models):
    """Refit the benchmark models, pick the strongest, and score a stacked ensemble.

    Steps:
      1. Split the documents 80/20 and build TF-IDF features, fitting the
         vectorizer on the training documents only.
      2. Fit every benchmark model on the training partition (in place).
      3. Score each model (accuracy, precision) on the test partition.
      4. Select the model name with the highest (accuracy, precision) pair.
      5. Fit a ``StackingClassifier`` over the same benchmark models with a
         random-forest final estimator and score it on the test partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'transformed_text'`` column (documents) and a
        ``'spam'`` column (target labels).
    benchmark_models : dict
        Name -> estimator. Each estimator is (re)fitted in place.

    Returns
    -------
    tuple
        ``(benchmark_models, evaluation_scores, best_model_name,
        evaluation_scores_ensemble)`` where ``evaluation_scores`` maps each
        model name to ``{'Accuracy': ..., 'Precision': ...}``,
        ``best_model_name`` is a key of ``benchmark_models`` and
        ``evaluation_scores_ensemble`` is the same two-metric dict for the
        stacked ensemble.
    """
    # Same seed as the other cells of this notebook, so reruns are reproducible.
    seed = 2

    def train_test_split_df(test_size):
        """Split the text, then vectorize it without leaking test statistics."""
        X = df['transformed_text']
        y = df['spam']

        X_train_text, X_test_text, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )

        # Vocabulary and IDF weights come from the training documents only.
        tfidf = TfidfVectorizer()
        X_train = tfidf.fit_transform(X_train_text)
        X_test = tfidf.transform(X_test_text)

        return X_train, X_test, y_train, y_test

    def train_benchmark_models(benchmark_models, X_train, y_train):
        """Fit every model in place and hand the same mapping back."""
        for model in benchmark_models.values():
            model.fit(X_train, y_train)

        return benchmark_models

    def evaluate_benchmark_models(benchmark_models, X_test, y_test):
        """Return {name: {'Accuracy': ..., 'Precision': ...}} on the test data."""
        evaluation_scores = {}

        for name, model in benchmark_models.items():
            y_pred = model.predict(X_test)
            evaluation_scores[name] = {
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
            }

        return evaluation_scores

    def select_best_model(evaluation_scores):
        """Name of the model ranked first by accuracy, ties broken by precision."""
        return max(
            evaluation_scores,
            key=lambda k: (evaluation_scores[k]['Accuracy'], evaluation_scores[k]['Precision']),
        )

    def apply_stacking(models, X_train, y_train, X_test, y_test):
        """Fit a stacking ensemble over ``models`` and score it on the test data."""
        final_estimator = RandomForestClassifier(random_state=seed)

        clf = StackingClassifier(estimators=list(models.items()), final_estimator=final_estimator)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        return {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
        }

    X_train, X_test, y_train, y_test = train_test_split_df(0.2)

    benchmark_models = train_benchmark_models(benchmark_models, X_train, y_train)
    evaluation_scores = evaluate_benchmark_models(benchmark_models, X_test, y_test)
    # Named *_name so the local does not shadow this function's own name.
    best_model_name = select_best_model(evaluation_scores)
    # The ensemble is built from the benchmark models passed to this function
    # (the original referenced an undefined name `models` here).
    evaluation_scores_ensemble = apply_stacking(benchmark_models, X_train, y_train, X_test, y_test)

    return benchmark_models, evaluation_scores, best_model_name, evaluation_scores_ensemble