In [1]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# Load the data
data = pd.read_csv('spam.csv', header=None)

# Assuming the last column is the target variable and the rest are features
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Checking data types of all features
print("The data types")
print(data.dtypes)

# Check for missing values
missing_values = data.isnull().sum()

# Check for NaN values
nan_values = data.isna().sum()

# Print the results
print("Missing Values:")
print(missing_values)

print("\nNaN Values:")
print(nan_values)

# Exclude the target variable
target_variable = data.iloc[:, -1]

# Check the distribution of the target variable
class_distribution = target_variable.value_counts()

# Print the distribution
print("Class Distribution:")
print(class_distribution)

# Calculate the balance ratio
balance_ratio = class_distribution[1] / class_distribution[0]
print("Balance Ratio:", balance_ratio)

The data types
0      object
1      object
2      object
3      object
4      object
5      object
6      object
7      object
8      object
9      object
10     object
11     object
12     object
13     object
14     object
15     object
16     object
17     object
18     object
19     object
20     object
21     object
22    float64
23     object
24     object
25     object
26     object
27    float64
28     object
29     object
30     object
31    float64
32     object
33    float64
34    float64
35     object
36    float64
37     object
38     object
39     object
40     object
41     object
42     object
43     object
44     object
45     object
46     object
47     object
48     object
49     object
50     object
51     object
52     object
53     object
54     object
55     object
56     object
57     object
dtype: object
Missing Values:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0

In [3]:
# Check for categorical features
categorical_features = X.select_dtypes(include=['object']).columns
if not categorical_features.empty:
    # If categorical features are found, one-hot encode them
    X = pd.get_dummies(X, columns=categorical_features)

# Convert feature names to strings
X.columns = X.columns.astype(str)

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=3601, random_state=42)

In [5]:
# Fused model with Decision Tree, Gaussian Naïve Bayes, and Logistic Regression

# Create individual classifiers
dt_classifier = DecisionTreeClassifier(random_state=42)
nb_classifier = GaussianNB()
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)

# Create the fused model using the majority voting rule
fused_model = VotingClassifier(estimators=[
    ('decision_tree', dt_classifier),
    ('naive_bayes', nb_classifier),
    ('logistic_regression', lr_classifier)
], voting='hard')

# Train and evaluate the fused model
fused_model.fit(X_train[:1000], y_train[:1000])
y_pred_fused = fused_model.predict(X_test)

# Print accuracy, per class accuracy, and confusion matrix for the fused model
print("Fused Model Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_fused))
print("Classification Report:\n", classification_report(y_test, y_pred_fused,zero_division=1))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_fused))


Fused Model Results:
Accuracy: 0.9133574007220217
Classification Report:
               precision    recall  f1-score   support

       Class       1.00      0.00      0.00         1
         ham       0.93      0.93      0.93      2190
        spam       0.89      0.89      0.89      1410

    accuracy                           0.91      3601
   macro avg       0.94      0.61      0.61      3601
weighted avg       0.91      0.91      0.91      3601

Confusion Matrix:
 [[   0    1    0]
 [   0 2028  162]
 [   0  149 1261]]


In [6]:
# AdaBoost Ensemble with Decision Trees as base learner
# Create AdaBoost model with Decision Tree as the base learner
adaboost_model = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=42), n_estimators=50)

# Train and evaluate the AdaBoost model
adaboost_model.fit(X_train[:1000], y_train[:1000])
y_pred_adaboost = adaboost_model.predict(X_test)

# Print accuracy, per class accuracy, and confusion matrix for AdaBoost
print("\nAdaBoost Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_adaboost))
print("Classification Report:\n", classification_report(y_test, y_pred_adaboost,zero_division=1))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adaboost))


AdaBoost Results:
Accuracy: 0.8858650374895862
Classification Report:
               precision    recall  f1-score   support

       Class       1.00      0.00      0.00         1
         ham       0.90      0.91      0.91      2190
        spam       0.86      0.84      0.85      1410

    accuracy                           0.89      3601
   macro avg       0.92      0.59      0.59      3601
weighted avg       0.89      0.89      0.89      3601

Confusion Matrix:
 [[   0    0    1]
 [   0 2000  190]
 [   0  220 1190]]


In [7]:
# Compare with Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train[:1000], y_train[:1000])
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

# Report the results for Random Forest
print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(confusion_matrix_rf)
print("Classification Report:")
print(classification_report_rf)


Random Forest Results:
Accuracy: 0.9169675090252708
Confusion Matrix:
[[   0    1    0]
 [   0 2104   86]
 [   0  212 1198]]
Classification Report:
              precision    recall  f1-score   support

       Class       0.00      0.00      0.00         1
         ham       0.91      0.96      0.93      2190
        spam       0.93      0.85      0.89      1410

    accuracy                           0.92      3601
   macro avg       0.61      0.60      0.61      3601
weighted avg       0.92      0.92      0.92      3601



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# The impact of training sample size
# Repeat the above steps for different training-test splits (50%-50%, 60%-40%, 70%-30%, and 80%-20%)

for split in [0.5, 0.6, 0.7, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)

    # Train and evaluate the fused model
    fused_model.fit(X_train, y_train)
    y_pred_fused = fused_model.predict(X_test)

    # Train and evaluate the AdaBoost model
    adaboost_model.fit(X_train, y_train)
    y_pred_adaboost = adaboost_model.predict(X_test)
    
    rf_classifier.fit(X_train, y_train)
    y_pred_random_forest = rf_classifier.predict(X_test)

    print(f"\nResults for {int(split*100)}%-{int((1-split)*100)}% split:")
    print("Fused Model Accuracy:", accuracy_score(y_test, y_pred_fused))
    print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_adaboost))
    print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_random_forest))
    



Results for 50%-50% split:
Fused Model Accuracy: 0.9356801390699696
AdaBoost Accuracy: 0.9161234245980009
Random Forest Accuracy: 0.9274228596262495

Results for 60%-40% split:
Fused Model Accuracy: 0.9337436640115858
AdaBoost Accuracy: 0.8964518464880521
Random Forest Accuracy: 0.9272266473569877

Results for 70%-30% split:
Fused Model Accuracy: 0.9286157666045934
AdaBoost Accuracy: 0.9003724394785847
Random Forest Accuracy: 0.9252017380509001

Results for 80%-19% split:
Fused Model Accuracy: 0.915263443780554
AdaBoost Accuracy: 0.8843020097772949
Random Forest Accuracy: 0.9185225420966866
