# ***Summary***
**The best model is >> Random Forest with accuracy '93.4%'**

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.2-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [3]:
def calculate_acc(y_act,y_pred):
    # Calculate training accuracy
    print(f'Accuracy : {accuracy_score(y_act, y_pred)}')
    print(f'Classification Report \n {classification_report(y_act, y_pred)}')

    # Confusion matrix
    print("Confusion Matrix:\n", confusion_matrix(y_act, y_pred))


In [4]:
# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets 

In [5]:
# metadata 
print(spambase.metadata)  

{'uci_id': 94, 'name': 'Spambase', 'repository_url': 'https://archive.ics.uci.edu/dataset/94/spambase', 'data_url': 'https://archive.ics.uci.edu/static/public/94/data.csv', 'abstract': 'Classifying Email as Spam or Non-Spam', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 4601, 'num_features': 57, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1999, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C53G6X', 'creators': ['Mark Hopkins', 'Erik Reeber', 'George Forman', 'Jaap Suermondt'], 'intro_paper': None, 'additional_info': {'summary': 'The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n\nThe classification task for this dataset is to determine whether a given email is spam or not.\n\t\nOur collecti

In [6]:
# variable information 
print(spambase.variables) 

                          name     role        type demographic  \
0               word_freq_make  Feature  Continuous        None   
1            word_freq_address  Feature  Continuous        None   
2                word_freq_all  Feature  Continuous        None   
3                 word_freq_3d  Feature  Continuous        None   
4                word_freq_our  Feature  Continuous        None   
5               word_freq_over  Feature  Continuous        None   
6             word_freq_remove  Feature  Continuous        None   
7           word_freq_internet  Feature  Continuous        None   
8              word_freq_order  Feature  Continuous        None   
9               word_freq_mail  Feature  Continuous        None   
10           word_freq_receive  Feature  Continuous        None   
11              word_freq_will  Feature  Continuous        None   
12            word_freq_people  Feature  Continuous        None   
13            word_freq_report  Feature  Continuous        Non

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create a list of models to compare
models = [
    ("Linear Regression", LinearRegression()),
    ("Lasso Regression", Lasso(alpha=0.1)),
    ("Ridge Regression", Ridge(alpha=0.1)),
    ("Decision Tree", DecisionTreeRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor()),
    ("K-Nearest Neighbors", KNeighborsRegressor(n_neighbors=5)),
    ("SVM",SVC())
]

In [9]:
# Initialize lists to store model performance scores
mae_scores = []
mse_scores = []
rmse_scores = []
r2_scores = []

In [10]:
# Perform cross-validation for each model
for name, model in models:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation
    mae = -cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error').mean()
    mse = -cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error').mean()
    rmse = np.sqrt(mse)
    r2 = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2').mean()
    
    mae_scores.append(mae)
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r2)
    
# Create a DataFrame to compare model performance
results_df = pd.DataFrame({
    "Model": [name for name, _ in models],
    "MAE": mae_scores,
    "MSE": mse_scores,
    "RMSE": rmse_scores,
    "R2": r2_scores
})

# Display the results
print(results_df)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_

                 Model       MAE       MSE      RMSE        R2
0    Linear Regression  0.264275  0.112326  0.335151  0.525502
1     Lasso Regression  0.391896  0.201092  0.448433  0.147491
2     Ridge Regression  0.264247  0.112273  0.335072  0.525724
3        Decision Tree  0.097962  0.096128  0.310045  0.596419
4        Random Forest  0.101009  0.047325  0.217544  0.800449
5    Gradient Boosting  0.129683  0.050358  0.224405  0.787379
6  K-Nearest Neighbors  0.252989  0.147880  0.384552  0.375029
7                  SVM  0.286413  0.286413  0.535176 -0.209232


# ***Training***

# *Training on heighest R2*

In [11]:
model = RandomForestRegressor()
model.fit(X,y)

  model.fit(X,y)


In [12]:
#Calculate training accuracy
y_train_pred1 = model.predict(X_train)

In [13]:
for i in range(len(y_train_pred1)):
    if y_train_pred1[i]>0.9:
        y_train_pred1[i]=1
    else:y_train_pred1[i]=0

In [14]:
# Printing accuracies
print('\t Training accuracy')
calculate_acc(y_train, y_train_pred1)

	 Training accuracy
Accuracy : 0.946195652173913
Classification Report 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      2257
           1       1.00      0.86      0.93      1423

    accuracy                           0.95      3680
   macro avg       0.96      0.93      0.94      3680
weighted avg       0.95      0.95      0.95      3680

Confusion Matrix:
 [[2257    0]
 [ 198 1225]]


# *Testing on Random Forest*

In [15]:
#Calculate training accuracy
y_test_pred1 = model.predict(X_test)

In [16]:
for i in range(len(y_test_pred1)):
    if y_test_pred1[i]>0.9:
        y_test_pred1[i]=1
    else:y_test_pred1[i]=0

In [17]:
# Printing accuracies
print('\t Testing accuracy')
calculate_acc(y_test,y_test_pred1)

	 Testing accuracy
Accuracy : 0.9272529858849077
Classification Report 
               precision    recall  f1-score   support

           0       0.89      1.00      0.94       531
           1       1.00      0.83      0.91       390

    accuracy                           0.93       921
   macro avg       0.94      0.91      0.92       921
weighted avg       0.94      0.93      0.93       921

Confusion Matrix:
 [[531   0]
 [ 67 323]]


# *Training on heighest RMSE*

In [18]:
svm_classifier=SVC()
svm_classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [19]:
# Make predictions on the training data
y_train_pred2 = svm_classifier.predict(X_train)

In [20]:
# Printing accuracies
print('\t Training accuracy')
calculate_acc(y_train, y_train_pred2)

	 Training accuracy
Accuracy : 0.7135869565217391
Classification Report 
               precision    recall  f1-score   support

           0       0.72      0.88      0.79      2257
           1       0.70      0.45      0.55      1423

    accuracy                           0.71      3680
   macro avg       0.71      0.67      0.67      3680
weighted avg       0.71      0.71      0.70      3680

Confusion Matrix:
 [[1981  276]
 [ 778  645]]


# *Testing on heighest RMSE*

In [21]:
y_test_pred2 = svm_classifier.predict(X_test)

In [22]:
# Printing accuracies
print('\t Testing accuracy')
calculate_acc(y_test, y_test_pred2)

	 Testing accuracy
Accuracy : 0.6623235613463626
Classification Report 
               precision    recall  f1-score   support

           0       0.66      0.84      0.74       531
           1       0.66      0.42      0.51       390

    accuracy                           0.66       921
   macro avg       0.66      0.63      0.63       921
weighted avg       0.66      0.66      0.64       921

Confusion Matrix:
 [[448  83]
 [228 162]]
