In [1]:
import pandas as pd
import pickle

from statistics import mean

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
def getDataset(path='spambase.data', label_col=57):
    """ Load a header-less CSV dataset and split it into Input matrix (X) and Output labels (y)
        Input:
            path: location of the CSV file to read (default: 'spambase.data')
            label_col: integer column that holds the output labels (default: 57);
                       every other column is kept as an input feature
        Return: dataset as a tuple (X, y)
        Raises: KeyError (from pandas) when label_col is not a column of the file"""

    # header=None makes pandas label the columns 0..n-1, so label_col is
    # effectively the position of the label column within each row.
    dataframe = pd.read_csv(path, header=None)

    X = dataframe.drop(label_col, axis=1)
    y = dataframe[label_col]

    return (X, y)

In [3]:
def getScore(model, X_train, X_test, y_train, y_test):
    """ Fit the given model on the training split and score it on the test split
        Input:
            model: Classification Model exposing fit() and score()
            X_train: input data used for fitting
            X_test: input data used for scoring
            y_train: output labels matching X_train
            y_test: output labels matching X_test
        Return: whatever model.score() reports for the test split """

    # The model passed in is fitted in place; the caller's object is modified.
    model.fit(X_train, y_train)

    test_score = model.score(X_test, y_test)
    return test_score

In [4]:
def evaluate(model, kfold, X, y):
    """ Score a model with stratified k-fold Cross-Validation and average the results
        Input:
            model: Classification Model (the same object is handed to getScore for every fold)
            kfold: number of folds the dataset is split into
            X: input data, indexed positionally through .iloc
            y: output labels, indexed positionally through .iloc
        Return: mean of the per-fold scores"""

    splitter = StratifiedKFold(n_splits=kfold)

    # One score per fold: fit on the training rows, score on the held-out rows.
    fold_scores = [
        getScore(
            model,
            X.iloc[train_rows],
            X.iloc[test_rows],
            y.iloc[train_rows],
            y.iloc[test_rows],
        )
        for train_rows, test_rows in splitter.split(X, y)
    ]

    return mean(fold_scores)

In [5]:
def main(argv):
    """ Load dataset and evaluate on various classification models
        Each model is scored twice: once with the hand-written evaluate() loop and
        once with sklearn's cross_val_score(), and the best scorer of each pass is reported.
        All the kfold values used below are assigned after iteratively checking the accuracy of the model
        Input:
            argv: command-line style argument list (currently unused)
        Return: None (results are printed)"""

    # Fixed seed for the estimators that use randomness, so a re-run reproduces
    # the same numbers and both evaluation passes fit identically seeded models.
    seed = 0

    # (heading label, name used in the summary line, estimator factory, number of folds)
    # A factory is used instead of an instance so every pass gets a fresh, unfitted model.
    candidates = [
        ('Logistic Regression', 'Logistic Regression', LogisticRegression, 9),
        ('SVM', 'SVM', SVC, 10),
        ('Decision Tree', 'Decision Tree',
         lambda: DecisionTreeClassifier(random_state=seed), 10),
        ('RandomForest', 'Random Forest',
         lambda: RandomForestClassifier(n_estimators=50, random_state=seed), 9),
    ]

    ### Read Dataset
    X, y = getDataset()

    def report(heading_suffix, summary_suffix, score_fn):
        """ Score every candidate with score_fn(model, folds), print each score and the best one """
        results = []
        for heading, summary_name, make_model, folds in candidates:
            print(heading + ' Classification' + heading_suffix + ':')
            score = score_fn(make_model(), folds)
            print(score)
            results.append((score, summary_name))

        # Report the winner from the measured scores instead of assuming one.
        best_score, best_name = max(results, key=lambda item: item[0])
        print('Best result is observed from ' + best_name + summary_suffix + ': ', best_score, '\n')

    ### Evaluate spam dataset on various Classifier
    report('', '', lambda model, folds: evaluate(model, folds, X, y))

    ### Another effective approach to evaluate a model using cross_val_score method
    print('Evaluate models using cross_val_score()')
    report(' using cross_val_score()', ' (using cross_val_score())',
           lambda model, folds: mean(cross_val_score(model, X, y, cv=folds)))

In [6]:
# Run the whole comparison when this file is executed as a script or notebook cell.
# main() never reads its argv parameter, so an empty list is passed.
if __name__ == '__main__':
    main([])

Logistic Regression Classification:
0.9239064901550422
SVM Classification:
0.8152513943005812
Decision Tree Classification:
0.898242619294042
RandomForest Classification:
0.9419343612635313
Best result is observed from Random Forest:  0.9419343612635313 

Evaluate models using cross_val_score()
Logistic Regression Classification using cross_val_score():
0.9239064901550422
SVM Classification using cross_val_score():
0.8152513943005812
Decision Tree Classification using cross_val_score():
0.8997634152951738
RandomForest Classification using cross_val_score():
0.9425926358937791
Best result is observed from Random Forest (using cross_val_score()):  0.9425926358937791 

