In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# creates absolute path
def abspath(path, *paths):
    """Join ``path`` and any extra ``paths`` onto the parent of the working directory.

    The result starts with ``os.getcwd()`` followed by ``os.pardir`` and is
    returned exactly as joined: it is not normalised, so ``..`` segments stay
    in the string verbatim.
    """
    parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
    return os.path.join(parent_of_cwd, path, *paths)

In [3]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels, top_n=30):
    """Fit an L2-penalised logistic regression and report its top features and accuracy.

    Everything is reported through ``print``; nothing is returned.

    Parameters
    ----------
    train_data, test_data
        Feature matrices. ``shape[0]`` of each is used as the sample count
        when computing accuracy.
    train_labels, test_labels
        Ground-truth labels, compared element-wise against the predictions.
    top_n : int, optional
        Maximum number of features listed per ranking. Defaults to 30 and is
        capped at the number of coefficients, so datasets with fewer features
        no longer raise ``IndexError``.

    Notes
    -----
    Features are ordered by *signed* coefficient, largest first, so strongly
    negative coefficients land at the end of the ordering, not the top.
    """
    # Train model
    model = LogisticRegression(penalty='l2', random_state=42)
    model.fit(train_data, train_labels)

    # Extract top features from model: one ranking per row of coef_
    importances = model.coef_
    several_rankings = len(importances) > 1

    for i, weights in enumerate(importances):
        indices = np.argsort(weights)[::-1]

        if several_rankings:
            print('\nFeature ranking for label ' + str(i) + ':')
        else:
            print('Feature ranking:')

        # Never index past the available coefficients
        for f in range(min(top_n, len(indices))):
            print('%d. Feature %d (%f)' % (f + 1, indices[f], weights[indices[f]]))

    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model: fraction of matching predictions, rounded to 4 decimals
    train_accuracy = np.sum(y_train_pred == train_labels) / train_data.shape[0]
    test_accuracy = np.sum(y_test_pred == test_labels) / test_data.shape[0]
    print('\nLogistic Regression - \nTrain Accuracy: ', round(train_accuracy, 4))
    print('Test Accuracy: ', round(test_accuracy, 4))

In [4]:
def decision_tree_classifier(train_data, test_data, train_labels, test_labels, top_n=30):
    """Fit four decision-tree variants and report each one's top features and accuracy.

    The variants share ``random_state=42`` and differ in one setting each:
    defaults, ``max_depth=10``, ``min_samples_leaf=2`` and
    ``min_samples_split=3``. They are fitted and reported in that order, with a
    dashed separator line printed after each. Nothing is returned.

    Parameters
    ----------
    train_data, test_data
        Feature matrices. ``shape[0]`` of each is used as the sample count
        when computing accuracy.
    train_labels, test_labels
        Ground-truth labels, compared element-wise against the predictions.
    top_n : int, optional
        Maximum number of features listed per ranking. Defaults to 30 and is
        capped at the number of importances, so datasets with fewer features
        no longer raise ``IndexError``.
    """
    candidates = [
        DecisionTreeClassifier(random_state=42),
        DecisionTreeClassifier(random_state=42, max_depth=10),
        DecisionTreeClassifier(random_state=42, min_samples_leaf=2),
        DecisionTreeClassifier(random_state=42, min_samples_split=3),
    ]

    for model in candidates:
        # Train model
        model.fit(train_data, train_labels)

        # Extract top features from model, highest importance first
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]

        print('Feature ranking:')
        # Never index past the available importances
        for f in range(min(top_n, len(indices))):
            print('%d. Feature %d (%f)' % (f + 1, indices[f], importances[indices[f]]))

        # Test model
        y_train_pred = model.predict(train_data)
        y_test_pred = model.predict(test_data)

        # Evaluate model: fraction of matching predictions, rounded to 4 decimals
        train_accuracy = np.sum(y_train_pred == train_labels) / train_data.shape[0]
        test_accuracy = np.sum(y_test_pred == test_labels) / test_data.shape[0]
        print('\nDecision Tree - \nTrain Accuracy: ', round(train_accuracy, 4))
        print('Test Accuracy: ', round(test_accuracy, 4))
        print('-------------------------------------------')

In [5]:
# Fetch data
spambase_path = abspath('..', 'datasets', 'spambase.data')
print(spambase_path)
# Hand the path to NumPy so it opens and closes the file itself; the previous
# bare open(...) call left the file handle unclosed.
spambase_dataset = np.loadtxt(spambase_path, delimiter=',')

# Data and labels: the last column is the label, every other column is a feature.
# Both slices are views onto spambase_dataset rather than copies.
spambase_data = spambase_dataset[:, :-1]
spambase_labels = spambase_dataset[:, -1]

print(spambase_dataset.shape)
print(spambase_data.shape)
print(spambase_labels.shape)

C:\Users\Ashton\Documents\GitHub\Machine-Learning-Experiments\Feature Extraction\Feature Selection\..\..\datasets\spambase.data
(4601, 58)
(4601, 57)
(4601,)


In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    spambase_data,
    spambase_labels,
    test_size=0.20,
    random_state=42,
)

# One shape per line: training split first, then the test split
print(X_train.shape, X_test.shape, sep='\n')

(3680, 57)
(921, 57)


In [7]:
# Report the logistic-regression feature ranking and train/test accuracy
l2_logistic_regression(
    train_data=X_train,
    test_data=X_test,
    train_labels=y_train,
    test_labels=y_test,
)

Feature ranking:
1. Feature 52 (3.584241)
2. Feature 22 (2.325232)
3. Feature 6 (2.196551)
4. Feature 14 (1.476576)
5. Feature 53 (1.111259)
6. Feature 35 (1.021773)
7. Feature 15 (1.012616)
8. Feature 16 (0.967225)
9. Feature 19 (0.940994)
10. Feature 3 (0.721762)
11. Feature 8 (0.631920)
12. Feature 4 (0.613005)
13. Feature 5 (0.607756)
14. Feature 7 (0.513041)
15. Feature 23 (0.353613)
16. Feature 27 (0.333023)
17. Feature 51 (0.246687)
18. Feature 20 (0.233609)
19. Feature 21 (0.229864)
20. Feature 2 (0.158968)
21. Feature 33 (0.140470)
22. Feature 9 (0.094108)
23. Feature 18 (0.092446)
24. Feature 13 (0.087026)
25. Feature 17 (0.066873)
26. Feature 31 (0.058252)
27. Feature 12 (0.008956)
28. Feature 55 (0.006827)
29. Feature 56 (0.000874)
30. Feature 54 (-0.011644)

Logistic Regression - 
Train Accuracy:  0.9323
Test Accuracy:  0.9229


In [8]:
# Report feature rankings and train/test accuracy for each decision-tree variant
decision_tree_classifier(
    train_data=X_train,
    test_data=X_test,
    train_labels=y_train,
    test_labels=y_test,
)

Feature ranking:
1. Feature 52 (0.339152)
2. Feature 6 (0.161860)
3. Feature 51 (0.084357)
4. Feature 54 (0.054901)
5. Feature 24 (0.052564)
6. Feature 55 (0.032372)
7. Feature 15 (0.031038)
8. Feature 26 (0.024413)
9. Feature 56 (0.018537)
10. Feature 45 (0.016740)
11. Feature 18 (0.015898)
12. Feature 4 (0.015344)
13. Feature 16 (0.011050)
14. Feature 11 (0.010865)
15. Feature 23 (0.010063)
16. Feature 34 (0.008090)
17. Feature 20 (0.007849)
18. Feature 8 (0.006836)
19. Feature 17 (0.006442)
20. Feature 35 (0.006228)
21. Feature 9 (0.006214)
22. Feature 7 (0.006209)
23. Feature 44 (0.005744)
24. Feature 48 (0.005350)
25. Feature 10 (0.005001)
26. Feature 5 (0.004893)
27. Feature 12 (0.004819)
28. Feature 38 (0.004345)
29. Feature 49 (0.003510)
30. Feature 47 (0.003171)

Decision Tree - 
Train Accuracy:  0.9995
Test Accuracy:  0.9197
-------------------------------------------
Feature ranking:
1. Feature 52 (0.379143)
2. Feature 6 (0.181890)
3. Feature 51 (0.090861)
4. Feature 24 (0.0