In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels, top_k=30):
    """Fit an L2-penalised logistic regression and print its top features and accuracy.

    Parameters
    ----------
    train_data, test_data
        Feature matrices passed to ``fit``/``predict``; must expose ``.shape[0]``
        (the number of samples) for the accuracy computation.
    train_labels, test_labels
        Ground-truth labels aligned with the rows of the corresponding matrix.
    top_k : int, default 30
        Maximum number of highest-weighted features to print per coefficient row.
        Clamped to the number of available features so small feature spaces
        do not raise an IndexError.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # Train model
    model = LogisticRegression(penalty='l2', random_state=42)
    model.fit(train_data, train_labels)

    # coef_ is 2-D: one row of per-feature weights for each fitted label row.
    importances = model.coef_
    several_rows = len(importances) > 1

    for i, weights in enumerate(importances):
        # Never request more entries than there are features.
        n_shown = max(0, min(top_k, len(weights)))
        # Sort descending so the most positive weights come first.
        indices = np.argsort(weights)[::-1][:n_shown]

        if several_rows:
            print('\nFeature ranking for label ' + str(i) + ':')
        else:
            print('Feature ranking:')
        for rank, idx in enumerate(indices, start=1):
            print('%d. Feature %d (%f)' % (rank, idx, weights[idx]))

    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model: accuracy = fraction of predictions equal to the true label
    print('\nLogistic Regression - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
    print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))

In [3]:
def decision_tree_classifier(train_data, test_data, train_labels, test_labels, top_k=30):
    """Fit four decision-tree variants and print top features and accuracy for each.

    The variants are: default settings, ``max_depth=10``, ``min_samples_leaf=2``
    and ``min_samples_split=3``; all use ``random_state=42``.

    Parameters
    ----------
    train_data, test_data
        Feature matrices passed to ``fit``/``predict``; must expose ``.shape[0]``
        (the number of samples) for the accuracy computation.
    train_labels, test_labels
        Ground-truth labels aligned with the rows of the corresponding matrix.
    top_k : int, default 30
        Maximum number of most important features to print per model. Clamped
        to the number of available features so small feature spaces do not
        raise an IndexError.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # Train model
    for model in [
        DecisionTreeClassifier(random_state=42),
        DecisionTreeClassifier(random_state=42, max_depth=10),
        DecisionTreeClassifier(random_state=42, min_samples_leaf=2),
        DecisionTreeClassifier(random_state=42, min_samples_split=3)
    ]:

        model.fit(train_data, train_labels)

        # Extract top features from model, most important first
        importances = model.feature_importances_
        # Never request more entries than there are features.
        n_shown = max(0, min(top_k, len(importances)))
        indices = np.argsort(importances)[::-1][:n_shown]

        print('Feature ranking:')
        for rank, idx in enumerate(indices, start=1):
            print('%d. Feature %d (%f)' % (rank, idx, importances[idx]))

        # Test model
        y_train_pred = model.predict(train_data)
        y_test_pred = model.predict(test_data)

        # Evaluate model: accuracy = fraction of predictions equal to the true label
        print('\nDecision Tree - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
        print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))
        print('-------------------------------------------')

In [12]:
# Load the 20 Newsgroups train and test subsets
ng_train = fetch_20newsgroups(subset='train')
ng_test = fetch_20newsgroups(subset='test')

# Raw documents for each split
X_train = ng_train.data
X_test = ng_test.data

# Each entry of `.target` is an integer index into `.target_names`;
# map every index to its name so the labels are human-readable strings.
y_train = [ng_train.target_names[label_idx] for label_idx in ng_train.target[:len(X_train)]]
y_test = [ng_test.target_names[label_idx] for label_idx in ng_test.target[:len(X_test)]]

# Sanity check: documents and labels must have matching lengths per split
for split in (X_train, X_test, y_train, y_test):
    print(len(split))

11314
7532
11314
7532


In [13]:
# Logistic Regression

# Converting text to vectors; the vocabulary is learned from the training split only
tfidf = TfidfVectorizer()
vect_X_train = tfidf.fit_transform(X_train)
vect_X_test = tfidf.transform(X_test)

print(vect_X_train.shape)
print(vect_X_test.shape)

model = LogisticRegression(penalty='l2', random_state=42)

model.fit(vect_X_train, y_train)
importances = model.coef_      # Scores of each feature, one row per label

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installations that do not have it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    features = tfidf.get_feature_names_out()
else:
    features = tfidf.get_feature_names()
# Object array so the names can be fancy-indexed without fixed-width string padding.
features = np.asarray(features, dtype=object)

# Store top k features (clamped so a tiny vocabulary cannot overrun)
top_k = min(30, len(features))
for i in range(len(importances)):
    print('\nFeatures for label ' + str(i) + ':')
    # Highest-weighted features first
    indices = np.argsort(importances[i])[::-1][:top_k]

    new_features = features[indices].tolist()
    print(new_features)
    print('-------------------------------------------')

# Test model
y_train_pred = model.predict(vect_X_train)
y_test_pred = model.predict(vect_X_test)

# Evaluate model: accuracy = fraction of predictions equal to the true label
print('\nLogistic Regression - \nTrain Accuracy: ', round(np.sum(y_train_pred == y_train) / vect_X_train.shape[0], 4))
print('-------------------------------------------')
print('Test Accuracy: ', round(np.sum(y_test_pred == y_test) / vect_X_test.shape[0], 4))

(11314, 129796)
(7532, 129796)

Features for label 0:
['keith', 'atheism', 'atheists', 'caltech', 'islamic', 'okcforum', 'islam', 'god', 'mathew', 'jaeger', 'livesey', 'rushdie', 'mangoe', 'benedikt', 'religion', 'osrhe', 'umd', 'wingate', 'edu', 'sgi', 'kmr4', 'bible', 'cobb', 'mantis', 'gregg', 'tek', 'solntze', 'ico', 'atheist', 'wwc']

Features for label 1:
['graphics', 'image', '3d', 'polygon', 'tiff', 'images', 'cview', 'format', 'files', '3do', 'pov', 'points', 'animation', 'package', 'vga', 'gif', 'sphere', 'color', 'vesa', 'algorithm', 'surface', 'program', '42', 'looking', 'library', 'file', 'code', 'software', 'quicktime', 'mpeg']

Features for label 2:
['windows', 'file', 'ax', 'driver', 'drivers', 'files', 'cica', 'dos', 'win', 'mouse', 'ms', 'ini', 'nt', 'risc', 'win3', 'fonts', 'ftp', 'printer', 'font', 'access', 'microsoft', 'bmp', 'using', 'program', 'card', 'manager', 'diamond', 'deskjet', 'norton', 'tw']

Features for label 3:
['drive', 'card', 'scsi', 'pc', 'ide', '

In [14]:
# Decision Tree

# Converting text to vectors; the vocabulary is learned from the training split only
tfidf = TfidfVectorizer()
vect_X_train = tfidf.fit_transform(X_train)
vect_X_test = tfidf.transform(X_test)

print(vect_X_train.shape)
print(vect_X_test.shape)

# Feature names do not change between models, so look them up once.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installations that do not have it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    features = tfidf.get_feature_names_out()
else:
    features = tfidf.get_feature_names()
# Object array so the names can be fancy-indexed without fixed-width string padding.
features = np.asarray(features, dtype=object)
top_k = min(30, len(features))

# Train one model per hyper-parameter variant
for model in [
    DecisionTreeClassifier(random_state=42),
    DecisionTreeClassifier(random_state=42, max_depth=10),
    DecisionTreeClassifier(random_state=42, min_samples_leaf=2),
    DecisionTreeClassifier(random_state=42, min_samples_split=3)
]:

    # Fit on the vectors and labels built in this notebook
    model.fit(vect_X_train, y_train)

    # Extract top features from model, most important first
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1][:top_k]

    # Store top k features
    new_features = features[indices].tolist()
    print('\nFeatures:')
    print(new_features)
    print('-------------------------------------------')

    # Test model
    y_train_pred = model.predict(vect_X_train)
    y_test_pred = model.predict(vect_X_test)

    # Evaluate model: accuracy = fraction of predictions equal to the true label
    print('\nDecision Tree - \nTrain Accuracy: ', round(np.sum(y_train_pred == y_train) / vect_X_train.shape[0], 4))
    print('-------------------------------------------')
    print('Test Accuracy: ', round(np.sum(y_test_pred == y_test) / vect_X_test.shape[0], 4))

(11314, 129796)
(7532, 129796)
Features:
['clipper', 'dod', 'sale', 'windows', 'hockey', 'car', 'god', 'space', 'israel', 'gun', 'baseball', 'bike', 'turkish', 'mac', 'graphics', 'window', 'rutgers', 'geb', 'atheists', 'nhl', 'encryption', 'motif', 'article', 'apple', 'orbit', 'edu', 'controller', 'team', 'com', 'waco']

Decision Tree - 
Train Accuracy:  0.9999
Test Accuracy:  0.5645
-------------------------------------------
Features:
['clipper', 'dod', 'sale', 'windows', 'hockey', 'space', 'israel', 'car', 'god', 'gun', 'rutgers', 'article', 'bike', 'motif', 'guns', 'monitor', 'christ', 'bih', 'dan', 'graphics', 'scsi', 'xlib', 'radio', 'delta', 'companies', 'os', 'daker', 'open', 'romulus', 'kasajian']

Decision Tree - 
Train Accuracy:  0.3442
Test Accuracy:  0.3002
-------------------------------------------
Features:
['clipper', 'sale', 'dod', 'windows', 'hockey', 'space', 'god', 'car', 'israel', 'gun', 'baseball', 'bike', 'turkish', 'mac', 'graphics', 'window', 'rutgers', 'geb',