In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import operator

In [2]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels):
    
    # Train model
    model = LogisticRegression(penalty='l2', random_state=42)
    model.fit(train_data, train_labels)

    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model
    print('\nLogistic Regression - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
    print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))

In [3]:
def decision_tree_classifier(train_data, test_data, train_labels, test_labels):

    # Train model
    model = DecisionTreeClassifier(random_state=42)
    model.fit(train_data, train_labels)
    
    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model
    print('\nDecision Tree - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
    print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))

In [4]:
# Fetch data
ng_train = fetch_20newsgroups(subset='train')
ng_test = fetch_20newsgroups(subset='test')

# Training and testing data
X_train = ng_train.data
X_test = ng_test.data

# Set of label names for training and testing data
# ng_train.target_names
# ng_test.target_names

# Training and testing labels, where each label is associated with a number corresponding to the index in target_names
# ng_train.target
# ng_test.target

# Training and testing labels
y_train = []
y_test = []

for i in range(len(X_train)):
    y_train.append(ng_train.target_names[ng_train.target[i]])
for i in range(len(X_test)):
    y_test.append(ng_test.target_names[ng_test.target[i]])

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

18846
18846
11314
7532
11314
7532


In [5]:
# Take the max coefficient

tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train)
vect_X_train = tfidf.transform(X_train)
vect_X_test = tfidf.transform(X_test)

print(vect_X_train.shape)
print(vect_X_test.shape)

l2_logistic_regression(vect_X_train, vect_X_test, y_train, y_test)
decision_tree_classifier(vect_X_train, vect_X_test, y_train, y_test)
print('---------------')

K = 200
for C in [0.1, 0.5, 1.0, 10.0]:
    print('C=' + str(C))
    model = LogisticRegression(penalty='l1', C=C, random_state=42)

    model.fit(vect_X_train, y_train)
    importances = model.coef_      # Scores of each feature

    # Store top k features
    features = tfidf.get_feature_names()
    new_features_weights = {}
    for i in range(len(importances)):
        indices = np.argsort(importances[i])[::-1]

        for f in range(K):
            if features[indices[f]] in new_features_weights:
                new_features_weights[features[indices[f]]] = max(new_features_weights[features[indices[f]]], importances[i][indices[f]])
            else:
                new_features_weights.setdefault(features[indices[f]], importances[i][indices[f]])
    
    new_features_weights = sorted(new_features_weights.items(), key=operator.itemgetter(1), reverse=True)[:K]
    
    new_features = []
    for k,v in new_features_weights:
        new_features.append(k)
    print(new_features)
    
    # Convert train and test data to vectors based on k features
    tfidf_train = TfidfVectorizer(stop_words='english', vocabulary=new_features)
    new_vect_X_train = tfidf_train.fit_transform(X_train)

    tfidf_test = TfidfVectorizer(stop_words='english', vocabulary=new_features)
    new_vect_X_test = tfidf_test.fit_transform(X_test)

    # Perform classification on data with k features
    l2_logistic_regression(new_vect_X_train, new_vect_X_test, y_train, y_test)
    decision_tree_classifier(new_vect_X_train, new_vect_X_test, y_train, y_test)
    print('---------------')

(11314, 129796)
(7532, 129796)

Logistic Regression - 
Train Accuracy:  0.9746
Test Accuracy:  0.8302

Decision Tree - 
Train Accuracy:  0.9999
Test Accuracy:  0.5645
---------------
C=0.1
['sale', 'dod', 'windows', 'space', 'car', 'gun', 'clipper', 'bike', 'baseball', 'hockey', 'graphics', 'israel', 'mac', 'key', 'israeli', 'keith', 'god', 'window', 'encryption', 'apple', 'turkish', 'team', 'cars', 'pitt', 'motif', 'christians', 'game', 'cramer', 'msg', 'church', 'drive', 'server', 'year', 'fbi', 'sandvik', 'nhl', 'geb', 'bus', 'armenian', 'card', 'henry', 'moon', 'armenians', 'scsi', 'nasa', 'jesus', 'atheism', 'jews', 'batf', 'ca', 'mit', 'guns', 'livesey', 'rutgers', 'stratus', 'atheists', 'ide', 'digex', 'creators', 'creator', 'creativity', 'creatively', 'creations', 'creative', 'creationists', 'creationist', 'creationism', 'creation', 'creatio', 'creats', 'creatures', 'creature', 'createwindow', 'credence', 'credential', 'credentials', 'credibility', 'credible', 'credibly', 'cred


Logistic Regression - 
Train Accuracy:  0.5898
Test Accuracy:  0.5239

Decision Tree - 
Train Accuracy:  0.6845
Test Accuracy:  0.4946
---------------


In [6]:
# Sum up coefficients

tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train)
vect_X_train = tfidf.transform(X_train)
vect_X_test = tfidf.transform(X_test)

print(vect_X_train.shape)
print(vect_X_test.shape)

l2_logistic_regression(vect_X_train, vect_X_test, y_train, y_test)
decision_tree_classifier(vect_X_train, vect_X_test, y_train, y_test)
print('---------------')

K = 200
for C in [0.1, 0.5, 1.0, 10.0]:
    print('C=' + str(C))
    model = LogisticRegression(penalty='l1', C=C, random_state=42)

    model.fit(vect_X_train, y_train)
    importances = model.coef_      # Scores of each feature

    # Store top k features
    features = vectorizer_newsgroups_train.get_feature_names()
    new_features_weights = {}
    for i in range(len(importances)):
        indices = np.argsort(importances[i])[::-1]

        for f in range(K):
            if features[indices[f]] in new_features_weights:
                new_features_weights[features[indices[f]]] += importances[i][indices[f]]
            else:
                new_features_weights.setdefault(features[indices[f]], importances[i][indices[f]])
    
    new_features_weights = sorted(new_features_weights.items(), key=operator.itemgetter(1), reverse=True)[:K]
    
    new_features = []
    for k,v in new_features_weights:
        new_features.append(k)
    print(new_features)
    
    # Convert train and test data to vectors based on k features
    tfidf_train = TfidfVectorizer(stop_words='english', vocabulary=new_features)
    new_vect_X_train = tfidf_train.fit_transform(X_train)

    tfidf_test = TfidfVectorizer(stop_words='english', vocabulary=new_features)
    new_vect_X_test = tfidf_test.fit_transform(X_test)

    # Perform classification on data with k features
    l2_logistic_regression(new_vect_X_train, new_vect_X_test, y_train, y_test)
    decision_tree_classifier(new_vect_X_train, new_vect_X_test, y_train, y_test)
    print('---------------')

(11314, 129796)
(7532, 129796)

Logistic Regression - 
Train Accuracy:  0.9746
Test Accuracy:  0.8302

Decision Tree - 
Train Accuracy:  0.9999
Test Accuracy:  0.5645
---------------
C=0.1
['sale', 'dod', 'windows', 'space', 'car', 'gun', 'clipper', 'bike', 'god', 'baseball', 'hockey', 'graphics', 'israel', 'mac', 'key', 'israeli', 'team', 'keith', 'window', 'encryption', 'apple', 'turkish', 'cars', 'pitt', 'motif', 'christians', 'game', 'cramer', 'msg', 'church', 'drive', 'server', 'year', 'fbi', 'sandvik', 'jesus', 'nhl', 'geb', 'bus', 'armenian', 'card', 'henry', 'moon', 'armenians', 'scsi', 'nasa', 'atheism', 'jews', 'batf', 'ca', 'mit', 'guns', 'livesey', 'rutgers', 'stratus', 'atheists', 'ide', 'digex', 'creators', 'creator', 'creativity', 'creatively', 'creations', 'creative', 'creationists', 'creationist', 'creationism', 'creation', 'creatio', 'creats', 'creatures', 'creature', 'createwindow', 'credence', 'credential', 'credentials', 'credibility', 'credible', 'credibly', 'cred


Logistic Regression - 
Train Accuracy:  0.5898
Test Accuracy:  0.5239

Decision Tree - 
Train Accuracy:  0.6845
Test Accuracy:  0.4936
---------------
