In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [2]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels):
    """Fit an L2-penalised logistic regression and print its accuracy.

    The model is trained on ``train_data`` / ``train_labels`` and then used to
    predict both splits. Accuracy is the number of predictions equal to the
    supplied labels divided by the number of rows (``shape[0]``) of the
    corresponding data, rounded to four decimals.

    Parameters:
        train_data: features used for fitting; must expose ``shape``.
        test_data: features used for the held-out evaluation; must expose ``shape``.
        train_labels: labels matching the rows of ``train_data``.
        test_labels: labels matching the rows of ``test_data``.

    Returns:
        None. The two accuracies are written to stdout.
    """
    # Fixed random_state keeps repeated runs comparable
    classifier = LogisticRegression(penalty='l2', random_state=42).fit(train_data, train_labels)

    # Predict on the data the model was fitted on, then on the unseen split
    train_hits = np.sum(classifier.predict(train_data) == train_labels)
    test_hits = np.sum(classifier.predict(test_data) == test_labels)

    train_accuracy = round(train_hits / train_data.shape[0], 4)
    test_accuracy = round(test_hits / test_data.shape[0], 4)

    # Report
    print('\nLogistic Regression - \nTrain Accuracy: ', train_accuracy)
    print('Test Accuracy: ', test_accuracy)

In [3]:
def decision_tree_classifier(train_data, test_data, train_labels, test_labels):
    """Fit a decision tree and print its train and test accuracy.

    The tree is trained on ``train_data`` / ``train_labels``; predictions are
    then made for both splits and compared element-wise with the supplied
    labels. Each accuracy is the count of matches divided by the number of
    rows (``shape[0]``) of that split, rounded to four decimals.

    Parameters:
        train_data: features used for fitting; must expose ``shape``.
        test_data: features used for the held-out evaluation; must expose ``shape``.
        train_labels: labels matching the rows of ``train_data``.
        test_labels: labels matching the rows of ``test_data``.

    Returns:
        None. The two accuracies are written to stdout.
    """
    # Fixed random_state keeps repeated runs comparable
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(train_data, train_labels)

    def _accuracy(data, labels):
        # Fraction of rows whose predicted label equals the given label
        predicted = tree.predict(data)
        return round(np.sum(predicted == labels) / data.shape[0], 4)

    # Train split is scored first, then the test split
    train_accuracy = _accuracy(train_data, train_labels)
    test_accuracy = _accuracy(test_data, test_labels)

    print('\nDecision Tree - \nTrain Accuracy: ', train_accuracy)
    print('Test Accuracy: ', test_accuracy)

In [4]:
# Fetch the train and test subsets of the 20 Newsgroups dataset
ng_train = fetch_20newsgroups(subset='train')
ng_test = fetch_20newsgroups(subset='test')

# Documents for each split
X_train = ng_train.data
X_test = ng_test.data

# Label bookkeeping on the fetched objects:
#   ng_train.target_names / ng_test.target_names -> the set of label names
#   ng_train.target / ng_test.target             -> one number per document,
#       the index of that document's label in target_names

# Training and testing labels: translate each numeric target into its name
y_train = [ng_train.target_names[ng_train.target[i]] for i in range(len(X_train))]
y_test = [ng_test.target_names[ng_test.target[i]] for i in range(len(X_test))]

# Sanity check: documents and labels should have matching counts per split
print(len(X_train), len(X_test), len(y_train), len(y_test), sep='\n')

11314
7532
11314
7532


In [5]:
# Converting text to vectors: the vectorizer is fitted on the training
# documents only, then applied unchanged to both splits
tfidf = TfidfVectorizer(stop_words='english').fit(X_train)

vect_X_train = tfidf.transform(X_train)
vect_X_test = tfidf.transform(X_test)

# Shapes of the train and test matrices, one per line
print(vect_X_train.shape, vect_X_test.shape, sep='\n')

(11314, 129796)
(7532, 129796)


In [6]:
# Baseline: evaluate both classifiers on the full TF-IDF feature set
for evaluate in (l2_logistic_regression, decision_tree_classifier):
    evaluate(vect_X_train, vect_X_test, y_train, y_test)


Logistic Regression - 
Train Accuracy:  0.9746
Test Accuracy:  0.8298

Decision Tree - 
Train Accuracy:  0.9999
Test Accuracy:  0.5678


In [7]:
K = 200

# Score every TF-IDF feature against the labels with mutual information
model = SelectKBest(mutual_info_classif, k=K)
model.fit(vect_X_train, y_train)
importances = model.scores_      # Scores of each feature

# Feature indices ordered from highest score to lowest
indices = np.argsort(importances)[::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps `features` a plain list of strings in both cases.
if hasattr(tfidf, 'get_feature_names_out'):
    features = list(tfidf.get_feature_names_out())
else:
    features = tfidf.get_feature_names()

# Store top k features
new_features = [features[i] for i in indices[:K]]
print('Top ' + str(K) + ' features:')
print(new_features)

# Convert train and test data to vectors based on k features.
# A fresh vectorizer restricted to the selected terms is fitted on the raw
# documents (rather than slicing columns out of vect_X_train).
new_tidf = TfidfVectorizer(stop_words='english', vocabulary=new_features)
new_tidf.fit(X_train)
new_vect_X_train = new_tidf.transform(X_train)
new_vect_X_test = new_tidf.transform(X_test)

# Perform classification on data with k features
l2_logistic_regression(new_vect_X_train, new_vect_X_test, y_train, y_test)
decision_tree_classifier(new_vect_X_train, new_vect_X_test, y_train, y_test)

Top 200 features:
['subject', 'lines', 'organization', 'edu', 'writes', 'article', 'posting', 'host', 'com', 'nntp', 'university', 'like', 'just', 'don', 'know', 'think', 'does', 'distribution', 'people', 'time', 'good', 'reply', 'use', 'new', 'way', 'make', 'world', 'ca', 'say', 'want', 'thanks', 've', 'need', 'right', 'really', 'used', 'usa', 'work', 'computer', 'did', 'cs', 'problem', 'mail', 'help', 'state', 'said', 'going', 'question', 'years', 'sure', '10', 'believe', 'news', 'point', 'using', 'll', 'better', 'got', 'things', 'case', 'thing', 'let', 'doesn', 'read', 'long', '15', 'look', 'little', '20', 'year', '1993', 'probably', 'come', 'fact', '16', 'information', 'best', 'course', 'tell', 'try', 'day', 'didn', 'great', 'far', 'actually', 'lot', 'post', 'keywords', '14', 'david', 'real', 'software', 'number', 'science', '12', 'bit', 'possible', 'true', 'god', 'version', 'different', 'group', 'says', 'old', 'end', 'apr', 'high', 'public', 'having', '30', 'hard', 'mean', 'window