In [None]:
import re
from operator import itemgetter
from pprint import pprint

from nltk import word_tokenize
from nltk.corpus import reuters
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

try:
    # cross_val_score has lived in model_selection since scikit-learn 0.18.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import cross_val_score

# Data Collection Stats

In [None]:
# Every file id in the Reuters corpus, then the two splits selected by id prefix.
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

train_docs_id = [doc_id for doc_id in documents if doc_id.startswith("train")]
print("Total train documents: {}".format(len(train_docs_id)))

test_docs_id = [doc_id for doc_id in documents if doc_id.startswith("test")]
print("Total test documents: {}".format(len(test_docs_id)))

In [None]:
# All category names known to the corpus.
categories = reuters.categories()
print("Number of categories: {}".format(len(categories)))
print()

print(categories)
print()

# Pair each category with the number of documents filed under it,
# then order the pairs from most to least populated.
category_distribution = [(name, len(reuters.fileids(name))) for name in categories]
category_distribution.sort(key=itemgetter(1), reverse=True)

print("Most common categories")
pprint(category_distribution[:5])
print()

print("Least common categories")
pprint(category_distribution[-5:])
print()

In [None]:
# Total number of (document, category) assignments. This differs from the number
# of documents because one document can carry several categories -> multi-label problem.
label_total = sum(count for _, count in category_distribution)
print("Number of Labels {}".format(label_total))
print()

# Example of a document tagged with more than one category
doc = 'training/9865'
print(reuters.raw(doc))
print()

print(reuters.categories(doc))

In [None]:
## Some utility functions

def feature_values(doc, representer):
    """Return the non-zero feature weights of a single document.

    Parameters
    ----------
    doc : str
        Raw text of the document to represent.
    representer : fitted vectoriser
        Object exposing ``transform`` and either ``get_feature_names_out``
        (scikit-learn >= 1.0) or the legacy ``get_feature_names``.

    Returns
    -------
    list of tuple
        ``(feature, weight)`` pairs, one per feature whose weight in ``doc``
        is non-zero.
    """
    doc_representation = representer.transform([doc])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back to the old accessor only on releases that lack it.
    if hasattr(representer, "get_feature_names_out"):
        features = representer.get_feature_names_out()
    else:
        features = representer.get_feature_names()
    # nonzero()[1] holds the column indices of the populated entries of the single row.
    return [(features[index], doc_representation[0, index])
            for index in doc_representation.nonzero()[1]]

# Evaluation
def show_quality(predictions, labels):
    """Print micro- and macro-averaged precision, recall and F1.

    Parameters
    ----------
    predictions : array-like
        Labels produced by the classifier (passed to the metrics as ``y_pred``).
    labels : array-like
        Reference labels (passed to the metrics as ``y_true``).

    Returns
    -------
    None
        The scores are written to standard output.
    """
    for average in ("micro", "macro"):
        print("{}-average quality numbers".format(average.capitalize()))
        # scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
        # the reported precision is really the recall and vice versa.
        print("Precision: {}, Recall: {}, F1-measure: {}".format(
            precision_score(labels, predictions, average=average),
            recall_score(labels, predictions, average=average),
            f1_score(labels, predictions, average=average)))

In [None]:
# NLTK's English stop-word list, handed to the vectoriser below.
stop_words = stopwords.words("english")

# Split the corpus by file-id prefix and pull the raw text of each split.
train_docs_id = [doc_id for doc_id in documents if doc_id.startswith("train")]
test_docs_id = [doc_id for doc_id in documents if doc_id.startswith("test")]

train_docs = list(map(reuters.raw, train_docs_id))
test_docs = list(map(reuters.raw, test_docs_id))

# TF-IDF representation: the vectoriser is fitted on the training split only,
# then the already-fitted vectoriser is applied to the test split.
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Binarise each document's category list; the binariser is fitted on the training labels.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(list(map(reuters.categories, train_docs_id)))
test_labels = mlb.transform(list(map(reuters.categories, test_docs_id)))

# One-vs-rest wrapper around a linear SVM with a fixed seed.
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorised_train_documents, train_labels)
predictions = classifier.predict(vectorised_test_documents)

# Sum of all entries of the prediction matrix, accumulated row by row.
print("Number of labels assigned: {}".format(sum(sum(row) for row in predictions)))
show_quality(predictions, test_labels)