In [58]:
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from pprint import pprint
from nltk.corpus import stopwords
from nltk import word_tokenize
import re


# Data Collection Stats

In [50]:
# List of documents
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
print("Total train documents: {}".format(len(train_docs)))

test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
print("Total test documents: {}".format(len(test_docs)))

Documents: 10788
Total train documents: 7769
Total test documents: 3019


In [45]:
# List of categories 
categories = reuters.categories();
print("Number of categories: {}".format(len(categories)))
print()

print(categories)
print()

# Documents per category.
category_distribution = sorted([[category, len(reuters.fileids(category))] for category in categories], 
                               key=lambda item:item[1], 
                               reverse=True)

print("Most common categories")
pprint(category_distribution[:5])
print()

print("Least common categories")
pprint(category_distribution[-5:])
print()

Number of categories: 90

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']

Most common categories
[['earn', 3964],
 ['acq', 2369],
 ['mone

In [49]:
# Number of labels (different than number of documents...) -> Multi-label problem
print("Number of Labels {}".format(sum([distribution for category, distribution in category_distribution])))
print()

# Documents with multiple labels
doc = 'training/9865'
print(reuters.raw(doc))
print()

print(reuters.categories(doc))

Number of Labels 13328

FRENCH FREE MARKET CEREAL EXPORT BIDS DETAILED
  French operators have requested licences
  to export 675,500 tonnes of maize, 245,000 tonnes of barley,
  22,000 tonnes of soft bread wheat and 20,000 tonnes of feed
  wheat at today's European Community tender, traders said.
      Rebates requested ranged from 127.75 to 132.50 European
  Currency Units a tonne for maize, 136.00 to 141.00 Ecus a tonne
  for barley and 134.25 to 141.81 Ecus for bread wheat, while
  rebates requested for feed wheat were 137.65 Ecus, they said.
  



['barley', 'corn', 'grain', 'wheat']


In [8]:
# Function to show the feature weights of a document (to be explained later)
def feature_values(doc, representer):
    doc_representation = representer.transform([doc])
    features = representer.get_feature_names()
    return [(features[index], doc_representation[0, index]) for index in doc_representation.nonzero()[1]]

In [83]:
stop_words = stopwords.words("english")

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

# Tokenisation 
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

# Classifier 
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorised_train_documents, train_labels)
predictions = classifier.predict(vectorised_test_documents)

print("Number of labels assigned: {}".format(sum([sum(prediction) for prediction in predictions])))

Number of labels assigned: 3126


In [84]:
# Evaluation

def show_quality(predictions, labels):
    print("Micro-average quality numbers")
    print("Precision: {}, Recall: {}, F1-measure: {}".format(precision_score(predictions, labels, average='micro'),
                                                             recall_score(predictions, labels, average='micro'),
                                                             f1_score(predictions, labels, average='micro')))

    print("Macro-average quality numbers")
    print("Precision: {}, Recall: {}, F1-measure: {}".format(precision_score(predictions, labels, average='macro'),
                                                             recall_score(predictions, labels, average='macro'),
                                                             f1_score(predictions, labels, average='macro')))
    
show_quality(predictions, test_labels)

Micro-average quality numbers
Precision: 0.7946047008547008, Recall: 0.9516954574536148, F1-measure: 0.8660844250363902
Macro-average quality numbers
Precision: 0.37149154470386453, Recall: 0.6305451234650984, F1-measure: 0.4450579351293774


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


# Tuning our representation

In [91]:
from nltk.stem.porter import PorterStemmer
import re

# Tokenisation 
vectorizer = TfidfVectorizer(ngram_range=(1,2), 
                             stop_words=stop_words, 
                             max_df=0.7, 
                             min_df=3)

# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

# Classifier 
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorised_train_documents, train_labels)
predictions = classifier.predict(vectorised_test_documents)

print("Number of labels assigned: {}".format(sum([sum(prediction) for prediction in predictions])))
show_quality(predictions, test_labels)

Number of labels assigned: 3116
Micro-average quality numbers
Precision: 0.7908653846153846, Recall: 0.9502567394094994, F1-measure: 0.863265306122449
Macro-average quality numbers
Precision: 0.35226289340822575, Recall: 0.5781412416628562, F1-measure: 0.420667858686212


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
