In [51]:
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.multiclass import OneVsRestClassifier


# Data Collection Stats

In [22]:
# List of documents
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

Documents: 10788


In [24]:
train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
print("Total train documents: {}".format(len(train_docs)))

test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
print("Total test documents: {}".format(len(test_docs)))

Total train documents: 7769
Total test documents: 3019


In [25]:
# List of categories 
categories = reuters.categories();
print("Number of categories: {}".format(len(categories)))

categories

Number of categories: 90


['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc']

In [38]:
# Documents per category. TODO: Lambdas too complex?
category_distribution = sorted([[category, len(reuters.fileids(category))] for category in categories], 
                               key=lambda item:item[1], 
                               reverse=True)

category_distribution[:10]

[['earn', 3964],
 ['acq', 2369],
 ['money-fx', 717],
 ['grain', 582],
 ['crude', 578],
 ['trade', 485],
 ['interest', 478],
 ['ship', 286],
 ['wheat', 283],
 ['corn', 237]]

In [39]:
category_distribution[-10:]

[['dfl', 3],
 ['nkr', 3],
 ['palladium', 3],
 ['palmkernel', 3],
 ['rand', 3],
 ['castor-oil', 2],
 ['groundnut-oil', 2],
 ['lin-oil', 2],
 ['rye', 2],
 ['sun-meal', 2]]

In [15]:
# Number of labels (different than number of documents...)
sum([distribution for category, distribution in category_distribution])

13328

In [40]:
# Documents with multiple labels
doc = 'training/9865'
print(reuters.raw(doc))

reuters.categories(doc)

FRENCH FREE MARKET CEREAL EXPORT BIDS DETAILED
  French operators have requested licences
  to export 675,500 tonnes of maize, 245,000 tonnes of barley,
  22,000 tonnes of soft bread wheat and 20,000 tonnes of feed
  wheat at today's European Community tender, traders said.
      Rebates requested ranged from 127.75 to 132.50 European
  Currency Units a tonne for maize, 136.00 to 141.00 Ecus a tonne
  for barley and 134.25 to 141.81 Ecus for bread wheat, while
  rebates requested for feed wheat were 137.65 Ecus, they said.
  




['barley', 'corn', 'grain', 'wheat']

In [71]:
# Tokenisation 
vectorizer = TfidfVectorizer()

# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(document) for document in train_docs])

# Classifier 
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(vectorised_train_documents, train_labels)
predictions = classifier.predict(vectorised_test_documents)

sum([sum(prediction) for prediction in predictions])

0

In [None]:
y = mlb.fit_transform(all_labels)
    classifier = OneVsRestClassifier(LinearSVC())

    scores = cross_val_score(classifier, X, y=y, cv=10, scoring='f1_micro')
    print("Average F1: {}".format(np.mean(scores)))