Source: https://miguelmalvarez.com/2015/03/20/classifying-reuters-21578-collection-with-python-representing-the-data/

In [1]:
from nltk.corpus import reuters

### Corpus stats

In [2]:
def collection_stats():
    """Print a short summary of the NLTK Reuters corpus.

    Prints, in order: the total number of documents, the number of file ids
    starting with "train", the number starting with "test", the number of
    categories, the word view of the first document in the "acq" category,
    and that same document's raw text.
    """
    def report(count, label):
        # Every count line has the form "<number><label>".
        print(str(count) + label)

    all_ids = reuters.fileids()
    report(len(all_ids), " documents")

    # The split is encoded in the file id prefix.
    train_ids = [file_id for file_id in all_ids if file_id.startswith("train")]
    report(len(train_ids), " total train documents")

    test_ids = [file_id for file_id in all_ids if file_id.startswith("test")]
    report(len(test_ids), " total test documents")

    # All category labels known to the corpus.
    report(len(reuters.categories()), " categories")

    # Use the first document of the "acq" category as the sample.
    sample_id = reuters.fileids("acq")[0]

    # Tokenised view of the sample document.
    print(reuters.words(sample_id))

    # Unprocessed text of the sample document.
    print(reuters.raw(sample_id))

In [3]:
# Print the corpus summary: document/split/category counts plus one sample "acq" document.
collection_stats()

10788 documents
7769 total train documents
3019 total test documents
90 categories
['SUMITOMO', 'BANK', 'AIMS', 'AT', 'QUICK', 'RECOVERY', ...]
SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER
  Sumitomo Bank Ltd &lt;SUMI.T> is certain to
  lose its status as Japan's most profitable bank as a result of
  its merger with the Heiwa Sogo Bank, financial analysts said.
      Osaka-based Sumitomo, with desposits of around 23.9
  trillion yen, merged with Heiwa Sogo, a small, struggling bank
  with an estimated 1.29 billion dlrs in unrecoverable loans, in
  October.
      But despite the link-up, Sumitomo President Koh Komatsu
  told Reuters he is confident his bank can quickly regain its
  position.
      "We'll be back in position in first place within three
  years," Komatsu said in an interview.
      He said that while the merger will initially reduce
  Sumitomo's profitability and efficiency, it will vastly expand
  Sumitomo's branch network in the Tokyo metropolitan area where
  it ha

### Document Representation and weighting

In [3]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

# Load the English stop-word list once at module level; tokenize() reads it
# on every call instead of asking NLTK for the list again.
cached_stop_words = stopwords.words("english")

def tokenize(text):
    """Turn raw text into a list of filtered, stemmed tokens.

    Steps, in order:
      1. Split ``text`` with ``word_tokenize`` and lower-case each word.
      2. Drop words that appear in ``cached_stop_words``.
      3. Stem the remaining words with the Porter stemmer.
      4. Keep only stems that start with an ASCII letter and are at least
         three characters long.

    Args:
        text: The document text to tokenize.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    min_length = 3

    # Build these once per call rather than once per token: a set gives
    # constant-time stop-word lookups, and a single stemmer instance is
    # reused for every word.
    stop_words = set(cached_stop_words)
    stemmer = PorterStemmer()

    # match() anchors only at the start of the token, so a token merely has
    # to *begin* with a letter (e.g. "u.s." is kept, "1987" is dropped).
    p = re.compile('[a-zA-Z]+')

    words = (word.lower() for word in word_tokenize(text))
    tokens = [stemmer.stem(word) for word in words
              if word not in stop_words]

    return [token for token in tokens
            if p.match(token) and len(token) >= min_length]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Return the representer, without transforming
def tf_idf(docs):
    """Fit a TF-IDF vectorizer on ``docs`` and return the fitted vectorizer.

    The documents themselves are not transformed here; callers use the
    returned object's ``transform`` method on whatever text they need.

    Args:
        docs: Iterable of raw document strings used to learn the vocabulary
            and IDF weights.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    vectorizer_options = dict(
        tokenizer=tokenize,   # custom tokenizer defined in this notebook
        min_df=3,
        max_df=0.90,
        max_features=3000,
        use_idf=True,
        sublinear_tf=True,
        norm='l2',
    )
    representer = TfidfVectorizer(**vectorizer_options)
    representer.fit(docs)
    return representer

In [5]:
def feature_values(doc, representer):
    """Return the non-zero (feature name, weight) pairs for one document.

    Args:
        doc: Raw document text.
        representer: A fitted vectorizer exposing ``transform`` and either
            ``get_feature_names_out`` (current scikit-learn) or the legacy
            ``get_feature_names``.

    Returns:
        A list of ``(feature_name, weight)`` tuples, one per non-zero column
        of the transformed document, in the order reported by ``nonzero()``.
    """
    doc_representation = representer.transform([doc])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older installations.
    get_names = getattr(representer, "get_feature_names_out", None)
    if get_names is None:
        get_names = representer.get_feature_names
    features = get_names()

    # nonzero() returns (row_indices, column_indices); only one row exists,
    # so the column indices identify the features present in the document.
    return [(features[index], doc_representation[0, index])
            for index in doc_representation.nonzero()[1]]

In [12]:
def main(max_docs=9999, num_preview=10):
    """Fit TF-IDF on Reuters training text and print features of test docs.

    Walks the corpus file ids in order, reads the raw text of at most
    ``max_docs`` documents, and splits them by file id prefix: ids starting
    with "train" go to the training list, everything else to the test list.
    A TF-IDF representer is fitted on the training documents and the
    non-zero feature weights of the first ``num_preview`` test documents are
    printed.

    Args:
        max_docs: Maximum number of corpus documents to read; ``None`` reads
            the whole corpus. The default matches the original hard-coded
            limit.
        num_preview: Number of test documents whose features are printed.
    """
    train_docs = []
    test_docs = []

    for position, doc_id in enumerate(reuters.fileids()):
        # Stop as soon as the limit is reached instead of iterating over the
        # remaining ids without doing anything.
        if max_docs is not None and position >= max_docs:
            break

        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))

    representer = tf_idf(train_docs)

    for doc in test_docs[:num_preview]:
        print(feature_values(doc, representer))

In [13]:
# Fit TF-IDF on the training documents and print feature weights for the first 10 test documents.
main()

[('yesterday', 0.040544656930968107), ('year', 0.048523977933166748), ('yasuhiro', 0.067677005702934515), ('would', 0.063720846991738384), ('worri', 0.063676410398477407), ('world', 0.064802340038079623), ('work', 0.042251959993935766), ('whose', 0.060401706079358616), ('whole', 0.058408677009734268), ('week', 0.033259273803961341), ('washington', 0.051091509389807686), ('warn', 0.055555651985466276), ('want', 0.044303460144781197), ('virtual', 0.064539276657100852), ('view', 0.087341627355791221), ('u.s.-japan', 0.077027298729502522), ('u.s.', 0.10370715732580421), ('two', 0.048678979437021874), ('trade', 0.11365328894735791), ('tough', 0.06515792089182873), ('tom', 0.072231144513242668), ('told', 0.032582334376736896), ('tokyo', 0.09012440517761601), ('time', 0.036985316655361034), ('threat', 0.060603782081520584), ('third', 0.045165111364450818), ('textil', 0.062382972029937769), ('tax', 0.037396844381703584), ('tariff', 0.13460882572566393), ('talk', 0.04043095653861177), ('taiwan'