Comparison of classic text representation methods
(bag-of-words) with vector
representations of words obtained from deep
models (word embeddings) in the task of
categorizing texts.


Dataset: Reuters-21578, Distribution 1.0

In [163]:
import nltk
from nltk.corpus import reuters
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import numpy as np
import re
from collections import Counter

Since some of the categories raraly appears let's use most frequent ones.

In [115]:
training_ids = [doc_id for doc_id in reuters.fileids() if doc_id.startswith('training')]

cat_counts = dict()

for id in training_ids:
    for i in range(len(reuters.categories(id))):
        category = reuters.categories(id)[i]
        cat_counts[category] = cat_counts.get(category, 0) + 1

num_categories = 30

cat_sorted = sorted(cat_counts.items(), key=lambda x: x[1], reverse=True)[:num_categories]
categories = [item[0] for item in cat_sorted]

In [120]:
training_ids = [id for id in training_ids if reuters.categories(id)[0] in categories]

test_ids = [id for id in reuters.fileids() if id.startswith('test') and reuters.categories(id)[0] in categories]

categories = {category: id for id, category in enumerate(reuters.categories())}


In [160]:
def clean_text(string):
    """
    A function that normalizes given text in string format
    """
    punctuations=r'''.,!?;\"\'-+(){}[]"\,<>./?@$&''',
    stop_words=nltk.corpus.stopwords.words('english')

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string  

In [158]:
def tokenize(doc):
    """
    A function that normalizes and divides given text
    into words.
    """
    re_pattern = re.compile(r'([\s\d.,!?;\"\'-+])+')
    stop_words = nltk.corpus.stopwords.words('english')

    tokens = re_pattern.split(doc.lower())
    tokens = [token for token in tokens if token not in ',.!?\'\"-\t\n ;']
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

In [154]:
corpus = {}
tokens = []
DF = {}
term_frequency = []
ids = [] #
doc_lengths = []

for i, id in enumerate(training_ids):
    document = reuters.raw(id)

    vocab = tokenize(document)
    tokens += [vocab]

    for w in vocab:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    corpus[id] = dict((tok, 1) for tok in vocab)
    ids.append(id)
    doc_lengths.append(len(vocab))

for i in DF:
    DF[i] = len(DF[i])

for token in vocab:
    term_frequency.append(DF[token])

stop_words = nltk.corpus.stopwords.words('english')
words = reuters.words()
words = [token.lower() for token in words if token not in ',&><.!/?\'\"-\t\n ;']
words = [token for token in words if token not in stop_words]

N = len(words)
token_counts = Counter(words)
# initialize lexicon of all words
lexicon = sorted(set(sum(tokens, [])))

In [161]:
train_data = []
test_data = []

for id in training_ids:
    train_data.append(clean_text(reuters.raw(id)))

for id in test_ids:
    test_data.append(clean_text(reuters.raw(id)))

train_target = [categories[reuters.categories(id)[0]] for id in training_ids]
test_target = [categories[reuters.categories(id)[0]] for id in test_ids]

Vectorization by frequency

Representing information about frequency of every word in lexicon every document.

In [None]:
#Initialize zero vector, starting point for every document 
zero_vector = OrderedDict((token, 0) for token in lexicon)

doc_vectors = []
doc_vectors2 = []

for i, id in enumerate(training_ids):
    document = reuters.raw(id)
    vec = copy.copy(zero_vector)

    vocab = re_pattern.split(document.lower())
    vocab = [token for token in vocab if token not in ',.!?\'\"-\t\n <>;']
    vocab = [token for token in vocab if token not in stop_words]

    token_counts = Counter(vocab)
    for key, value in token_counts.items():
        vec[key] = value #/ len(lexicon)
        doc_vectors.append(vec)

    doc_vectors2.append(list(doc_vectors[i].items()))


TF-IDF vectors

TF-IDF = Term Frequency (TF) * Inverse Document Frequency (IDF)

In [170]:

tf_idf = {}

for doc in corpus:
    for token in corpus[doc]:
        tf = corpus[doc][token] / len(corpus[doc])
        df = token_counts[token] / N
        idf = np.log(N / (df + 1))  #add 1 to avoid possibility of dividing by zero if there's no instance of the word in lexicon
        tf_idf[doc, token] = tf*idf

tf_idf['training/10', 'computer']

0.15390657562176824

In [185]:
def matching_score(k, query):
    preprocessed_query = tokenize(query)
    tokens = tokenize(str(preprocessed_query))
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    l = []
    for i in query_weights[:k]:
        l.append(i[0])
    
    return l

In [187]:
#Show documents most simylar to the given one
print(matching_score(10, reuters.raw('training/9865')))
print(reuters.categories('training/9865'))
print(reuters.categories('training/12500'))


['training/12500', 'training/7215', 'training/9304', 'training/2169', 'training/12583', 'training/2183', 'training/4278', 'training/1396', 'training/6898', 'training/1057']
['barley', 'corn', 'grain', 'wheat']
['corn', 'grain', 'wheat']


Ranking by cosine similarity

In [189]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [190]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [191]:
def gen_vector(tokens):

    Q = np.zeros((len(lexicon)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))

        try:
            ind = lexicon.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [192]:
def cosine_similarity(k, query):
    preprocessed_query = tokenize(query)
    tokens = tokenize(str(preprocessed_query))
       
    d_cosines = []

    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]

    return out

In [194]:
# Build the model with naive bayes
model_tfidf = make_pipeline(TfidfVectorizer(), MultinomialNB())
model_count = make_pipeline(CountVectorizer(), MultinomialNB())
# Train the model using the training data
model_tfidf.fit(train_data, train_target)
# Predict the categories of the test data
predicted_categories = model_tfidf.predict(test_data)

#confusion_matrix(test_target, predicted_categories)

In [84]:
predicted = sum([test_target[i] == predicted_categories[i] for i in range(len(test_target))])

all = len(test_target)

predicted/all

0.6344873501997337