NLP with python

Dataset: Reuters-21578, Distribution 1.0

In [1]:
import nltk
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from collections import Counter
from collections import OrderedDict

import numpy as np
import re
import copy

Getting the data

The distribution of categories in the corpus is highly skewed, with 36.7% of the documents in the most common category, and only 0.0185% (2 documents) in each of the five least common categories so let's use most frequent ones.

Finding most common categories

In [2]:
training_ids = [doc_id for doc_id in reuters.fileids() if doc_id.startswith('training')]

cat_counts = dict()

for id in training_ids:
    for i in range(len(reuters.categories(id))):
        category = reuters.categories(id)[i]
        cat_counts[category] = cat_counts.get(category, 0) + 1

num_categories = 30     #select documents for 30 most common categories

cat_sorted = sorted(cat_counts.items(), key=lambda x: x[1], reverse=True)[:num_categories]
categories = [item[0] for item in cat_sorted]

print("Final categories:", categories)

Final categories: ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'wheat', 'ship', 'corn', 'money-supply', 'dlr', 'sugar', 'oilseed', 'coffee', 'gnp', 'gold', 'veg-oil', 'soybean', 'nat-gas', 'livestock', 'bop', 'cpi', 'cocoa', 'reserves', 'carcass', 'copper', 'jobs', 'yen', 'ipi']


In [3]:
training_ids = [id for id in training_ids if reuters.categories(id)[0] in categories]
test_ids = [id for id in reuters.fileids() if id.startswith('test') and reuters.categories(id)[0] in categories]
categories = {category: id for id, category in enumerate(reuters.categories())}


In [4]:
def clean_text(string):
    """
    A function that normalizes given text in string format
    """
    punctuations = r'''.,!?;\"\'-+(){}[]"\,<>./?@$&''',
    stop_words = stopwords.words('english')

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()

    return string  

In [5]:
def tokenize(doc):
    """
    A function that normalizes and divides given text
    into words.
    """
    re_pattern = re.compile(r'([\s\d.,!?;\"\'-+])+')
    stop_words = stopwords.words('english')

    doc = clean_text(doc)

    tokens = re_pattern.split(doc.lower())
    tokens = [token for token in tokens if token not in ',.!?\'\"-\t\n ;']
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

In [6]:
corpus = {}
tokens = []
DF = {}
term_frequency = [] #frequency of a given word
ids = [] #
doc_lengths = [] #number of words in document

for i, id in enumerate(training_ids):
    document = reuters.raw(id)

    vocab = tokenize(document)
    tokens += [vocab]

    for w in vocab:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    corpus[id] = dict((tok, 1) for tok in vocab)
    ids.append(id)
    doc_lengths.append(len(vocab))

for i in DF:
    DF[i] = len(DF[i])

for token in vocab:
    term_frequency.append(DF[token])

stop_words = nltk.corpus.stopwords.words('english')
words = reuters.words()
words = [token.lower() for token in words if token not in ',&><.!/?\'\"-\t\n ;']
words = [token for token in words if token not in stop_words]

N = len(words)
token_counts = Counter(words)

lexicon = sorted(set(sum(tokens, [])))# initialize lexicon of all words

In [7]:
train_data = []
test_data = []

for id in training_ids:
    train_data.append(clean_text(reuters.raw(id)))

for id in test_ids:
    test_data.append(clean_text(reuters.raw(id)))

train_target = [categories[reuters.categories(id)[0]] for id in training_ids]
test_target = [categories[reuters.categories(id)[0]] for id in test_ids]

Comparison of text tokenization methods

In [8]:
sample_text = reuters.raw('training/10489')
oryginal = sample_text.split()
my_method = tokenize(sample_text)
nltk_tokenizer = word_tokenize(sample_text)

print('  Original  | My Method  | NLTK Tokenizer')
print('-' * 41)
for original, my, nltk in zip(oryginal[:20], my_method[:20], nltk_tokenizer[:20]):
    print(str(original).rjust(10, ' '), ' | ', str(my).rjust(10, ' '), ' | ', str(nltk).rjust(10, ' '))

  Original  | My Method  | NLTK Tokenizer
-----------------------------------------
    GREECE  |      greece  |      GREECE
      BUYS  |        buys  |        BUYS
    55,000  |      tonnes  |      55,000
    TONNES  |      french  |      TONNES
    FRENCH  |       maize  |      FRENCH
     MAIZE  |       trade  |       MAIZE
         -  |      greece  |           -
     TRADE  |      bought  |       TRADE
    Greece  |       total  |      Greece
    bought  |      tonnes  |      bought
         a  |      french  |           a
     total  |       maize  |       total
        of  |    tendered  |          of
    55,000  |   yesterday  |      55,000
    tonnes  |   initially  |      tonnes
        of  |      tonnes  |          of
    French  |       april  |      French
     maize  |    delivery  |       maize
      when  |       trade  |        when
        it  |     sources  |          it


Stemming & Lemmanization

Stemming is a process of merging and converting words into groups that are derived from the same stem. It simply gets rid of the last letters of the word to obtain a shorter form. 

Lemmatization is a process of converting the given word into its base form. 

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

wnl = WordNetLemmatizer()

stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in my_method]
lemm = [wnl.lemmatize(token) for token in my_method]

print('  Original  |    Stemmer   |   Lemmatizer')
print('-' * 41)
for original, stem, lem in zip(my_method[:20], stemmed[:20], lemm[:20]):
    print(str(original).rjust(10, ' '), ' | ', str(stem).rjust(10, ' '), ' | ', str(lem).rjust(10, ' '))

  Original  |    Stemmer   |   Lemmatizer
-----------------------------------------
    greece  |       greec  |      greece
      buys  |         buy  |         buy
    tonnes  |        tonn  |       tonne
    french  |      french  |      french
     maize  |        maiz  |       maize
     trade  |       trade  |       trade
    greece  |       greec  |      greece
    bought  |      bought  |      bought
     total  |       total  |       total
    tonnes  |        tonn  |       tonne
    french  |      french  |      french
     maize  |        maiz  |       maize
  tendered  |      tender  |    tendered
 yesterday  |   yesterday  |   yesterday
 initially  |       initi  |   initially
    tonnes  |        tonn  |       tonne
     april  |       april  |       april
  delivery  |    deliveri  |    delivery
     trade  |       trade  |       trade
   sources  |       sourc  |      source


TF-IDF vectors

TF-IDF = Term Frequency (TF) * Inverse Document Frequency (IDF)

In [11]:
tf_idf = {}

for doc in corpus:
    for token in corpus[doc]:
        tf = corpus[doc][token] / len(corpus[doc])
        df = token_counts[token] / N
        idf = np.log(N / (df + 1))  #add 1 to avoid possibility of dividing by zero if there's no instance of the word in lexicon
        tf_idf[doc, token] = tf*idf

tf_idf['training/10', 'computer']

0.15221529457097957

In [12]:
def matching_score(k, query):
    preprocessed_query = tokenize(query)
    tokens = tokenize(str(preprocessed_query))
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    l = []
    for i in query_weights[:k]:
        l.append(i[0])
    
    return l

In [13]:
#Show documents most simylar to the given one
print(matching_score(10, reuters.raw('training/9865')))
print(reuters.categories('training/9865'))
print(reuters.categories('training/12500'))


['training/12500', 'training/7215', 'training/9304', 'training/2169', 'training/12583', 'training/2183', 'training/4278', 'training/1396', 'training/6898', 'training/1057']
['barley', 'corn', 'grain', 'wheat']
['corn', 'grain', 'wheat']


In [14]:
# Build the model with naive bayes
model_tfidf = make_pipeline(TfidfVectorizer(), MultinomialNB())
model_count = make_pipeline(CountVectorizer(), MultinomialNB())

In [15]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    
    model.fit(X_train, y_train)
    
    print("Accuracy on training set:")
    print(model.score(X_train, y_train))
    print("Accuracy on testing set:")
    print(model.score(X_test, y_test))
    
    y_pred = model.predict(X_test)

In [16]:
print("\tAccuracy of tfidf model")
train_and_evaluate(model_tfidf, train_data, test_data, train_target, test_target)
print("\tAccuracy of count based model")
train_and_evaluate(model_count, train_data, test_data, train_target, test_target)

	Accuracy of tfidf model
Accuracy on training set:
0.7247681396617567
Accuracy on testing set:
0.7135212272565109
	Accuracy of count based model
Accuracy on training set:
0.9253955264593563
Accuracy on testing set:
0.8683553335711738
