In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

import gensim
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

import logging

stop_words = stopwords.words("english")

In [2]:
print("The reuters corpus has {} tags".format(len(reuters.categories())))
print("The reuters corpus has {} documents".format(len(reuters.fileids())))

The reuters corpus has 90 tags
The reuters corpus has 10788 documents


In [3]:
categories = []
file_count = []

for i in reuters.categories():
    file_count.append(len(reuters.fileids(i)))
    categories.append(i)

cats = pd.DataFrame({'categories': categories, "file_count": file_count}).sort_values('file_count', ascending=False)
cats.head()

Unnamed: 0,categories,file_count
21,earn,3964
0,acq,2369
46,money-fx,717
26,grain,582
17,crude,578


##### chose the second and third tags on this top tags list, since the first earn tag is most likely the highly-standardized news pieces with earnings reports.

#### Select documents that only contains top two labels with most documents

In [4]:
cat_start = 1
cat_end = 2
category_filter = cats.iloc[cat_start:cat_end + 1, 0].values.tolist()
category_filter

['acq', 'money-fx']

In [5]:
# select fileid with the category filter
doc_id_list = np.array(reuters.fileids(category_filter))
doc_id_list = doc_id_list[doc_id_list != 'training/3267']

In [6]:
train_labels = [reuters.categories(doc_id)for doc_id in doc_id_list if 'training' in doc_id]

In [7]:
test_labels = [reuters.categories(doc_id)for doc_id in doc_id_list if 'test' in doc_id]

In [8]:
train_doc = doc_id_list[['training' in doc_id for doc_id in doc_id_list]]
print("train_doc is created with following document names: {} ...".format(train_doc[0:5]))

train_doc is created with following document names: ['training/10' 'training/1000' 'training/10005' 'training/10018'
 'training/10025'] ...


In [9]:
test_doc = doc_id_list[['test' in doc_id for doc_id in doc_id_list]]
print("test_doc is created with following document names: {} ...".format(test_doc[0:5]))

test_doc is created with following document names: ['test/14843' 'test/14849' 'test/14852' 'test/14861' 'test/14865'] ...


In [10]:
test_corpus = [" ".join([t for t in reuters.words(test_doc[t])])
               for t in range(len(test_doc))]
print("test_corpus is created, the first line is: {} ...".format(test_corpus[0][:100]))

test_corpus is created, the first line is: SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER Sumitomo Bank Ltd & lt ; SUMI . T > is certain to l ...


In [11]:
train_corpus = [" ".join([t for t in reuters.words(train_doc[t])])
                for t in range(len(train_doc))]
print("train_corpus is created, the first line is: {} ...".format(train_corpus[0][:100]))

train_corpus is created, the first line is: COMPUTER TERMINAL SYSTEMS & lt ; CPML > COMPLETES SALE Computer Terminal Systems Inc said it has com ...


In [12]:

def load_response(dataframe, col_name):
    """
    Function to load survey response from a pandas dataframe into a list
    object that can be passed to the clean_corpus() function
    -----PARAMETERS-----
    dataframe: the pandas dataframe where the survey responses are stored
    col_name: a string of the column name of the survey responses
    -----OUTPUT-----
    Returned object is a list of responses as strings
    """
    try:
        responses = [t[0] for t in dataframe[[col_name]].values.tolist()]
    except (TypeError, NameError):
        print("Please input string as col_name")
        pass
    return responses


def clean_corpus(texts, string_line=True, stopping=True, pos='v'):
    """
    Function to clean up survey answers and return list for NLP processing
    --------PARAMETERS---------
    texts: list objects that contains survey response strings
    string_line: if True, each returned survey response is a single string
    if False, each response is a list of words in the original sequence
    stopping: (default) if True, filter stopwords
    pos: (default) if 'v', lemmatize input words as verbs;
    if 'n', lemmatize input words as nouns
    """
    cleaned = []
    i = 0
    stop = set(stopwords.words("english"))
    # print("$$$ empty cleaned created")
    print(">>>> response cleaning initiated")
    for text in texts:
        if (i + 1) % 500 == 0:
            print("--cleaning response #{} out of {}".format(i + 1, len(texts)))
        try:
            text = re.sub("[^a-zA-Z]", " ", text)
            text = word_tokenize(text)
            text = [t.lower() for t in text]
            if stopping:
                text = [t for t in text if t not in stop]
            lemmatizer = WordNetLemmatizer()
            text = [lemmatizer.lemmatize(t, pos=pos) for t in text]
            # TODO: determine which lemmatizer to use for this project
            cleaned.append(text)
        except TypeError:
            cleaned.append([])
        i += 1
    if string_line:
        cleaned = [" ".join(t) for t in cleaned]
    return cleaned


def get_bow(tokenized_text):
    """
    Function to generate bow_list and word_freq from a tokenized_text
    -----PARAMETER-----
    tokenized_text should be in the form of [['a'], ['a', 'b'], ['b']] format,
    where the object is a list of survey response, with each survey response
    as a list of word tokens
    -----OUTPUT-----
    The function returns two objects
    bow_list: a list of Counter objects with word frequency of each response
    word_freq: a Counter object that summarizes the word frequency of the input
    tokenized_text
    """
    bow_list = []
    word_freq = Counter()
    for text in tokenized_text:
        bow = Counter(text)
        word_freq.update(text)
        bow_list.append(bow)
    print("This corpus has {} key words, and the 10 \
most frequent words are: {}".format(len(word_freq.keys()), word_freq.most_common(10)))
    return bow_list, word_freq

##### After the text is cleaned, I can now apply the BOW model on the corpora. The BOW is basically a frequency Counter, so I have written a function to get BOW from the corpora in my text_clean.py module.

In [13]:
# create clean corpus for bow approach
test_clean_token = clean_corpus(test_corpus, string_line=False)
train_clean_token = clean_corpus(train_corpus, string_line=False)

>>>> response cleaning initiated
--cleaning response #500 out of 898
>>>> response cleaning initiated
--cleaning response #500 out of 2186
--cleaning response #1000 out of 2186
--cleaning response #1500 out of 2186
--cleaning response #2000 out of 2186


In [14]:
# quick look at the word frequency
test_bow, test_word_freq = get_bow(test_clean_token)
train_bow, train_word_freq = get_bow(train_clean_token)

This corpus has 7126 key words, and the 10 most frequent words are: [('say', 2976), ('lt', 1112), ('share', 1067), ('dlrs', 921), ('company', 886), ('pct', 758), ('mln', 755), ('inc', 637), ('bank', 505), ('corp', 500)]
This corpus has 11042 key words, and the 10 most frequent words are: [('say', 7388), ('lt', 2802), ('share', 2306), ('dlrs', 2247), ('mln', 2172), ('pct', 2051), ('bank', 1983), ('company', 1942), ('inc', 1469), ('u', 1291)]


In [15]:
print(pd.DataFrame(test_bow).head())

   aabex  aame  aar  ab  abandon  abate  abatement  abboud  abegglen  abeles  \
0    NaN   NaN  NaN NaN      NaN    NaN        NaN     NaN       NaN     NaN   
1    NaN   NaN  NaN NaN      NaN    NaN        NaN     NaN       NaN     NaN   
2    NaN   NaN  NaN NaN      NaN    NaN        NaN     NaN       NaN     NaN   
3    NaN   NaN  NaN NaN      NaN    NaN        NaN     NaN       NaN     NaN   
4    NaN   NaN  NaN NaN      NaN    NaN        NaN     NaN       NaN     NaN   

     ...     zellerbach  zenex  zinn  zoete  zond  zondervan  zone  zoran  \
0    ...            NaN    NaN   NaN    NaN   NaN        NaN   NaN    NaN   
1    ...            NaN    NaN   NaN    NaN   NaN        NaN   NaN    NaN   
2    ...            NaN    NaN   NaN    NaN   NaN        NaN   NaN    NaN   
3    ...            NaN    NaN   NaN    NaN   NaN        NaN   NaN    NaN   
4    ...            NaN    NaN   NaN    NaN   NaN        NaN   NaN    NaN   

   zurich  zwermann  
0     NaN       NaN  
1     NaN   

The typical way to reduce the noise in BOW models is to use dimensionality reduction methods, such as Latent Semantic Indexing (LSA), Random Projections (RP), Latent Dirichlet Allocation (LDA), or Hierachical Dirichlet Process (HDP).

# Glimpse of Word2Vec Model
The alternative, of course, is to use the famous word2vec algorithm to generate continous numeric vectors. I will use gensim package to conduct this task. After I train the model with the reuters corpora, I get a dictionary that maps a numeric vector to each word that appears in the corpora.

In [16]:

# set up logging for gensim training
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = gensim.models.Word2Vec(train_clean_token, min_count=1, workers=2)

2018-07-31 23:47:28,672 : INFO : collecting all words and their counts
2018-07-31 23:47:28,675 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-31 23:47:28,755 : INFO : collected 11042 word types from a corpus of 191770 raw words and 2186 sentences
2018-07-31 23:47:28,757 : INFO : Loading a fresh vocabulary
2018-07-31 23:47:28,842 : INFO : min_count=1 retains 11042 unique words (100% of original 11042, drops 0)
2018-07-31 23:47:28,843 : INFO : min_count=1 leaves 191770 word corpus (100% of original 191770, drops 0)
2018-07-31 23:47:28,969 : INFO : deleting the raw counts dictionary of 11042 items
2018-07-31 23:47:28,972 : INFO : sample=0.001 downsamples 40 most-common words
2018-07-31 23:47:28,974 : INFO : downsampling leaves estimated 169461 word corpus (88.4% of prior 191770)
2018-07-31 23:47:29,054 : INFO : estimated required memory for 11042 words and 100 dimensions: 14354600 bytes
2018-07-31 23:47:29,056 : INFO : resetting layer weights
2018-07-31

With the model, I can get the numeric representation of any word that's included in the model's dictionary, and even quickly calculate the "similarity" between two words.

In [17]:
print("Printing the vector of 'inc': {} ...".format(model['inc'][:10]))
print("Printing the similarity between 'inc' and 'love': {}"\
      .format(model.wv.similarity('inc', 'love')))
print("Printing the similarity between 'inc' and 'company': {}"\
      .format(model.wv.similarity('inc', 'company')))

Printing the vector of 'inc': [ 0.18019953  0.65239906 -1.00536036  0.07709646  0.31514305  0.19264233
 -0.41766736  0.85372269 -1.01059175 -0.40029141] ...
Printing the similarity between 'inc' and 'love': 0.8626278351070105
Printing the similarity between 'inc' and 'company': 0.9311575893495986


  if __name__ == '__main__':


In [18]:
def get_doc_matrix(w2v_model, corpora_token):
    """
    Function to aggregate document vector from a built gensim w2v model that
    calculates the vector mean based on vector representation of words in the
    document
    -----PARAMETERS-----
    w2v_model: a gensim.models.Word2Vec object that can return a numeric array
        when queried with w2v_model['word']
    corpora_token: a list of sentence in list form, e.g. [['sentence','one'],
        ['sentence','two'],...]
    -----OUTPUT-----
    returned object (text_matrix) is a numpy.ndarray with the shape
    (len(corpora_token), word_vector_length)
    """
    word_vector_length = len(w2v_model[w2v_model.wv.index2word[0]])  # get word vector length
    text_matrix = np.zeros((len(corpora_token), word_vector_length))
    for i in range(len(corpora_token)):
        text_vector = np.zeros(word_vector_length)
        for j in range(len(corpora_token[i])):
            try:
                text_vector += w2v_model[corpora_token[i][j]]
            except:
                pass
            if j == len(corpora_token[i]) - 1:
                text_vector = text_vector / len(corpora_token[i])
        text_matrix[i][:] = text_vector
    return text_matrix

In [19]:
test_matrix = get_doc_matrix(model, test_clean_token)
train_matrix = get_doc_matrix(model, train_clean_token)



In [20]:
print(pd.DataFrame(train_matrix).head())

         0         1         2         3         4         5         6   \
0 -0.007053  0.510637 -0.444755  0.381168 -0.253832  0.218109 -0.314217   
1  0.008513  0.542247 -0.479434  0.370381 -0.213024  0.225680 -0.335633   
2 -0.191740  0.458929 -0.457054  0.216853 -0.238729  0.151147 -0.192088   
3  0.001081  0.448670 -0.502514  0.181731 -0.062267  0.104628 -0.227786   
4  0.007104  0.507678 -0.493714  0.525003 -0.315540  0.088627 -0.351352   

         7         8         9     ...           90        91        92  \
0  0.196461 -0.272996 -0.212591    ...    -0.522528  0.112731 -0.133989   
1  0.254908 -0.316666 -0.221621    ...    -0.550824  0.204567 -0.164425   
2  0.324972 -0.121832 -0.005484    ...    -0.484766  0.145381 -0.258645   
3  0.380240 -0.252435 -0.098570    ...    -0.471194  0.307452 -0.179525   
4  0.237284 -0.152665 -0.195754    ...    -0.434458 -0.016464 -0.082804   

         93        94        95        96        97        98        99  
0 -0.333362 -0.817477  0

In [21]:
print(pd.DataFrame(test_matrix).head())

         0         1         2         3         4         5         6   \
0 -0.137655  0.409809 -0.397694  0.241818 -0.244117  0.136173 -0.209175   
1 -0.234845  0.607888 -0.477836  0.497215 -0.475905  0.172352 -0.325924   
2 -0.072341  0.438973 -0.423213  0.303832 -0.223895  0.119996 -0.248904   
3 -0.222821  0.637043 -0.658885  0.825898 -0.585690  0.069154 -0.353872   
4 -0.006987  0.377220 -0.432845  0.307658 -0.166488  0.087884 -0.266331   

         7         8         9     ...           90        91        92  \
0  0.235962 -0.114298 -0.038916    ...    -0.433409  0.082815 -0.176602   
1  0.200129  0.010578 -0.051898    ...    -0.571753 -0.140075 -0.225004   
2  0.253012 -0.141939 -0.088440    ...    -0.449004  0.089478 -0.157954   
3  0.354253  0.058971 -0.099296    ...    -0.447013 -0.338993 -0.233701   
4  0.266183 -0.248923 -0.157470    ...    -0.400242  0.141753 -0.113321   

         93        94        95        96        97        98        99  
0 -0.237479 -0.728287  0

### Label Binerizer

In [22]:
all_labels = []
for i in train_labels:
    all_labels.extend(i)
for i in test_labels:
    all_labels.extend(i)

In [24]:
mlb = MultiLabelBinarizer(classes=list(set(all_labels)))
train_labels_bin = mlb.fit_transform(train_labels)
test_labels_bin = mlb.transform(test_labels)

In [25]:
# Classifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(train_matrix, train_labels_bin)
y_pred = classifier.predict(test_matrix) 
y_pred

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
predictions = mlb.inverse_transform(y_pred)

In [27]:
confusion = pd.DataFrame({"Predicted": test_labels, "Actual":test_labels })

In [28]:
confusion

Unnamed: 0,Actual,Predicted
0,[acq],[acq]
1,"[interest, money-fx]","[interest, money-fx]"
2,"[acq, copper]","[acq, copper]"
3,"[interest, money-fx]","[interest, money-fx]"
4,[acq],[acq]
5,[acq],[acq]
6,"[interest, money-fx]","[interest, money-fx]"
7,[acq],[acq]
8,[acq],[acq]
9,[acq],[acq]


In [29]:
def evaluate(test_labels, predictions):
    precision = precision_score(test_labels, predictions, average='micro')
    recall = recall_score(test_labels, predictions, average='micro')
    f1 = f1_score(test_labels, predictions, average='micro')
    print("Micro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    precision = precision_score(test_labels, predictions, average='macro')
    recall = recall_score(test_labels, predictions, average='macro')
    f1 = f1_score(test_labels, predictions, average='macro')
    print("Macro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
    
    precision = precision_score(test_labels, predictions, average='samples')
    recall = recall_score(test_labels, predictions, average='samples')
    f1 = f1_score(test_labels, predictions, average='samples')
    print("samples-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

In [30]:
evaluate(y_pred, test_labels_bin)

Micro-average quality numbers
Precision: 0.8842, Recall: 0.9830, F1-measure: 0.9310
Macro-average quality numbers
Precision: 0.0633, Recall: 0.0791, F1-measure: 0.0682
samples-average quality numbers
Precision: 0.9425, Recall: 0.9889, F1-measure: 0.9544


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
