In [1]:
import time
# Wall-clock reference point: the "Time taken" cells further down print
# the number of seconds elapsed since this assignment ran.
start=time.time()

# Parsing Reuters 21578

In [2]:
import html
import pprint
import re
import warnings
from html.parser import HTMLParser

try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
warnings.filterwarnings("ignore")

class ReutersParser(HTMLParser):
    """
    ReutersParser subclasses HTMLParser and is used to read the SGML
    files associated with the Reuters-21578 categorised test collection.

    ``parse`` is a generator that yields one ``(topics, body)`` tuple per
    <REUTERS> element. Because the input is fed to the underlying parser
    in chunks, the class has to remember which tags are currently open;
    that is the purpose of the in_body, in_topics and in_topic_d flags.
    """
    def __init__(self, encoding='latin-1'):
        """
        Initialise the superclass (HTMLParser) and reset the parser state.

        encoding -- codec used to decode the raw bytes handed to
                    ``parse`` (latin-1 by default).
        """
        super().__init__()
        self._reset()
        self.encoding = encoding
        # Queue of finished documents; created here so that feed() can
        # be used safely even before parse() has been called.
        self.docs = []

    def _reset(self):
        """
        Clear the per-document state. Called on initialisation, at the
        start of every ``parse`` call and each time a topic-body tuple
        has been emitted, so that the next tuple starts from scratch.
        """
        self.in_body = False
        self.in_topics = False
        self.in_topic_d = False
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def _drain(self):
        """Return the documents finished so far and empty the queue."""
        ready, self.docs = self.docs, []
        return ready

    def parse(self, fd):
        """
        Decode and feed ``fd`` to the parser chunk by chunk, yielding each
        completed ``(topics, body)`` tuple as soon as it is available.

        fd -- an iterable of ``bytes`` chunks, e.g. a file opened in
              binary mode.
        """
        # Start every input from a clean slate so that a truncated or
        # abandoned previous parse cannot leak state into this one.
        self.reset()
        self._reset()
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            yield from self._drain()
        # close() forces processing of whatever is still buffered; hand
        # over any document completed by that final flush as well.
        self.close()
        yield from self._drain()

    def handle_starttag(self, tag, attrs):
        """
        Record that a tag of interest has been entered by setting the
        matching state flag. All other tags (including <REUTERS> itself)
        need no action on opening.
        """
        if tag == "body":
            self.in_body = True
        elif tag == "topics":
            self.in_topics = True
        elif tag == "d":
            # Every <D> is captured, not only those inside <TOPICS>
            # (in_topics is recorded but never consulted), so entries of
            # any other <D>-based list end up in self.topics as well.
            self.in_topic_d = True

    def handle_endtag(self, tag):
        """
        React to a closing tag.

        If the tag is a <REUTERS> tag, every run of white-space in the
        body is collapsed to a single space, the topic-body tuple is
        queued in ``self.docs`` and the per-document state is reset.

        If the tag is a <BODY> or <TOPICS> tag then we simply set
        the corresponding state flag back to False.

        If the tag is a <D> tag, the text collected for it is appended
        to the "topics" list and the collector is cleared.
        """
        if tag == "reuters":
            self.body = re.sub(r'\s+', r' ', self.body)
            self.docs.append( (self.topics, self.body) )
            self._reset()
        elif tag == "body":
            self.in_body = False
        elif tag == "topics":
            self.in_topics = False
        elif tag == "d":
            self.in_topic_d = False
            self.topics.append(self.topic_d)
            self.topic_d = ""

    def handle_data(self, data):
        """
        Append text to the body while inside <BODY>, otherwise to the
        current <D> entry while inside <D>; any other text is ignored.
        """
        if self.in_body:
            self.body += data
        elif self.in_topic_d:
            self.topic_d += data




# Extracting the topics from Reuters 21578

In [3]:
def obtain_topic_tags(path="all-topics-strings.lc.txt"):
    """
    Read the topic list file and return the topic names, one per line,
    with surrounding white-space (including the trailing newline)
    stripped from each entry.

    path -- location of the topic list; defaults to the file name this
            notebook reads from the working directory.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as topic_file:
        return [line.strip() for line in topic_file]

# Filtering docs by topics

In [4]:
def filter_doc_list_through_topics(topics, docs):
    """
    Reduce each ``(tags, body)`` document to a single ``(topic, body)``
    two-tuple.

    For every document the first tag that appears in ``topics`` becomes
    its label. Tags that are not in ``topics`` (e.g. geographic
    features) are ignored, and documents with no tags at all, or with
    no tag found in ``topics``, are dropped.

    topics -- iterable of accepted topic names (hashable, e.g. strings).
    docs   -- iterable of sequences whose item 0 is the tag list and
              whose item 1 is the body text.

    Returns the list of ``(topic, body)`` tuples in input order.
    """
    # Set lookup keeps the membership test constant-time per tag.
    wanted = set(topics)
    ref_docs = []
    for d in docs:
        tags = d[0]
        # Covers [], "" and None alike.
        if not tags:
            continue
        for t in tags:
            if t in wanted:
                ref_docs.append((t, d[1]))
                break
    return ref_docs

In [5]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
 
cachedStopWords = stopwords.words("english")
 
def tokenize(text):
    """
    Turn raw text into a list of stemmed word tokens.

    The text is split with NLTK's word_tokenize and lower-cased, words
    found in cachedStopWords are dropped, the remainder is Porter-stemmed,
    and only stems that begin with an ASCII letter and are at least
    ``min_length`` characters long are returned.
    """
    min_length = 3
    # One stemmer per call instead of a new instance for every token.
    stem = PorterStemmer().stem
    # match() anchors at the start only, so a stem passes as soon as
    # its first character is a letter.
    starts_with_letter = re.compile('[a-zA-Z]+').match
    words = (word.lower() for word in word_tokenize(text))
    stems = (stem(word) for word in words
             if word not in cachedStopWords)
    return [token for token in stems
            if starts_with_letter(token) and len(token) >= min_length]

In [6]:
def create_corpus_and_topics(docs):
    """
    Split a sequence of ``(topic, body)`` pairs into two parallel lists.

    Returns ``(corpus, topics)``: the body texts first, then the topic
    labels, both in the order in which the pairs appear in ``docs``.
    """
    # Labels: item 0 of every pair.
    labels = []
    for record in docs:
        labels.append(record[0])

    # Texts: item 1 of every pair.
    texts = []
    for record in docs:
        texts.append(record[1])

    return texts, labels

In [7]:
if __name__ == "__main__":
    # The 22 SGML files reut2-000.sgm ... reut2-021.sgm are expected in
    # the working directory.
    files = ["reut2-%03d.sgm" % r for r in range(0, 22)]
    parser = ReutersParser()

    # Parse every file and collect all (topics, body) tuples in one list.
    docs = []
    for fn in files:
        # Binary mode: ReutersParser decodes each chunk itself. The
        # context manager closes the handle once the file is consumed.
        with open(fn, 'rb') as fd:
            docs.extend(parser.parse(fd))

    # Obtain the topic tags and filter docs through it
    topics = obtain_topic_tags()
    ref_docs = filter_doc_list_through_topics(topics, docs)

    # Show a summary and a small sample instead of dumping every
    # document into the cell output.
    print("Kept %d of %d documents" % (len(ref_docs), len(docs)))
    print(ref_docs[:3])



In [8]:
# Split the filtered (topic, body) pairs into two parallel lists: the
# document texts (corpus) and their labels (topics). No vectorisation
# happens here. Note that this rebinds `topics`, which until now held
# the list of all topic names.
corpus, topics = create_corpus_and_topics(ref_docs)

In [9]:
# Seconds elapsed since `start` was set, i.e. the cost of everything
# run so far (parsing, filtering and building the corpus).
end=time.time()
print("Time taken: ",end-start, "seconds")

Time taken:  34.232715368270874 seconds


# Implementing with test_size = 0.2, i.e. 20% of the corpus is held out as the test set

In [10]:
# Create the training-test split of the data
corpus_train,corpus_test,topics_train,topics_test = train_test_split(
        corpus, topics, test_size=0.2, random_state=42
    )

In [11]:
# Number of training documents.
len(corpus_train)

9093

In [12]:
# Number of held-out test documents.
len(corpus_test)

2274

In [13]:
# Sanity check: one label per training document, so this should equal len(corpus_train).
len(topics_train)

9093

In [14]:
# Sanity check: one label per test document, so this should equal len(corpus_test).
len(topics_test)

2274

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline    
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
# Two-step pipeline: raw text -> TF-IDF features -> Naive Bayes classifier.
pipeline_bayes = Pipeline([
                # tokenizer=tokenize plugs in the NLTK-based tokenizer defined
                # earlier. min_df=3 drops terms seen in fewer than 3 documents,
                # max_df=0.90 drops terms seen in more than 90% of them, and
                # max_features=3000 caps the vocabulary at the 3000 most
                # frequent remaining terms. sublinear_tf replaces tf with
                # 1 + log(tf); norm='l2' scales each document vector to unit length.
                ('tfidf', TfidfVectorizer(tokenizer=tokenize, min_df=3,
                        max_df=0.90, max_features=3000,
                        use_idf=True, sublinear_tf=True,
                        norm='l2')),
                # One binary MultinomialNB per topic; class priors are
                # learned from the training data (fit_prior=True).
                ('clf',OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
# Learns the TF-IDF vocabulary and the classifier from the training split only.
pipeline_bayes.fit(corpus_train,topics_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=3000, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...assifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1))])

In [16]:
# 5-fold cross-validation on the training split only. cross_val_score
# fits a fresh clone of the pipeline on each fold, so the fitted
# pipeline_bayes above is left untouched. With no `scoring` argument the
# estimator's own score method is used (mean accuracy for classifiers).
cv_score_naive_bayes = cross_val_score(pipeline_bayes, corpus_train, topics_train, cv=5)
print("The cross validation score for Naive Bayes is:")
print(cv_score_naive_bayes)

The cross validation score for Naive Bayes is:
[0.70635776 0.7022319  0.70507166 0.71802002 0.72706935]


In [17]:
# Predict one topic per held-out document with the pipeline fitted above.
predicted_topics= pipeline_bayes.predict(corpus_test)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

accuracy = accuracy_score(topics_test, predicted_topics)
# average='weighted': per-class scores are averaged, weighted by each
# class's support (its number of true instances). The fourth return value
# (support) is None when an average is requested, hence the `_`.
# Support-weighted recall equals overall accuracy for single-label
# predictions, which is why the two printed values coincide.
precision, recall, f1_score, _ = precision_recall_fscore_support(topics_test, predicted_topics, average='weighted')

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score: ", f1_score)

Accuracy:  0.7484608619173263
Precision:  0.7054023383976247
Recall:  0.7484608619173263
F1 score:  0.6969756712370362


In [18]:
# This import is also what makes the `sklearn.metrics...` calls in the
# later confusion-matrix cells resolvable.
import sklearn
# Rows are true topics, columns are predicted topics. With labels=None
# the classes present in either argument are used in sorted order;
# labels=None and sample_weight=None are the defaults, spelled out.
sklearn.metrics.confusion_matrix(topics_test,predicted_topics, labels=None, sample_weight=None)

array([[423,   0,   0, ...,   0,   0,   0],
       [  8,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [19]:
# Group the true and the predicted labels by document text, so that test
# entries sharing an identical body end up on a single row.
temp_test={}
temp_predicted={}
for body, true_topic, predicted_topic in zip(corpus_test, topics_test, predicted_topics):
    # setdefault creates the list on first sight of a body; both dicts
    # receive their keys in the same order, which keeps the rows aligned.
    temp_test.setdefault(body, []).append(true_topic)
    temp_predicted.setdefault(body, []).append(predicted_topic)

# Keep only the distinct labels. Sorting makes the label order
# reproducible; the iteration order of a set of strings can differ
# from one interpreter run to the next.
for body in temp_test:
    temp_test[body]=sorted(set(temp_test[body]))
    temp_predicted[body]=sorted(set(temp_predicted[body]))

import pandas as pd
pd.DataFrame(
    {'Test docs': list(temp_test.keys()),
     'Test Topics': list(temp_test.values()),
     'Predicted test topics': list(temp_predicted.values())
    })

Unnamed: 0,Predicted test topics,Test Topics,Test docs
0,[earn],[earn],Shr 41 cts vs 80 cts Net 40.3 mln vs 78.9 mln ...
1,[grain],[grain],"Jamaica bought U.S. corn, wheat and rice at it..."
2,[earn],[earn],Shr primary 28 cts vs 22 cts Shr diluted 26 ct...
3,[acq],[acq],British Petroleum Co PLC said it has scheduled...
4,[grain],[grain],"China has added 90,000 tonnes of U.S. wheat to..."
5,[acq],[acq],Continental Materials Corp said its directors ...
6,[earn],[earn],Shr profit four cts vs loss 58 cts Net profit ...
7,[money-fx],[money-fx],Bank of Japan Governor Satoshi Sumita said in ...
8,[trade],[trade],Denmark's balance of payments on current accou...
9,[earn],[earn],The <British Petroleum Co of Australia Ltd> re...


# Note: For example, in row 27 of the above data frame, 'earn' in the predicted column implies that each of the topics in the topics column for the corresponding article was predicted as 'earn'.

In [20]:
# `start` has not been reset since the first cell, so this figure covers
# everything run so far, including the test_size = 0.2 experiment.
end=time.time()
print("Time taken: ",end-start, "seconds")

Time taken:  623.3920772075653 seconds


In [21]:
# Restart the timer so the next "Time taken" covers only the test_size = 0.4 experiment.
start=time.time()

# Implementing with test_size = 0.4, i.e. 40% of the corpus is held out as the test set

In [22]:
# Re-split with 40% held out for testing. This overwrites the train/test
# variables of the previous experiment; random_state keeps the shuffle
# reproducible.
corpus_train,corpus_test,topics_train,topics_test = train_test_split(
        corpus, topics, test_size=0.4, random_state=42
    )

In [23]:
# Same pipeline configuration as in the test_size = 0.2 experiment,
# rebuilt so that it is fitted from scratch on the new training split.
pipeline_bayes = Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=tokenize, min_df=3,
                        max_df=0.90, max_features=3000,
                        use_idf=True, sublinear_tf=True,
                        norm='l2')),
                ('clf',OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
pipeline_bayes.fit(corpus_train,topics_train)

# Predict one topic per held-out document.
predicted_topics= pipeline_bayes.predict(corpus_test)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

accuracy = accuracy_score(topics_test, predicted_topics)
# average='weighted' weights each class's score by its support; the
# fourth return value (support) is None in that mode, hence the `_`.
precision, recall, f1_score, _ = precision_recall_fscore_support(topics_test, predicted_topics, average='weighted')

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score: ", f1_score)

Accuracy:  0.7248735429953815
Precision:  0.690626017087693
Recall:  0.7248735429953815
F1 score:  0.6697699066626455


In [24]:
# 5-fold cross-validation on the 60% training split; each fold fits a
# fresh clone of the pipeline, leaving the fitted pipeline_bayes untouched.
cv_score_naive_bayes = cross_val_score(pipeline_bayes, corpus_train, topics_train, cv=5)
print("The cross validation score for Naive Bayes is:")
print(cv_score_naive_bayes)

The cross validation score for Naive Bayes is:
[0.67834167 0.68627451 0.69801616 0.70653789 0.72326103]


In [25]:
# Rows are true topics, columns are predicted topics (test_size = 0.4
# split). Relies on the `import sklearn` executed in an earlier cell.
sklearn.metrics.confusion_matrix(topics_test,predicted_topics, labels=None, sample_weight=None)

array([[864,   0,   0, ...,   0,   0,   0],
       [ 10,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  3,   0,   0, ...,   0,   0,   0]])

In [26]:
# Group the true and the predicted labels by document text, so that test
# entries sharing an identical body end up on a single row.
temp_test={}
temp_predicted={}
for body, true_topic, predicted_topic in zip(corpus_test, topics_test, predicted_topics):
    # setdefault creates the list on first sight of a body; both dicts
    # receive their keys in the same order, which keeps the rows aligned.
    temp_test.setdefault(body, []).append(true_topic)
    temp_predicted.setdefault(body, []).append(predicted_topic)

# Keep only the distinct labels. Sorting makes the label order
# reproducible; the iteration order of a set of strings can differ
# from one interpreter run to the next.
for body in temp_test:
    temp_test[body]=sorted(set(temp_test[body]))
    temp_predicted[body]=sorted(set(temp_predicted[body]))

pd.DataFrame(
    {'Test docs': list(temp_test.keys()),
     'Test Topics': list(temp_test.values()),
     'Predicted test topics': list(temp_predicted.values())
    })

Unnamed: 0,Predicted test topics,Test Topics,Test docs
0,[earn],[earn],Shr 41 cts vs 80 cts Net 40.3 mln vs 78.9 mln ...
1,[grain],[grain],"Jamaica bought U.S. corn, wheat and rice at it..."
2,[earn],[earn],Shr primary 28 cts vs 22 cts Shr diluted 26 ct...
3,[acq],[acq],British Petroleum Co PLC said it has scheduled...
4,[grain],[grain],"China has added 90,000 tonnes of U.S. wheat to..."
5,[acq],[acq],Continental Materials Corp said its directors ...
6,[earn],[earn],Shr profit four cts vs loss 58 cts Net profit ...
7,[money-fx],[money-fx],Bank of Japan Governor Satoshi Sumita said in ...
8,[trade],[trade],Denmark's balance of payments on current accou...
9,[earn],[earn],The <British Petroleum Co of Australia Ltd> re...


In [27]:
# Seconds elapsed since `start` was reset at the beginning of the
# test_size = 0.4 experiment.
end=time.time()
print("Time taken: ",end-start, "seconds")

Time taken:  455.6999111175537 seconds


In [28]:
# Restart the timer so the next "Time taken" covers only the test_size = 0.1 experiment.
start=time.time()

# Implementing with test_size = 0.1, i.e. 10% of the corpus is held out as the test set

In [29]:
# Re-split with 10% held out for testing. This overwrites the train/test
# variables of the previous experiment; random_state keeps the shuffle
# reproducible.
corpus_train,corpus_test,topics_train,topics_test = train_test_split(
        corpus, topics, test_size=0.1, random_state=42
    )

In [30]:
# Same pipeline configuration as in the earlier experiments, rebuilt so
# that it is fitted from scratch on the new training split.
pipeline_bayes = Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=tokenize, min_df=3,
                        max_df=0.90, max_features=3000,
                        use_idf=True, sublinear_tf=True,
                        norm='l2')),
                ('clf',OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
pipeline_bayes.fit(corpus_train,topics_train)



# Predict one topic per held-out document.
predicted_topics= pipeline_bayes.predict(corpus_test)

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

accuracy = accuracy_score(topics_test, predicted_topics)
# average='weighted' weights each class's score by its support; the
# fourth return value (support) is None in that mode, hence the `_`.
precision, recall, f1_score, _ = precision_recall_fscore_support(topics_test, predicted_topics, average='weighted')

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score: ", f1_score)

Accuracy:  0.7475813544415127
Precision:  0.7148719105156288
Recall:  0.7475813544415127
F1 score:  0.6963127102059349


In [31]:
# 5-fold cross-validation on the 90% training split; each fold fits a
# fresh clone of the pipeline, leaving the fitted pipeline_bayes untouched.
cv_score_naive_bayes = cross_val_score(pipeline_bayes, corpus_train, topics_train, cv=5)
print("The cross validation score for Naive Bayes is:")
print(cv_score_naive_bayes)

The cross validation score for Naive Bayes is:
[0.71744354 0.7131068  0.71421569 0.72405312 0.72519841]


In [32]:
# Rows are true topics, columns are predicted topics (test_size = 0.1
# split). Relies on the `import sklearn` executed in an earlier cell.
sklearn.metrics.confusion_matrix(topics_test,predicted_topics, labels=None, sample_weight=None)

array([[227,   0,   0, ...,   0,   0,   0],
       [  6,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [33]:
# Group the true and the predicted labels by document text, so that test
# entries sharing an identical body end up on a single row.
temp_test={}
temp_predicted={}
for body, true_topic, predicted_topic in zip(corpus_test, topics_test, predicted_topics):
    # setdefault creates the list on first sight of a body; both dicts
    # receive their keys in the same order, which keeps the rows aligned.
    temp_test.setdefault(body, []).append(true_topic)
    temp_predicted.setdefault(body, []).append(predicted_topic)

# Keep only the distinct labels. Sorting makes the label order
# reproducible; the iteration order of a set of strings can differ
# from one interpreter run to the next.
for body in temp_test:
    temp_test[body]=sorted(set(temp_test[body]))
    temp_predicted[body]=sorted(set(temp_predicted[body]))

pd.DataFrame(
    {'Test docs': list(temp_test.keys()),
     'Test Topics': list(temp_test.values()),
     'Predicted test topics': list(temp_predicted.values())
    })

Unnamed: 0,Predicted test topics,Test Topics,Test docs
0,[earn],[earn],Shr 41 cts vs 80 cts Net 40.3 mln vs 78.9 mln ...
1,[grain],[grain],"Jamaica bought U.S. corn, wheat and rice at it..."
2,[earn],[earn],Shr primary 28 cts vs 22 cts Shr diluted 26 ct...
3,[acq],[acq],British Petroleum Co PLC said it has scheduled...
4,[grain],[grain],"China has added 90,000 tonnes of U.S. wheat to..."
5,[acq],[acq],Continental Materials Corp said its directors ...
6,[earn],[earn],Shr profit four cts vs loss 58 cts Net profit ...
7,[money-fx],[money-fx],Bank of Japan Governor Satoshi Sumita said in ...
8,[trade],[trade],Denmark's balance of payments on current accou...
9,[earn],[earn],The <British Petroleum Co of Australia Ltd> re...


In [34]:
# Seconds elapsed since `start` was reset at the beginning of the
# test_size = 0.1 experiment.
end=time.time()
print("Time taken: ",end-start, "seconds")

Time taken:  638.039274930954 seconds
