Multi-Label Classification on Reuters

In [2]:

import nltk
nltk_packages = [
    ("reuters", "corpora/reuters.zip"),
    ("punkt", "tokenizers/punkt")
]

for pid, fid in nltk_packages:
    try:
        nltk.data.find(fid)
    except LookupError:
        nltk.download(pid)

[nltk_data] Downloading package reuters to /Users/bhydemi/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [3]:
#import the dataset from nltk.corpus
from nltk.corpus import reuters

### GET DATA

#### The data imported from nltk is being splitted into train and test, so also their corresponding documents and categories.

In [4]:
train_documents, train_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [17]:
#import neccessary libraries 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
#filter warnings 
import warnings
warnings.filterwarnings("ignore")
#Assign stopwords to stopwords variables
stopwords = stopwords.words("english")
#Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

## FEATURE ENGINEERING, EXTRACTION AND TEXT PROCESSING

Bag of words was used for feature extraction.The total words in the whole document was extracted and frequency of each word was taken note of. From observation, words with minimum number of occurence of 137 resulted in a total feature of approximately 1000, which was used in modelling. This way, weights are being assigned to each words based on frequency.

In [18]:
def processing(documents):
    large_collection = []
    for document in documents:
        for text in document:
            #tokenize each article in a document
            word_tok = word_tokenize(text)
            #removal of stopwords 
            word_tok = [word for word in word_tok if word not in stopwords]
            #Lemmatizing of words
            word_stemed = [lemmatizer.lemmatize(word.lower()) for word in word_tok]
            #regex to remove numbers etc
            regex = re.compile('[a-zA-Z]+')
            all_words = list(filter(lambda token:regex.match(token),word_stemed))
            large_collection += all_words
    #Count the frequency of occurence of each word
    word_freq = Counter(large_collection)
    features = []
    for w in word_freq:
        if word_freq[w] >=137: 
            features.append(w)
    #generate a top 1000 features in terms of frequency
    return features

In [19]:
def get_features_set(document, features):
    #The frequnency of each word in the  whole corpus derived
    featureset=[]
    for text in document:
        current_text = word_tokenize(text.lower())
        text_stemmed = [lemmatizer.lemmatize(word) for word in current_text]
        regex = re.compile('[a-zA-Z]+')
        all_words = list(filter(lambda token:regex.match(token),text_stemmed))
        #get an array of zeros from the features
        feat = np.zeros(len(features))
        for word in all_words:
            #check for each word in the article in the top 1000 features generated.
            #if exist therefore its frequency is counted
            if word.lower() in features:
                index_val = features.index(word.lower())
                feat[index_val] += 1
        feat = list(feat)
        featureset.append(feat)
    # A matrix of frequency of top 1000 features' presence in each article per test and train
    return featureset

In [20]:
def gen_train_test_data(test_documents, train_documents, test_categories,train_categories):
    # The MultiLabelBinarizer will convert the categories from each article
    # into a vector with 0 or 1 depending if the article is from the category or not.
    mlb = MultiLabelBinarizer()
    mlb.fit(train_categories + test_categories)
    y_train = mlb.transform(train_categories)
    y_test = mlb.transform(test_categories)
    #Top 100 features of the corpus will be generated
    features = processing([test_documents, train_documents])
    #The train and test features will be generated
    X_train = get_features_set(train_documents, features)
    X_test = get_features_set(test_documents, features)
    
    return X_train, X_test, y_test, y_train, mlb, features

In [21]:
X_train, X_test, y_test, y_train, mlb, features = gen_train_test_data(test_documents, train_documents, 
                                                                      test_categories, train_categories)

In [10]:
#converted the data into a numpy array
X_train = np.array(X_train)
X_test = np.array(X_test)

### MODELLING USING KERAS

A simple neural network was used to create the classifier. With an output activation Sigmoid due to the nature of Multilabel classification i.e binary classification. Due to simplicity Keras was chosen in implementing the Neural networks and it's simplicity in the learning algorithm.
Neural network with inpute node of 1002 and output node of 90.

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
model = Sequential()
model.add(Dense(50, activation='relu', input_dim=X_train.shape[1]))
#Dropout to prevent overfitting
model.add(Dropout(0.1))
model.add(Dense(30, activation='relu'))
#Dropout to prevent overfitting#Dropout to prevent overfitting

model.add(Dropout(0.1))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=2, save_best_only=True) # save best model
model.fit(X_train,y_train,validation_data=(X_test,y_test),verbose=0,epochs=400, batch_size=50)

Using TensorFlow backend.


<keras.callbacks.History at 0x1116be518>

## INTERPRETATION OF RESULTS

The accuracy, recall, precision and f1-score metrics are being used to determine the performance of the model.
The macro-average is being used because we want to determine how the model performs overall across the whole corpus.

Accuracy of 78% doesn't provide us with enough detail in terms of model performance.It is simply a ratio of correctly predicted observation to the total observations.

A precision of 51.69% meaning, 51.69% of all predicted categories actually belong to the category. This relates inversely to false positive rate.

A recall of 31.78 %, meaning of all actual categories only 31.78% was predicted right. This relates to false negative.

F1-score is the weighted average of both precision and recall. Therefore, this score takes both false positives and false negatives into account.

It's difficult to acheive high precision and recall due to trade-offs between the two metrics. For example in our results a well above averge was gotten for precision while below average recall



In [12]:
from sklearn import metrics
preds = model.predict(X_test)
preds[preds>=0.5] = 1
preds[preds<0.5] = 0
score = metrics.accuracy_score(preds, y_test)
print("Final accuracy: {}".format(score))

Final accuracy: 0.7797283868830739


In [13]:
print("Accuracy : {:.4f}".format(metrics.accuracy_score(y_test, preds)))
print("Precision: {:.4f}".format(metrics.precision_score(y_test, preds, average='macro')))
print("Recall   : {:.4f}".format(metrics.recall_score(y_test, preds, average='macro')))
print("F1-Score : {:.4f}".format(metrics.f1_score(y_test, preds, average='macro')))

Accuracy : 0.7797
Precision: 0.5221
Recall   : 0.3321
F1-Score : 0.3885


In [14]:
print(metrics.classification_report(y_true=y_test, y_pred=preds, target_names=mlb.classes_))

                 precision    recall  f1-score   support

            acq       0.96      0.93      0.95       719
           alum       0.83      0.22      0.34        23
         barley       0.86      0.43      0.57        14
            bop       0.75      0.60      0.67        30
        carcass       0.40      0.22      0.29        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       0.94      0.89      0.91        18
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.93      0.96      0.95        28
         copper       1.00      0.72      0.84        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.84      0.77      0.80        56
         cotton       0.92      0.60      0.73        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.67      0.36      0.47        28
            cpu       0.00    

## PIPELINE

In [15]:
def pipeline(text):
    # Extract features
    current_text = word_tokenize(text.lower())
    text_stemmed = [lemmatizer.lemmatize(word) for word in current_text]
    regex = re.compile('[a-zA-Z]+')
    all_words = list(filter(lambda token:regex.match(token),text_stemmed))
    feat = np.zeros(len(features))
    for word in all_words:
        if word.lower() in features:
            index_val = features.index(word.lower())
            feat[index_val] += 1
    feat = np.asarray([feat])
    
    # # Do prediction
    pred = model.predict(feat)
    pred[pred>=0.5] = 1
    pred[pred<0.5] = 0

    # Convert predictions back to labels
    labels = mlb.inverse_transform(pred)
    return labels

In [16]:
pipeline(train_documents[111])

[('money-supply',)]

## CONCLUSION

Feature extraction and engineering was done using bag of words by getting the top 1000 words based on frequency. The lables were converted by using the MultiLabelBinarizer from sklearn to a vector with 0 or 1 depending if the article is from the category or not. The features alongside the lable were passed into a 3 layer Neural network. The model derived was then evaluated based on precision, accuracy, recall and f1-score. The resulting accuracy gotten was 78.11%, precision 51.7%, Recall of 31.87%, and F1-score of 37.3%. 


 Large amount of data would improve on this approach because neural networks perform well with large dataset.Futher works can be done in area of tuning the number of nodes and layers in the Neural network.
