# Text Mining – Assignment 2

In [1]:
Name = "Anand Kumar Shanmugam"

Number = "87621"

email = "shanmu01@gw.uni-passau.de"

In [2]:
from collections import OrderedDict
import warnings
warnings.filterwarnings("ignore")

In [3]:
try:
    import sklearn
except ModuleNotFoundError:
    !pip install scikit-learn   

try:
    import nltk
except ModuleNotFoundError:
    !pip install nltk

try:
    import numpy as np
except ModuleNotFoundError:
    !pip install numpy

In [4]:

nltk_packages = [
    ("stopwords","corpora/stopwords"),
    ("wordnet","corpora/wordnet"),
    ("reuters", "corpora/reuters.zip"),
    ("punkt", "tokenizers/punkt"),
]

for pid, fid in nltk_packages:
    try:
        nltk.data.find(fid)
    except LookupError:
        nltk.download(pid)

[nltk_data] Downloading package stopwords to C:\Users\Ram
[nltk_data]     Prasanth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Ram
[nltk_data]     Prasanth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package reuters to C:\Users\Ram
[nltk_data]     Prasanth\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to C:\Users\Ram
[nltk_data]     Prasanth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## Setting up corpus

In [5]:
from nltk.corpus import reuters

## Setting up train/test data

In [6]:
train_documents, train_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [7]:
all_categories = sorted(list(set(reuters.categories())))

## Feature Engineering

### Preprocessing

In [10]:
train_tokens = [nltk.word_tokenize(text) for text in train_documents]
test_tokens = [nltk.word_tokenize(text) for text in test_documents]

#Removing Numbers, Special Characters excluding Alphabets
import re
train_tokens = [[w for w in tokens if re.match("[a-zA-Z]+",w)] for tokens in train_tokens] 
test_tokens = [[w for w in tokens if re.match("[a-zA-Z]+",w)] for tokens in test_tokens]

#Stemming Eg:OPERATE => OPER
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
train_tokens = [[stemmer.stem(w) for w in tokens if len(stemmer.stem(w))>2] for tokens in train_tokens] 
test_tokens = [[stemmer.stem(w) for w in tokens if len(stemmer.stem(w))>2] for tokens in test_tokens]

#Removing Stopwords
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

train_tokens = [[w for w in tokens if w not in stop_words] for tokens in train_tokens] 
test_tokens = [[w for w in tokens if w not in stop_words] for tokens in test_tokens]
print(len(train_tokens))

7769


In [11]:
freq = nltk.FreqDist(token for tokens in train_tokens for token in tokens)

In [12]:
word_features = [w for w, _ in freq.most_common(100)]
print(len(word_features))

100


### Feature Extraction

In [13]:
X_train = [[1 if w in tokens else 0 for w in word_features] for tokens in train_tokens]
X_test  = [[1 if w in tokens else 0 for w in word_features] for tokens in test_tokens]

### Multi Label Binarizing

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()
mlb.fit(train_categories + test_categories)

print("These are the all categories from the MultiLabelBinarizer:\n{}".format(", ".join(mlb.classes_)))

# Example:
example = mlb.transform([train_categories[6]])[0]
print("\nCategories: {}\nVector: {}".format(train_categories[6], example))
print("\nThe 0th entry represents the label '{}'".format(mlb.classes_[0]))


y_train = mlb.transform(train_categories)
y_test  = mlb.transform(test_categories)

These are the all categories from the MultiLabelBinarizer:
acq, alum, barley, bop, carcass, castor-oil, cocoa, coconut, coconut-oil, coffee, copper, copra-cake, corn, cotton, cotton-oil, cpi, cpu, crude, dfl, dlr, dmk, earn, fuel, gas, gnp, gold, grain, groundnut, groundnut-oil, heat, hog, housing, income, instal-debt, interest, ipi, iron-steel, jet, jobs, l-cattle, lead, lei, lin-oil, livestock, lumber, meal-feed, money-fx, money-supply, naphtha, nat-gas, nickel, nkr, nzdlr, oat, oilseed, orange, palladium, palm-oil, palmkernel, pet-chem, platinum, potato, propane, rand, rape-oil, rapeseed, reserves, retail, rice, rubber, rye, ship, silver, sorghum, soy-meal, soy-oil, soybean, strategic-metal, sugar, sun-meal, sun-oil, sunseed, tea, tin, trade, veg-oil, wheat, wpi, yen, zinc

Categories: ['acq', 'trade']
Vector: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 

### Classifier Training

In [15]:
from sklearn.svm import SVC, LinearSVC

In [16]:
clfs = OrderedDict()

for i, category in enumerate(all_categories):
    clf = LinearSVC()
    
    # We train each classifier individually, so we must use
    # only 0 or 1 as y_train.
    y_train_clf = [yt[i] for yt in y_train]
    
    # .fit() will train the model with the training data
    clf.fit(X_train, y_train_clf)
    
    clfs[category] = clf

### Classifier Evaluation

In [17]:
y_pred = np.zeros((len(y_test), len(all_categories)))

for i, (cat, clf) in enumerate(clfs.items()):
    y_pred[:, i] = clf.predict(X_test)

In [18]:
from sklearn import metrics

In [19]:
print("Accuracy : {:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
print("Precision: {:.4f}".format(metrics.precision_score(y_test, y_pred, average='macro')))
print("Recall   : {:.4f}".format(metrics.recall_score(y_test, y_pred, average='macro')))
print("F1-Score : {:.4f}".format(metrics.f1_score(y_test, y_pred, average='macro')))

Accuracy : 0.6211
Precision: 0.2524
Recall   : 0.1389
F1-Score : 0.1646


In [20]:
print(metrics.classification_report(y_true=y_test, y_pred=y_pred, target_names=mlb.classes_))

                 precision    recall  f1-score   support

            acq       0.92      0.83      0.87       719
           alum       0.00      0.00      0.00        23
         barley       0.00      0.00      0.00        14
            bop       0.62      0.33      0.43        30
        carcass       0.00      0.00      0.00        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       0.14      0.06      0.08        18
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.50      0.14      0.22        28
         copper       0.00      0.00      0.00        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.38      0.05      0.09        56
         cotton       0.00      0.00      0.00        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.38      0.21      0.27        28
            cpu       0.00    

### Pipeline

In [21]:
# Input sentence
example_text = "This example text should cover coconuts. Coconuts are available in plenty in Kerala"

# Tokenize
example_tokens = nltk.word_tokenize(example_text)

# Extract features
example_features = [[1 if w in example_tokens else 0 for w in word_features]]

# Do prediction
example_preds = [clf.predict(example_features)[0] for clf in clfs.values()]

# Convert predictions back to labels
example_labels = mlb.inverse_transform(np.array([example_preds]))

# Print labels
print("Example text: {}".format(example_text))
print("Example labels: {}".format(example_labels))

Example text: This example text should cover coconuts. Coconuts are available in plenty in Kerala
Example labels: [()]
