Perform text classification on the **Reuters corpus**.  

In [1]:
import time
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

In [2]:
# Load Reuters data
# Collect every file id, then map each id to its raw text and to its category list
documents = reuters.fileids()
texts = list(map(reuters.raw, documents))
labels = list(map(reuters.categories, documents))

In [3]:
# Number of raw documents loaded from the Reuters corpus
len(texts)

10788

In [5]:
# Category lists of the first three documents (a document may carry several categories)
labels[:3]

[['trade'], ['grain'], ['crude', 'nat-gas']]

In [6]:
# Convert to DataFrame
# Whole corpus in tabular form: one row per document with its text and category list
df = pd.DataFrame(dict(text=texts, labels=labels))

# Split into train and test sets
# File ids are partitioned by their 'train' / 'test' prefix
train_docs = [fid for fid in documents if fid.startswith('train')]
test_docs = [fid for fid in documents if fid.startswith('test')]

# Raw texts and category lists for each partition, in file-id order
X_train, X_test = (list(map(reuters.raw, ids)) for ids in (train_docs, test_docs))
y_train, y_test = (list(map(reuters.categories, ids)) for ids in (train_docs, test_docs))

In [7]:
# Sizes of the train texts, train labels, test texts and test labels (in that order)
for split_part in (X_train, y_train, X_test, y_test):
    print(len(split_part))

7769
7769
3019
3019


In [8]:
# Category lists of the first ten training documents
y_train[:10]

[['cocoa'],
 ['acq'],
 ['money-supply'],
 ['acq'],
 ['earn'],
 ['earn'],
 ['acq', 'trade'],
 ['earn'],
 ['crude', 'nat-gas'],
 ['cocoa', 'coffee', 'sugar']]

In [9]:
# MultiLabel Binarizer
# Fit the category -> column mapping on the training labels only and reuse it for the
# test labels. A second binarizer fitted on y_test would derive its own class list, so
# the columns of y_test_bin would stop lining up with y_train_bin (and with the model's
# predictions) whenever the two splits do not contain exactly the same categories.
mlb = MultiLabelBinarizer()
y_train_bin = mlb.fit_transform(y_train)
y_test_bin = mlb.transform(y_test)

In [10]:
# First two rows of the binarized training label matrix
y_train_bin[:2]

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]])

In [11]:
# Preprocessing methods

# Preprocessing-1: Basic
# Convert to lowercase
# Word tokenization
def basic_preprocessing(text):
    """Lowercase `text`, tokenize it with NLTK and return the tokens joined by single spaces."""
    lowered = text.lower()
    return ' '.join(nltk.word_tokenize(lowered))

# Preprocessing-2: Advanced-Lemmatization
# Convert to lowercase
# Word tokenization
# Stopword Removal
# Punctuation Removal
# Lemmatization
@lru_cache(maxsize=None)
def _lemmatization_resources():
    """Build the WordNet lemmatizer and the English stopword set once and reuse them."""
    return nltk.WordNetLemmatizer(), frozenset(nltk.corpus.stopwords.words('english'))


def advanced_lemmatization(text):
    """Lowercase, tokenize, filter and lemmatize `text`.

    Tokens are kept only if they are purely alphanumeric (this drops punctuation tokens)
    and are not English stopwords; each kept token is lemmatized.

    Args:
        text: Raw document text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # The lemmatizer and stopword set do not depend on `text`; building them per call
    # re-read the stopword list for every document in the corpus.
    lemmatizer, stopwords = _lemmatization_resources()
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalnum() and word not in stopwords
    )

# Preprocessing-3: Advanced-Stemming
# Convert to lowercase
# Word tokenization
# Stopword Removal
# Punctuation Removal
# Stemming using Porter stemmer algorithm
@lru_cache(maxsize=None)
def _stemming_resources():
    """Build the Porter stemmer and the English stopword set once and reuse them."""
    return nltk.PorterStemmer(), frozenset(nltk.corpus.stopwords.words('english'))


def advanced_stemming(text):
    """Lowercase, tokenize, filter and Porter-stem `text`.

    Tokens are kept only if they are purely alphanumeric (this drops punctuation tokens)
    and are not English stopwords; each kept token is stemmed.

    Args:
        text: Raw document text.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # The stemmer and stopword set do not depend on `text`; building them per call
    # re-read the stopword list for every document in the corpus.
    stemmer, stopwords = _stemming_resources()
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(
        stemmer.stem(word)
        for word in tokens
        if word.isalnum() and word not in stopwords
    )

In [12]:
# Apply preprocessing
# Run each of the three pipelines over the training texts and the test texts
X_train_basic = list(map(basic_preprocessing, X_train))
X_test_basic = list(map(basic_preprocessing, X_test))
X_train_lemmatized = list(map(advanced_lemmatization, X_train))
X_test_lemmatized = list(map(advanced_lemmatization, X_test))
X_train_stemmed = list(map(advanced_stemming, X_train))
X_test_stemmed = list(map(advanced_stemming, X_test))

In [13]:
# Vectorization (TF-IDF)
# One vectorizer per preprocessing variant. Refitting a single shared vectorizer threw
# away the fitted vocabulary of the earlier variants, so they could no longer be
# inspected or used to transform new text, and running the lines out of order would
# pair a matrix with the wrong vocabulary.
tfidf_basic = TfidfVectorizer()
X_train_tfidf_basic = tfidf_basic.fit_transform(X_train_basic)
X_test_tfidf_basic = tfidf_basic.transform(X_test_basic)

tfidf_lemmatized = TfidfVectorizer()
X_train_tfidf_lemmatized = tfidf_lemmatized.fit_transform(X_train_lemmatized)
X_test_tfidf_lemmatized = tfidf_lemmatized.transform(X_test_lemmatized)

tfidf_stemmed = TfidfVectorizer()
X_train_tfidf_stemmed = tfidf_stemmed.fit_transform(X_train_stemmed)
X_test_tfidf_stemmed = tfidf_stemmed.transform(X_test_stemmed)

# Backward-compatible name: the original shared vectorizer ended up fitted on the
# stemmed training texts.
tfidf_vectorizer = tfidf_stemmed

In [14]:
# Train classifiers using OneVsRestClassifier
# Unfitted base estimators, keyed by "<algorithm> (<preprocessing>)"
models = dict([
    ("Naive Bayes (Basic)", MultinomialNB()),
    ("SVM (Basic)", SVC()),
    ("Naive Bayes (Lemmatization)", MultinomialNB()),
    ("SVM (Lemmatization)", SVC()),
    ("Naive Bayes (Stemming)", MultinomialNB()),
    ("SVM (Stemming)", SVC()),
])

# (train matrix, test matrix) TF-IDF pair for each preprocessing variant
data_variants = dict(
    Basic=(X_train_tfidf_basic, X_test_tfidf_basic),
    Lemmatization=(X_train_tfidf_lemmatized, X_test_tfidf_lemmatized),
    Stemming=(X_train_tfidf_stemmed, X_test_tfidf_stemmed),
)

# Starts empty; holds the result rows
accuracy_results = list()

In [15]:
# Evaluation function
def evaluate_model(name, y_true, y_pred, target_names=None):
    """Print a classification report for the predictions and return their accuracy.

    Args:
        name: Label used in the header line printed above the report.
        y_true: Ground-truth targets.
        y_pred: Predicted targets, in the same layout as `y_true`.
        target_names: Optional display names for the report rows, forwarded to
            `classification_report`. Defaults to None (rows are labelled by index).

    Returns:
        The result of `accuracy_score(y_true, y_pred)`. For multi-label indicator
        input scikit-learn computes subset accuracy: a sample only counts as
        correct when every one of its labels is predicted correctly.
    """
    print(f"\n{name} Classification Report:")
    # zero_division=0 reports the same 0.0 scores as the default for labels that are
    # never predicted, but without emitting an UndefinedMetricWarning per metric.
    print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))
    return accuracy_score(y_true, y_pred)

In [16]:
# Train and evaluate Naive Bayes (Basic)
# The timer starts before the features are selected and stops after predict(), so the
# value reported as "Training Time" covers fitting and test-set prediction together.
start_time = time.time()
# Train/test TF-IDF matrices for the "Basic" preprocessing variant
X_train_tfidf, X_test_tfidf = data_variants["Basic"]
# MultinomialNB wrapped in OneVsRestClassifier and fit on the binarized multi-label targets
nb_basic = OneVsRestClassifier(MultinomialNB())
nb_basic.fit(X_train_tfidf, y_train_bin)
y_pred_nb_basic = nb_basic.predict(X_test_tfidf)
nb_basic_time = time.time() - start_time
# Prints the classification report and returns the accuracy of the predictions
nb_basic_acc = evaluate_model("Naive Bayes (Basic Preprocessing)", y_test_bin, y_pred_nb_basic)
print(f"Naive Bayes (Basic) Training Time: {nb_basic_time:.2f} seconds")


Naive Bayes (Basic Preprocessing) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.36      0.53       719
           1       0.00      0.00      0.00        23
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        30
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00        28
          10       0.00      0.00      0.00        18
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00        56
          13       0.00      0.00      0.00        20
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00        28
          16       0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Train and evaluate SVM (Basic)
# The timer spans fit() and predict(), so the reported "Training Time" includes
# test-set prediction.
start_time = time.time()
# Select the "Basic" features explicitly instead of relying on whatever an earlier
# cell left in X_train_tfidf / X_test_tfidf, so the cell is correct regardless of
# the order in which cells were executed.
X_train_tfidf, X_test_tfidf = data_variants["Basic"]
svm_basic = OneVsRestClassifier(SVC())
svm_basic.fit(X_train_tfidf, y_train_bin)
y_pred_svm_basic = svm_basic.predict(X_test_tfidf)
svm_basic_time = time.time() - start_time
svm_basic_acc = evaluate_model("SVM (Basic Preprocessing)", y_test_bin, y_pred_svm_basic)
print(f"SVM (Basic) Training Time: {svm_basic_time:.2f} seconds")


SVM (Basic Preprocessing) Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       719
           1       1.00      0.13      0.23        23
           2       1.00      0.57      0.73        14
           3       1.00      0.47      0.64        30
           4       1.00      0.11      0.20        18
           5       0.00      0.00      0.00         1
           6       1.00      0.67      0.80        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.96      0.86      0.91        28
          10       1.00      0.61      0.76        18
          11       0.00      0.00      0.00         1
          12       0.97      0.61      0.75        56
          13       1.00      0.35      0.52        20
          14       0.00      0.00      0.00         2
          15       1.00      0.32      0.49        28
          16       0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Train and evaluate Naive Bayes (Lemmatization)
# The timer starts before the features are selected and stops after predict(), so the
# value reported as "Training Time" covers fitting and test-set prediction together.
start_time = time.time()
# Train/test TF-IDF matrices for the "Lemmatization" preprocessing variant
X_train_tfidf, X_test_tfidf = data_variants["Lemmatization"]
# MultinomialNB wrapped in OneVsRestClassifier and fit on the binarized multi-label targets
nb_lemmatized = OneVsRestClassifier(MultinomialNB())
nb_lemmatized.fit(X_train_tfidf, y_train_bin)
y_pred_nb_lemmatized = nb_lemmatized.predict(X_test_tfidf)
nb_lemmatized_time = time.time() - start_time
# Prints the classification report and returns the accuracy of the predictions
nb_lemmatized_acc = evaluate_model("Naive Bayes (Lemmatization)", y_test_bin, y_pred_nb_lemmatized)
print(f"Naive Bayes (Lemmatization) Training Time: {nb_lemmatized_time:.2f} seconds")


Naive Bayes (Lemmatization) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.42      0.59       719
           1       0.00      0.00      0.00        23
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        30
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00        28
          10       0.00      0.00      0.00        18
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00        56
          13       0.00      0.00      0.00        20
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00        28
          16       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Train and evaluate SVM (Lemmatization)
# The timer spans fit() and predict(), so the reported "Training Time" includes
# test-set prediction.
start_time = time.time()
# Select the "Lemmatization" features explicitly instead of relying on whatever an
# earlier cell left in X_train_tfidf / X_test_tfidf, so the cell is correct
# regardless of the order in which cells were executed.
X_train_tfidf, X_test_tfidf = data_variants["Lemmatization"]
svm_lemmatized = OneVsRestClassifier(SVC())
svm_lemmatized.fit(X_train_tfidf, y_train_bin)
y_pred_svm_lemmatized = svm_lemmatized.predict(X_test_tfidf)
svm_lemmatized_time = time.time() - start_time
svm_lemmatized_acc = evaluate_model("SVM (Lemmatization)", y_test_bin, y_pred_svm_lemmatized)
print(f"SVM (Lemmatization) Training Time: {svm_lemmatized_time:.2f} seconds")


SVM (Lemmatization) Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       719
           1       1.00      0.22      0.36        23
           2       1.00      0.57      0.73        14
           3       1.00      0.43      0.60        30
           4       1.00      0.17      0.29        18
           5       0.00      0.00      0.00         1
           6       1.00      0.67      0.80        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.96      0.86      0.91        28
          10       1.00      0.56      0.71        18
          11       0.00      0.00      0.00         1
          12       0.97      0.61      0.75        56
          13       1.00      0.40      0.57        20
          14       0.00      0.00      0.00         2
          15       1.00      0.32      0.49        28
          16       0.00      0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
# Train and evaluate Naive Bayes (Stemming)
# The timer starts before the features are selected and stops after predict(), so the
# value reported as "Training Time" covers fitting and test-set prediction together.
start_time = time.time()
# Train/test TF-IDF matrices for the "Stemming" preprocessing variant
X_train_tfidf, X_test_tfidf = data_variants["Stemming"]
# MultinomialNB wrapped in OneVsRestClassifier and fit on the binarized multi-label targets
nb_stemmed = OneVsRestClassifier(MultinomialNB())
nb_stemmed.fit(X_train_tfidf, y_train_bin)
y_pred_nb_stemmed = nb_stemmed.predict(X_test_tfidf)
nb_stemmed_time = time.time() - start_time
# Prints the classification report and returns the accuracy of the predictions
nb_stemmed_acc = evaluate_model("Naive Bayes (Stemming)", y_test_bin, y_pred_nb_stemmed)
print(f"Naive Bayes (Stemming) Training Time: {nb_stemmed_time:.2f} seconds")


Naive Bayes (Stemming) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.43      0.60       719
           1       0.00      0.00      0.00        23
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        30
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00        28
          10       0.00      0.00      0.00        18
          11       0.00      0.00      0.00         1
          12       1.00      0.05      0.10        56
          13       0.00      0.00      0.00        20
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00        28
          16       0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
# Train and evaluate SVM (Stemming)
# The timer spans fit() and predict(), so the reported "Training Time" includes
# test-set prediction.
start_time = time.time()
# Select the "Stemming" features explicitly instead of relying on whatever an earlier
# cell left in X_train_tfidf / X_test_tfidf, so the cell is correct regardless of
# the order in which cells were executed.
X_train_tfidf, X_test_tfidf = data_variants["Stemming"]
svm_stemmed = OneVsRestClassifier(SVC())
svm_stemmed.fit(X_train_tfidf, y_train_bin)
y_pred_svm_stemmed = svm_stemmed.predict(X_test_tfidf)
svm_stemmed_time = time.time() - start_time
svm_stemmed_acc = evaluate_model("SVM (Stemming)", y_test_bin, y_pred_svm_stemmed)
print(f"SVM (Stemming) Training Time: {svm_stemmed_time:.2f} seconds")


SVM (Stemming) Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       719
           1       1.00      0.22      0.36        23
           2       1.00      0.57      0.73        14
           3       0.93      0.43      0.59        30
           4       1.00      0.17      0.29        18
           5       0.00      0.00      0.00         1
           6       1.00      0.67      0.80        18
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         3
           9       0.96      0.86      0.91        28
          10       1.00      0.67      0.80        18
          11       0.00      0.00      0.00         1
          12       0.95      0.62      0.75        56
          13       1.00      0.40      0.57        20
          14       0.00      0.00      0.00         2
          15       1.00      0.32      0.49        28
          16       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# Create accuracy comparison table
# One (model name, accuracy, elapsed seconds) row per trained model, built column-wise
accuracy_results = list(zip(
    (
        "Naive Bayes (Basic)",
        "SVM (Basic)",
        "Naive Bayes (Lemmatization)",
        "SVM (Lemmatization)",
        "Naive Bayes (Stemming)",
        "SVM (Stemming)",
    ),
    (
        nb_basic_acc,
        svm_basic_acc,
        nb_lemmatized_acc,
        svm_lemmatized_acc,
        nb_stemmed_acc,
        svm_stemmed_acc,
    ),
    (
        nb_basic_time,
        svm_basic_time,
        nb_lemmatized_time,
        svm_lemmatized_time,
        nb_stemmed_time,
        svm_stemmed_time,
    ),
))

table_columns = ['Model', 'Accuracy', 'Training Time (s)']
accuracy_df = pd.DataFrame(accuracy_results, columns=table_columns)
print("\nModel Accuracy and Training Time Comparison:")
print(accuracy_df)


Model Accuracy and Training Time Comparison:
                         Model  Accuracy  Training Time (s)
0          Naive Bayes (Basic)  0.413051           0.758855
1                  SVM (Basic)  0.760848         435.702291
2  Naive Bayes (Lemmatization)  0.433587           0.644122
3          SVM (Lemmatization)  0.770123         303.832131
4       Naive Bayes (Stemming)  0.434250           0.719355
5               SVM (Stemming)  0.776416         282.498932


### Conclusion & Improvement

1. I compared 3 different preprocessing methods:  
- Basic: Convert to lowercase & word tokenization
- Advanced (Lemmatization): Basic + Stopword removal + Punctuation Removal + Lemmatization
- Advanced (Stemming): Basic + Stopword removal + Punctuation Removal + Porter Stemming
2. I used 2 classification algorithms: Naive Bayes & SVM
3. For both models, applying advanced preprocessing gives better results compared to basic preprocessing
4. Stemming gives slightly better results compared to lemmatization, but the difference is insignificant in this case.
5. Possible improvements to this model include:
a. Remove rare words that don't help classification
b. Use n-grams (e.g. bigrams) in addition to unigrams to improve context awareness
c. Use word embeddings such as word2vec, which capture context better than TF-IDF