## Duplicate document classifier

### Imports

In [2]:
import numpy as np
import pandas as pd
import re
from nltk import download
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import torch
import torch.nn as nn
import sys

sys.path.append("..")

from modules.clean_data import CleanData
from modules.document_vectorizer import DocumentVectorizer
from modules.enums import VectorizerType

download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Global constants and variables

In [3]:
# Relative path of the CSV that holds the article texts and their labels.
DATASET_PATH = "data/df_file.csv"
# Seed handed to every train/test split so the partitions are repeatable.
RANDOM_SEED = 100
# 50-character rule printed at the end of each evaluation report.
LINE_DIVIDER = "".ljust(50, "=")
# Integer class label -> topic name (not referenced by the cells visible here).
LABEL_MAPPER = dict(
    enumerate(("Politics", "Sport", "Technology", "Entertainment", "Business"))
)

### Read dataset

In [4]:
# Load the labelled articles; the preview in the next cell shows a "Text" and a "Label" column.
dataset_df = pd.read_csv(DATASET_PATH)

In [5]:
# Bare expression so the notebook renders the frame (the display elides the middle rows).
dataset_df

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0
...,...,...
2220,India opens skies to competition\n \n India wi...,4
2221,Yukos bankruptcy 'not US matter'\n \n Russian ...,4
2222,Survey confirms property slowdown\n \n Governm...,4
2223,High fuel prices hit BA's profits\n \n British...,4


### Define training data

In [6]:
# Single-column feature frame: the raw article text, exposed under the name "document".
X = pd.DataFrame(data={"document": dataset_df.loc[:, "Text"]})
# Target vector: the integer label stored alongside each article.
y = dataset_df.loc[:, "Label"]

In [7]:
# Pass every raw document through the project's CleanData.run helper.
# NOTE(review): judging by the preview below it lowercases the text and drops
# stop words / punctuation -- confirm against modules/clean_data.
X_clean = X["document"].apply(CleanData.run)
X_clean.head()

0    budget set scene election gordon brown seek pu...
1    army chiefs regiments decision military chiefs...
2    howard denies split cards michael howard denie...
3    observers monitor election ministers invite in...
4    kilroy names election seat target exchat show ...
Name: document, dtype: object

### Vectorize documents
 - Count Vectorizer => "Bag of Words"

In [8]:
# DocumentVectorizer with its defaults; the heading above labels this the count / bag-of-words variant.
vectorizer_bow = DocumentVectorizer()
# The fit_transform result is densified with .toarray() (presumably it is a sparse
# matrix -- confirm in modules/document_vectorizer) so it can be sliced and turned into tensors later.
X_vectors_bow = vectorizer_bow.fit_transform(X_clean).toarray()

In [9]:
# Sanity check: the first 20 vocabulary columns of document 0.
X_vectors_bow[0, :20]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [10]:
# First five labels, shown for comparison with the vectorized documents.
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

##### Document-Term-Matrix => Vocabulary

In [11]:
# document_term_matrix is an attribute of the fitted DocumentVectorizer; the rendered
# table has one row per document and one column per vocabulary term.
vectorizer_bow.document_term_matrix.head()

Unnamed: 0,aaas,abandoned,abc,ability,able,abn,abortion,about,above,abroad,...,youre,youth,youve,yuan,yugansk,yuganskneftegas,yukos,yushchenko,zealand,zone
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Partition dataset (BOW)

In [12]:
# 80/20 shuffled hold-out split of the bag-of-words matrix, seeded for repeatability.
X_train_bow, X_test_bow, y_train, y_test = train_test_split(
    X_vectors_bow,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)

### Convert the features & labels into tensors

In [13]:
# Features become float32 tensors; labels keep the integer dtype of the pandas Series.
features_train = torch.tensor(X_train_bow, dtype=torch.float32)
features_test = torch.tensor(X_test_bow, dtype=torch.float32)
labels_train = torch.tensor(y_train.to_numpy())
labels_test = torch.tensor(y_test.to_numpy())

### Classify documents using "bag of words" (i.e. count vectorizer)

In [14]:
def classify_documents(
        features_train,
        labels_train,
        features_test,
        labels_test
        ):
    """
    Classify documents using Logistic Regression and print evaluation metrics.

    A fresh ``LogisticRegression`` is fitted on the training split and used to
    predict the test split. The function then prints, in order: the per-class
    classification report, a blank gap, the confusion matrix and a divider line.

    The rows of the report and of the confusion matrix are pinned to the sorted
    union of the classes found in ``labels_test`` and in the predictions, so the
    ``topic <label>`` names always line up with the right row, even when the
    model predicts a class that does not occur in the test labels.

    :param features_train: Training feature set
    :param labels_train: Training labels
    :param features_test: Testing feature set
    :param labels_test: Testing labels
    :return: None
    """
    model = LogisticRegression()
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)

    # Sorted union of true and predicted classes. Building the names from an
    # unordered set of the test labels alone can mislabel rows, or make
    # classification_report raise when the counts differ.
    class_labels = np.unique(
        np.concatenate(
            [np.asarray(labels_test).ravel(), np.asarray(predictions).ravel()]
        )
    )
    target_names = [f"topic {label}" for label in class_labels.tolist()]

    # The same explicit label order is given to both metrics so their rows agree.
    confusion_matrix_score = confusion_matrix(
        labels_test, predictions, labels=class_labels
    )
    print(
        classification_report(
            labels_test,
            predictions,
            labels=class_labels,
            target_names=target_names,
        )
    )
    print("\n\n")
    print(f"Confusion Matrix:\n{confusion_matrix_score}")
    print(f"\n{LINE_DIVIDER}")

In [15]:
# Evaluate logistic regression on the bag-of-words tensors built above.
print("Bag-of-Words Document Classification Results")
classify_documents(
    features_train=features_train,
    labels_train=labels_train,
    features_test=features_test,
    labels_test=labels_test,
)

Bag-of-Words Document Classification Results
              precision    recall  f1-score   support

     topic 0       0.99      0.96      0.98        84
     topic 1       0.98      0.99      0.99       100
     topic 2       1.00      0.91      0.95        91
     topic 3       0.96      1.00      0.98        64
     topic 4       0.95      1.00      0.97       106

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97       445
weighted avg       0.97      0.97      0.97       445




Confusion Matrix:
[[ 81   1   0   0   2]
 [  1  99   0   0   0]
 [  0   1  83   3   4]
 [  0   0   0  64   0]
 [  0   0   0   0 106]]



### Vectorize documents 
 - TF-IDF (Term Frequency–Inverse Document Frequency)

In [16]:
# Same project wrapper as the bag-of-words run, switched to TF-IDF via VectorizerType.TFIDF.
vectorizer_tfidf = DocumentVectorizer(vectorizer_type=VectorizerType.TFIDF)
# Densify the fit_transform result so it can be sliced and split like the count matrix.
X_vectors_tfidf = vectorizer_tfidf.fit_transform(X_clean).toarray()


In [17]:
# Sanity check: the first 20 vocabulary columns of document 0, now as TF-IDF weights.
X_vectors_tfidf[0, :20]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02992667, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.04099694, 0.        , 0.        , 0.        ])

In [18]:
# First five labels again; y is not modified between the two vectorization runs.
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

In [19]:
# Document-term matrix of the TF-IDF vectorizer; the rendered columns match the bag-of-words table.
vectorizer_tfidf.document_term_matrix.head()

Unnamed: 0,aaas,abandoned,abc,ability,able,abn,abortion,about,above,abroad,...,youre,youth,youve,yuan,yugansk,yuganskneftegas,yukos,yushchenko,zealand,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029927,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.032113,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.039394,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Partition dataset (TF-IDF)

In [20]:
# 80/20 shuffled hold-out split of the TF-IDF matrix.
# NOTE: this rebinds y_train / y_test; with the same y, seed and test_size the
# label split is expected to match the bag-of-words one.
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_vectors_tfidf,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [21]:
# Evaluate logistic regression on the TF-IDF arrays (passed as NumPy / pandas objects, not tensors).
print("TF-IDF Document Classification Results")
classify_documents(
    features_train=X_train_tfidf,
    labels_train=y_train,
    features_test=X_test_tfidf,
    labels_test=y_test,
)

TF-IDF Document Classification Results
              precision    recall  f1-score   support

     topic 0       1.00      0.96      0.98        84
     topic 1       0.98      1.00      0.99       100
     topic 2       0.98      0.97      0.97        91
     topic 3       0.97      0.98      0.98        64
     topic 4       0.98      0.99      0.99       106

    accuracy                           0.98       445
   macro avg       0.98      0.98      0.98       445
weighted avg       0.98      0.98      0.98       445




Confusion Matrix:
[[ 81   1   0   0   2]
 [  0 100   0   0   0]
 [  0   1  88   2   0]
 [  0   0   1  63   0]
 [  0   0   1   0 105]]

