<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_xgboost_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification with XGBoost 

This notebook trains an XGBoost model on TF-IDF features to classify documents from the VICTOR dataset by document type.

Original source code: https://github.com/peluz/VICTOR-dataset/blob/master/shallow_clf_docType.ipynb

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

In [None]:
# Colab-only setup: mount Google Drive and record the Drive root folder.
# `root_dir` is the prefix used to build the dataset paths further down.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = '/content/gdrive/My Drive/'

Mounted at /content/gdrive


### Loading and preprocessing dataset

In [None]:
# Folder (inside the mounted Drive) that holds the three CSV splits.
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'

# Only the label and the document text are needed; skip every other column.
_wanted_columns = ["document_type", "body"]
_split_files = ("train_small.csv", "validation_small.csv", "test_small.csv")

# Read the splits in the order train, validation, test.
train, valid, test = (
    pd.read_csv(dataset_dir + file_name, usecols=_wanted_columns)
    for file_name in _split_files
)

In [None]:
def strip_trash(df, column="body"):
  """Return a copy of ``df`` with wrapper characters stripped from ``column``.

  Leading and trailing ``{``, ``}`` and ``"`` characters are removed from
  every string in the column; the same characters inside the text are kept.

  The input frame is left untouched, so the cell that calls this function can
  be re-run without the result depending on earlier executions.

  Args:
    df: DataFrame that contains the text column.
    column: Label of the column to clean. Defaults to ``"body"``.

  Returns:
    A new DataFrame with the same columns as ``df`` and the cleaned column.
  """
  # Work on a copy so the caller's frame is not silently modified.
  cleaned = df.copy()
  cleaned[column] = cleaned[column].str.strip('{}"')
  return cleaned

# Clean the text column of every split (train, validation, test — in that order).
train_clean, valid_clean, test_clean = map(strip_trash, (train, valid, test))

# Number of training documents.
len(train)

149217

In [None]:
# Remove the raw-frame names; only the *_clean frames are used from here on.
del train, valid, test

In [None]:
# Features are the document texts, targets are the document types.
x_train = train_clean["body"].values
y_train = train_clean["document_type"].values

x_valid = valid_clean["body"].values
y_valid = valid_clean["document_type"].values

x_test = test_clean["body"].values
y_test = test_clean["document_type"].values

# Sanity check: each x/y pair must have the same length.
tuple(len(arr) for arr in (x_train, y_train, x_valid, y_valid, x_test, y_test))

(149217, 149217, 94735, 94735, 95526, 95526)

In [None]:
# The x_*/y_* arrays are what the model uses; drop the DataFrame names.
del train_clean, valid_clean, test_clean

### Training model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost.sklearn import XGBClassifier

# TF-IDF features feeding an XGBoost classifier with its default settings.
# `steps` is given as a list of (name, estimator) pairs, which is the type
# documented by scikit-learn for Pipeline.
# NOTE(review): newer XGBoost releases may reject non-integer class labels such
# as the string document types used here — confirm against the installed version.
pipe_xgb = Pipeline([
  ("vectorizer",
   TfidfVectorizer(
       ngram_range=(1, 2),  # unigrams and bigrams
       sublinear_tf=True,   # dampen raw term counts: 1 + log(tf)
       min_df=2,            # ignore terms found in fewer than 2 documents
       max_df=0.5)),        # ignore terms found in more than half of the documents
  ("clf", XGBClassifier()),
])

In [None]:
%%time

# Fit the vectorizer and the classifier on the training split only.
# The recorded run of this cell took about 43 minutes of wall time.
pipe_xgb.fit(X=x_train, y=y_train)

CPU times: user 43min 32s, sys: 2.78 s, total: 43min 35s
Wall time: 43min 35s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=2, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_patte...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                        

### Evaluation

In [None]:
# Predict the document type for every split with the fitted pipeline
# (same order as before: train, test, validation).
preds_train, preds_test, preds_valid = (
    pipe_xgb.predict(documents)
    for documents in (x_train, x_test, x_valid)
)

In [None]:
from sklearn.metrics import classification_report


def _split_report(y_true, y_pred):
  """Build the per-class text report for one split.

  `labels` is passed together with `target_names` so each displayed name is
  tied to the class the fitted pipeline learned, rather than relying on every
  class being present in the split being scored.

  Args:
    y_true: Ground-truth document types of the split.
    y_pred: Document types predicted by `pipe_xgb` for the same documents.

  Returns:
    The report string produced by `classification_report` with 4 decimals.
  """
  return classification_report(
      y_true,
      y_pred,
      labels=pipe_xgb.classes_,
      target_names=pipe_xgb.classes_,
      digits=4)


test_report = _split_report(y_test, preds_test)
valid_report = _split_report(y_valid, preds_valid)
train_report = _split_report(y_train, preds_train)

print('Train\n', train_report)
print('Validation\n', valid_report)
print('Test\n', test_report)

Train
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.9381    0.5750    0.7130       553
agravo_em_recurso_extraordinario     0.9803    0.2541    0.4036      2546
     despacho_de_admissibilidade     0.9279    0.5578    0.6968       346
                          outros     0.9362    0.9968    0.9655    134134
                   peticao_do_RE     0.9081    0.4217    0.5759      9509
                        sentenca     0.9832    0.3584    0.5253      2129

                        accuracy                         0.9358    149217
                       macro avg     0.9456    0.5273    0.6467    149217
                    weighted avg     0.9358    0.9358    0.9233    149217

Validation
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.9536    0.6187    0.7505       299
agravo_em_recurso_extraordinario     0.9714    0.1582    0.2721      2149
     despacho_d