<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_SVM_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification with SVM

This notebook trains a linear SVM on TF-IDF features to classify documents from the VICTOR dataset by document type.

Original source code: https://github.com/peluz/VICTOR-dataset/blob/master/shallow_clf_docType.ipynb

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm


In [None]:
from google.colab import drive

# Base folder of the mounted Google Drive; dataset paths are built from it.
root_dir = '/content/gdrive/My Drive/'
# (Re)mount Drive inside the Colab VM so the files under root_dir are reachable.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


### Loading and pre-processing the dataset

In [None]:
# Folder (inside the mounted Drive) that holds the "small" CSV splits.
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'

# Read only the label column and the text column from each split.
train, valid, test = (
    pd.read_csv(dataset_dir + csv_name, usecols=["document_type", "body"])
    for csv_name in ("train_small.csv", "validation_small.csv", "test_small.csv")
)

In [None]:
def strip_trash(df: pd.DataFrame, column="body") -> pd.DataFrame:
  """Return a copy of ``df`` with wrapper characters stripped from a text column.

  Leading and trailing ``{``, ``}`` and ``"`` characters are removed from every
  value of ``column``; characters inside the text are left untouched.

  Args:
    df: DataFrame holding the text column. It is not modified.
    column: Label of the string column to clean.

  Returns:
    A new DataFrame with the same columns as ``df`` and ``column`` cleaned.
  """
  # Work on a copy so the caller's frame keeps its raw text and re-running the
  # cell never changes the input.
  cleaned = df.copy()
  cleaned[column] = cleaned[column].str.strip('{}"')
  return cleaned

# Clean the text column of every split; the last line displays the training-set size.
train_clean, valid_clean, test_clean = (
    strip_trash(split) for split in (train, valid, test)
)
len(train)

149217

In [None]:
# Drop the raw-frame names; only the *_clean references are used from here on.
del train, valid, test

In [None]:
# Split each cleaned frame into inputs (document text) and targets (document type).
x_train, y_train = (train_clean[col].values for col in ("body", "document_type"))
x_valid, y_valid = (valid_clean[col].values for col in ("body", "document_type"))
x_test, y_test = (test_clean[col].values for col in ("body", "document_type"))

# Display the sizes so inputs and targets of each split can be compared at a glance.
tuple(len(arr) for arr in (x_train, y_train, x_valid, y_valid, x_test, y_test))

(149217, 149217, 94735, 94735, 95526, 95526)

In [None]:
# The arrays extracted above are all that is needed from here on; drop the frame names.
del train_clean, valid_clean, test_clean

### Text vectorization and SVM training

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# TF-IDF features feeding a linear SVM, bundled so fit/predict take raw text.
# `steps` is given as a list, which is the type documented for Pipeline.
pipe_svm = Pipeline([
  ("vectorizer",
   TfidfVectorizer(
       ngram_range=(1, 2),  # unigrams and bigrams
       sublinear_tf=True,   # use 1 + log(tf) instead of the raw term count
       min_df=2,            # ignore terms found in fewer than 2 documents
       max_df=0.5)),        # ignore terms found in more than half of the documents
  ("clf",
   LinearSVC(
       verbose=2,
       class_weight="balanced",  # weight classes inversely to their frequency
       random_state=42)),        # fix the solver's RNG so re-runs give the same model
])

In [None]:
%%time

pipe_svm.fit(x_train, y_train)

[LibLinear]CPU times: user 3min 12s, sys: 1.21 s, total: 3min 13s
Wall time: 3min 14s




Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=2, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight='balanced', dual=True,
                           fit_intercept=True, intercept_scaling=1,
     

### Evaluation

In [None]:
# Predict the document type for every split with the fitted pipeline.
preds_train, preds_valid, preds_test = (
    pipe_svm.predict(docs) for docs in (x_train, x_valid, x_test)
)

In [None]:
from sklearn.metrics import classification_report

def _split_report(y_true, y_pred):
  """Build the per-class text report (4 decimal digits) for one split.

  ``labels`` is passed explicitly so the rows always follow the classifier's
  class order. Without it, ``target_names`` is matched positionally against
  the labels that happen to occur in the split, which misaligns or mismatches
  the names when a split does not contain every class.

  Args:
    y_true: Ground-truth document types of the split.
    y_pred: Document types predicted by ``pipe_svm`` for the same split.

  Returns:
    The report as a string.
  """
  return classification_report(
      y_true,
      y_pred,
      labels=pipe_svm.classes_,
      target_names=pipe_svm.classes_,
      digits=4)

train_report = _split_report(y_train, preds_train)
valid_report = _split_report(y_valid, preds_valid)
test_report = _split_report(y_test, preds_test)

print('Train\n', train_report)
print('Validation\n', valid_report)
print('Test\n', test_report)

Train
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.7889    1.0000    0.8820       553
agravo_em_recurso_extraordinario     0.6994    0.9788    0.8158      2546
     despacho_de_admissibilidade     0.6732    1.0000    0.8047       346
                          outros     0.9985    0.9734    0.9858    134134
                   peticao_do_RE     0.8225    0.9646    0.8879      9509
                        sentenca     0.8434    0.9991    0.9146      2129

                        accuracy                         0.9735    149217
                       macro avg     0.8043    0.9860    0.8818    149217
                    weighted avg     0.9784    0.9735    0.9748    149217

Validation
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.7778    0.8194    0.7980       299
agravo_em_recurso_extraordinario     0.5750    0.6901    0.6273      2149
     despacho_d