<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_SVM_tf_idf_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification with SVM and additional text preprocessing

This notebook implements an SVM model to classify documents from the Victor dataset. The dataset used here was preprocessed beforehand with stop-word removal, NER, and lemmatization.

Original source code: https://github.com/peluz/VICTOR-dataset/blob/master/shallow_clf_docType.ipynb

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; force_remount=True mounts it again
# even when a previous mount is still present.
drive.mount('/content/gdrive', force_remount=True)
# Base folder inside the mounted drive; other paths are built by appending to it.
root_dir = '/content/gdrive/My Drive/'

Mounted at /content/gdrive


### Loading dataset

In [None]:
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'

# Read the three zipped CSV splits in order (train, validation, test) and
# discard every row that contains a missing value.
train, valid, test = (
  pd.read_csv(dataset_dir + archive_name, compression='zip').dropna()
  for archive_name in (
    "TRAIN-tag_stop_words_False-lemmatize_True.zip",
    "VALIDATION-tag_stop_words_False-lemmatize_True.zip",
    "TEST-tag_stop_words_False-lemmatize_True.zip",
  )
)

In [None]:
def strip_trash(df, column="body"):
  """Trim wrapping `{`, `}` and `"` characters from a text column.

  Only characters at the start and end of each value are removed; the
  interior of the text is left alone.

  Args:
    df: DataFrame holding the column to clean. It is modified in place.
    column: Name of the string column to clean. Defaults to "body".

  Returns:
    The same DataFrame object that was passed in, with the column cleaned.
  """
  trimmed = df[column].str.strip('{}"')
  df[column] = trimmed
  return df

# Clean the "body" column of every split. strip_trash edits the frame it
# receives and returns that same frame, so each *_clean name refers to the
# same object as its raw counterpart.
train_clean, valid_clean, test_clean = map(strip_trash, (train, valid, test))
len(train)

149214

In [None]:
# Drop the raw-split names; the data stays reachable through the *_clean names.
del train, valid, test

In [None]:
# Inputs are the cleaned document text; targets are the document type labels.
x_train = train_clean["body"].values
y_train = train_clean["document_type"].values

x_valid = valid_clean["body"].values
y_valid = valid_clean["document_type"].values

x_test = test_clean["body"].values
y_test = test_clean["document_type"].values

# Sanity check: every x/y pair must report the same length.
tuple(len(part) for part in (x_train, y_train, x_valid, y_valid, x_test, y_test))

(149214, 149214, 94733, 94733, 95524, 95524)

In [None]:
# Only the extracted x_*/y_* arrays are used from here on; drop the frame names.
del train_clean, valid_clean, test_clean

### Training model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# TF-IDF features feeding a linear SVM. Wrapping both in one Pipeline means the
# vectorizer is fitted together with the classifier and the same fitted
# vocabulary is reused at prediction time.
# The steps are passed as a list, which is the documented type for `steps`.
pipe_svm = Pipeline([
  ("vectorizer",
   TfidfVectorizer(
       ngram_range=(1, 2),  # unigrams and bigrams
       sublinear_tf=True,   # replace tf with 1 + log(tf)
       min_df=2,            # ignore terms found in fewer than 2 documents
       max_df=0.5)),        # ignore terms found in more than half of the documents
  ("clf",
   LinearSVC(
       verbose=2,
       class_weight="balanced",  # weight classes inversely to their frequency
       # The dual solver shuffles the data with a pseudo random generator;
       # fixing the seed makes repeated runs produce the same model.
       random_state=42)),
])

In [None]:
%%time

# Fit the vectorizer and the classifier on the training split only; the
# validation and test splits are not passed here.
pipe_svm.fit(x_train, y_train)

[LibLinear]CPU times: user 2min 24s, sys: 984 ms, total: 2min 25s
Wall time: 2min 26s




Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=2, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight='balanced', dual=True,
                           fit_intercept=True, intercept_scaling=1,
     

### Evaluation

In [None]:
# Run the fitted pipeline over every split, in the order train, test, validation.
preds_train, preds_test, preds_valid = (
  pipe_svm.predict(documents) for documents in (x_train, x_test, x_valid)
)

In [None]:
from sklearn.metrics import classification_report

def _build_report(y_true, y_pred):
  """Return the text classification report for one split.

  `labels` is pinned to the classes the pipeline was trained on so that the
  rows always line up with `target_names`. Without it, scikit-learn derives
  the label set from the data and raises ValueError when a class is missing
  from both `y_true` and `y_pred`, because the name count no longer matches.

  Args:
    y_true: Ground-truth document types for the split.
    y_pred: Document types predicted by `pipe_svm` for the same split.

  Returns:
    The report as a string, with metrics shown to 4 decimal digits.
  """
  return classification_report(
      y_true,
      y_pred,
      digits=4,
      labels=pipe_svm.classes_,
      target_names=pipe_svm.classes_)


test_report = _build_report(y_test, preds_test)
valid_report = _build_report(y_valid, preds_valid)
train_report = _build_report(y_train, preds_train)

print('Train\n', train_report)
print('Validation\n', valid_report)
print('Test\n', test_report)

Train
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.7855    1.0000    0.8799       553
agravo_em_recurso_extraordinario     0.6997    0.9800    0.8164      2546
     despacho_de_admissibilidade     0.6758    1.0000    0.8065       346
                          outros     0.9986    0.9743    0.9863    134131
                   peticao_do_RE     0.8263    0.9622    0.8891      9509
                        sentenca     0.8552    0.9986    0.9213      2129

                        accuracy                         0.9741    149214
                       macro avg     0.8068    0.9859    0.8833    149214
                    weighted avg     0.9789    0.9741    0.9755    149214

Validation
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.7729    0.8194    0.7955       299
agravo_em_recurso_extraordinario     0.5546    0.6831    0.6122      2149
     despacho_d