<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_SVM_tf_idf_SMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification with SVM and input oversampling

This notebook implements an SVM model to classify documents from the Victor dataset. Because the dataset is highly imbalanced, the SMOTE algorithm is applied to oversample the minority classes.

Original source code: https://github.com/peluz/VICTOR-dataset/blob/master/shallow_clf_docType.ipynb

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; force_remount refreshes any mount
# left over from a previous session.
drive.mount(mountpoint='/content/gdrive', force_remount=True)

# Base folder under which the dataset directory is resolved.
root_dir = '/content/gdrive/My Drive/'

Mounted at /content/gdrive


### Loading and pre-processing the dataset

In [None]:
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'

# Read the three splits in the same order (train, validation, test), keeping
# only the label column and the document text.
train, valid, test = (
    pd.read_csv(dataset_dir + csv_name, usecols=["document_type", "body"])
    for csv_name in ("train_small.csv", "validation_small.csv", "test_small.csv")
)

In [None]:
def strip_trash(df, column="body"):
  """Return a new frame with wrapper characters stripped from ``column``.

  Leading and trailing ``{``, ``}`` and ``"`` characters are removed from
  every string in ``column``. The input frame is not modified, so the raw
  and cleaned frames stay distinct objects and re-running the cell is safe.

  Args:
    df: DataFrame holding the text column to clean.
    column: Name of the string column to strip (defaults to ``"body"``).

  Returns:
    A new DataFrame with the same columns as ``df`` in which ``column``
    has been stripped.

  Raises:
    KeyError: If ``column`` is not present in ``df``.
  """
  # assign() builds a new frame instead of writing into the caller's one.
  return df.assign(**{column: df[column].str.strip('{}"')})

# Clean the text column of every split (train, validation, test — in that order).
train_clean, valid_clean, test_clean = (
    strip_trash(split) for split in (train, valid, test)
)

# Show the number of training documents as the cell output.
len(train)

149217

In [None]:
# Drop the names of the raw frames; only the *_clean names are used from here on.
del train, valid, test

### Text vectorization

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2), 
    sublinear_tf=True,
    min_df=2, 
    max_df=0.5)
tfidf.fit(train_clean['body'])

CPU times: user 1min 3s, sys: 1.46 s, total: 1min 4s
Wall time: 1min 5s


In [None]:
%%time

x_train = tfidf.transform(train_clean['body'])
y_train = train_clean["document_type"].values
del(train_clean)

x_test = tfidf.transform(test_clean['body'])
y_test = test_clean["document_type"].values
del(test_clean)

x_valid = tfidf.transform(valid_clean['body'])
y_valid = valid_clean["document_type"].values
del(valid_clean)

TypeError: ignored

### Dataset oversampling

In [None]:
%%time
from imblearn.over_sampling import SMOTE

strategy = {
    'acordao_de_2_instancia': 2000, 
    'despacho_de_admissibilidade': 2000
}
x_train_resampled, y_train_resampled = SMOTE(sampling_strategy=strategy).fit_resample(x_train, y_train)



CPU times: user 35 s, sys: 4.26 s, total: 39.3 s
Wall time: 39.4 s


### Training model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Linear SVM with class weights inversely proportional to class frequencies.
# The dual solver (dual=True in the captured repr) is randomised, so
# random_state is fixed to make the trained model reproducible between runs.
svm = LinearSVC(
    verbose=2, 
    class_weight="balanced",
    random_state=42)

In [None]:
%%time

svm.fit(x_train_resampled, y_train_resampled)

[LibLinear]CPU times: user 1min 46s, sys: 92 ms, total: 1min 46s
Wall time: 1min 47s


LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=2)

### Evaluation

In [None]:
# Predict on the original (non-resampled) train split and on the held-out
# splits, in the order train, test, validation.
preds_train, preds_test, preds_valid = (
    svm.predict(features) for features in (x_train, x_test, x_valid)
)

In [None]:
from sklearn.metrics import classification_report


def _split_report(y_true, y_pred):
    """Build the text classification report for one data split.

    ``labels`` is passed explicitly together with ``target_names`` so the
    rows always correspond to the classes the model was trained on. Without
    ``labels``, scikit-learn infers the label set from ``y_true``/``y_pred``
    and raises ``ValueError`` when a split lacks one of the classes, because
    the number of inferred labels no longer matches ``target_names``.

    Args:
        y_true: Ground-truth labels of the split.
        y_pred: Labels predicted by ``svm`` for the same split.

    Returns:
        The report as a formatted string with 4-digit precision.
    """
    return classification_report(
        y_true,
        y_pred,
        labels=svm.classes_,
        target_names=svm.classes_,
        digits=4)


test_report = _split_report(y_test, preds_test)
valid_report = _split_report(y_valid, preds_valid)
train_report = _split_report(y_train, preds_train)

# print() is used on purpose: the reports are pre-formatted multi-line strings.
print('Train\n', train_report)
print('Validation\n', valid_report)
print('Test\n', test_report)

Train
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.7702    1.0000    0.8702       553
agravo_em_recurso_extraordinario     0.7000    0.9788    0.8162      2546
     despacho_de_admissibilidade     0.6541    1.0000    0.7909       346
                          outros     0.9985    0.9732    0.9857    134134
                   peticao_do_RE     0.8223    0.9648    0.8879      9509
                        sentenca     0.8427    0.9991    0.9142      2129

                        accuracy                         0.9733    149217
                       macro avg     0.7980    0.9860    0.8775    149217
                    weighted avg     0.9783    0.9733    0.9747    149217

Validation
                                   precision    recall  f1-score   support

          acordao_de_2_instancia     0.7477    0.8328    0.7880       299
agravo_em_recurso_extraordinario     0.5757    0.6901    0.6277      2149
     despacho_d