In [4]:
import pandas as pd

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


In [6]:
train_df = pd.read_json("./data/dataset/v6/train.json")

In [10]:
print(train_df.shape)

(2199, 9)


In [12]:
texts = train_df['section_content'].values
labels_text = train_df['tags'].values

In [15]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_text)

In [18]:
labels

array([ 5,  5, 11, ..., 12, 12, 12])

In [19]:
# Build the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier(loss='log_loss', random_state=42))
])

# Train the model
pipeline.fit(texts, labels)


In [26]:
preds = pipeline.predict(texts)
preds = label_encoder.inverse_transform(preds)
print(classification_report(preds, label_encoder.inverse_transform(labels)))

                                    precision    recall  f1-score   support

                  Additional Liens       0.98      0.98      0.98        60
                 Asset Disposition       0.97      1.00      0.99        34
            Compliance Certificate       0.97      1.00      0.99        69
           Consequences of Default       0.88      0.97      0.92        31
                  Event of Default       1.00      1.00      1.00        59
           Facilities / Instrument       0.92      0.91      0.92       116
                Financial Covenant       1.00      0.98      0.99        63
              Financial Statements       0.99      1.00      0.99        82
                    Governing Laws       1.00      0.96      0.98        52
            Incremental Facilities       0.79      0.93      0.86        29
                     Interest Rate       1.00      0.97      0.99        76
                    Loan Repayment       0.97      1.00      0.98        63
Mandatory P

In [27]:
test_df = pd.read_json("./data/dataset/v6/test.json")
test_texts = test_df['section_content'].values
test_labels = test_df['tags'].values
# test_labels = label_encoder.transform(test_labels)

In [29]:
y_pred = pipeline.predict(test_texts)

# Evaluate the model
print(classification_report(test_labels, label_encoder.inverse_transform(y_pred)))

                                    precision    recall  f1-score   support

                  Additional Liens       1.00      0.83      0.91        18
                 Asset Disposition       1.00      0.83      0.91        12
            Compliance Certificate       1.00      0.67      0.80         3
           Consequences of Default       0.86      0.55      0.67        11
                  Event of Default       1.00      0.92      0.96        13
           Facilities / Instrument       0.90      0.90      0.90        20
                Financial Covenant       0.92      1.00      0.96        12
              Financial Statements       1.00      1.00      1.00         4
                    Governing Laws       0.83      0.91      0.87        11
            Incremental Facilities       1.00      0.83      0.91         6
                     Interest Rate       0.82      0.95      0.88        19
                    Loan Repayment       1.00      0.53      0.70        15
Mandatory P