In [7]:
from read_data import read_data

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import make_pipeline

In [8]:
# Names of the target label columns.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

train_df, test_df, test_labels_df = read_data("data/")

# Drop test rows whose 'toxic' label is the -1 sentinel. The mask is built once
# from the labels frame and applied to both the comments and the labels, so the
# two filtered frames keep exactly the same rows.
is_labeled = test_labels_df['toxic'].ne(-1)
test_df_usable = test_df.loc[is_labeled]
test_labels_df_usable = test_labels_df.loc[is_labeled]

In [9]:
# Model inputs are the 'comment_text' column; targets are the label_cols columns.
# Training split:
train_X, train_y = train_df['comment_text'], train_df[label_cols]
# Test split (only the rows kept after filtering out -1 labels):
test_X, test_y = test_df_usable['comment_text'], test_labels_df_usable[label_cols]

Chain order: insult → obscene → toxic → severe_toxic → identity_hate → threat  
(positions in `label_cols`: 4, 2, 0, 1, 5, 3)

In [10]:
# TF-IDF features -> ClassifierChain of logistic regressions.
# With chain_method="predict_proba", each link in the chain is given the TF-IDF
# features plus the predicted probabilities of the labels earlier in the chain.
base = LogisticRegression(max_iter=2000, class_weight="balanced")

# Spell the chain order out by label name and derive the integer positions from
# label_cols, so the order cannot silently drift if label_cols is ever reordered.
chain_labels = ['insult', 'obscene', 'toxic', 'severe_toxic', 'identity_hate', 'threat']
order = [label_cols.index(name) for name in chain_labels]  # -> [4, 2, 0, 1, 5, 3]

chain = ClassifierChain(
    estimator=base,
    order=order,
    cv=5,  # earlier-link outputs used as training features come from 5-fold CV predictions
    chain_method="predict_proba",
)

pipe = make_pipeline(
    TfidfVectorizer(max_features=50000, ngram_range=(1, 2), stop_words="english"),
    chain,
)

pipe.fit(train_X, train_y)

# Per-label probabilities, shape: (n_samples, n_labels).
# NOTE: the columns follow the ORIGINAL label order (i.e. label_cols), not the
# chain order -- ClassifierChain maps its outputs back to the column order of
# the y it was fitted on. Index Y_proba with label_cols positions, e.g.
# Y_proba[:, label_cols.index('insult')].
Y_proba = pipe.predict_proba(test_X)
assert Y_proba.shape == (len(test_X), len(label_cols)), "unexpected predict_proba shape"