# Train model

In [None]:
# NOTE: `pd` and `DATA_DIR` are defined in the imports/config cell further down
# the notebook; that cell has to be executed before this one.
raw_train = pd.read_csv(DATA_DIR / 'train.csv')

# Every column from the third one onward is treated as a label column.
labels = raw_train.columns[2:].tolist()

# Two-column frame: the comment text plus one list of label values per row.
train_df = pd.DataFrame({
    'text': raw_train['comment_text'],
    'labels': raw_train[labels].values.tolist(),
})

train_df.head()

In [None]:
# Load the test comments and their labels, keep only the rows that were
# actually labelled, and build the evaluation frame.
raw_test_comments = pd.read_csv(DATA_DIR / 'test.csv')
raw_test_labels = pd.read_csv(DATA_DIR / 'test_labels.csv')

# Rows whose 'toxic' value is -1 are dropped before merging.
raw_test_labels = raw_test_labels[raw_test_labels['toxic'] != -1]

# Join comments and labels on their shared column(s).
raw_test = raw_test_comments.merge(raw_test_labels)

# Take BOTH columns from the merged frame so every text is paired with the
# labels of the same row. Reading the labels from `raw_test_labels` instead
# only lines up if the merge happens to preserve its row order and length.
test_df = pd.DataFrame()
test_df['text'] = raw_test['comment_text']
test_df['labels'] = raw_test[labels].values.tolist()

test_df.head()

In [None]:
# Options handed to the model through its `args` parameter.
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'num_train_epochs': 3,
}

# DistilBERT multi-label classifier with one output per entry in `labels`.
model = MultiLabelClassificationModel(
    'distilbert', 'distilbert-base-uncased', num_labels=len(labels), args=train_args
)

# Fine-tune on the training frame built above.
model.train_model(train_df)

In [None]:
# Score the fine-tuned model on the labelled test rows.
result, model_outputs, wrong_predictions = model.eval_model(test_df)
print(result, model_outputs, sep='\n')

# Smoke-test the model on a single hand-written sentence.
sample_texts = ['This thing is entirely different from the other thing. ']
predictions, raw_outputs = model.predict(sample_texts)
print(predictions, raw_outputs, sep='\n')

In [None]:
# Predict on every test comment (labelled or not) and write the submission file.
all_test_texts = raw_test_comments['comment_text'].tolist()
predictions, test_outputs = model.predict(all_test_texts)

# Output column order for the CSV: id first, then the six label scores.
submission_columns = ['id', 'toxic','severe_toxic','obscene','threat','insult','identity_hate']
sub_df = (
    pd.DataFrame(test_outputs, columns=labels)
    .assign(id=raw_test_comments['id'])
    [submission_columns]
)
sub_df.to_csv('submission.csv', index=False)

# Predict all data

In [2]:
from pathlib import Path
import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel


# Project root, relative to the notebook's working directory.
ROOT_DIR = Path('../')
# Directory the toxic-comment CSV files are read from.
DATA_DIR = ROOT_DIR.joinpath('data', 'toxic_comments')

In [None]:
import itertools

def grouper_it(n, it):
    """Lazily split an iterable into consecutive chunks of at most ``n`` items.

    Args:
        n: Maximum number of items per chunk.
        it: Any iterable (list, generator, iterator, ...).

    Yields:
        One iterator per chunk. The last chunk may be shorter than ``n``;
        an empty input yields nothing.

    Note:
        Every chunk reads from the same underlying iterator, so a chunk must
        be fully consumed (e.g. ``list(chunk)``) before the next one is
        requested.
    """
    # Obtain a single shared iterator up front. Without this, islice() on a
    # re-iterable input (list, tuple, str) would restart from the first
    # element on every pass and the loop would never terminate.
    source = iter(it)
    while True:
        chunk_it = itertools.islice(source, n)
        try:
            # Pull one element eagerly to detect exhaustion of the source.
            first_el = next(chunk_it)
        except StopIteration:
            return
        # Re-attach the element that was consumed by the exhaustion probe.
        yield itertools.chain((first_el,), chunk_it)

In [None]:
# Number of text files read and classified per batch.
CHUNK_SIZE = 100_000
# Used as the progress-bar total and as the HDF table's expected row count.
TOTAL_ROWS = 8_003_023
# Directory whose entries are read as the texts to classify.
TEXTS_DIR = ROOT_DIR.joinpath('data', 'texts')
# Column names given to the six model outputs.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Re-create the classifier from the 'outputs/' directory
# (presumably where the training run saved its weights — confirm).
model = MultiLabelClassificationModel(
    model_type='distilbert',
    model_name='outputs/',
    num_labels=len(LABELS),
)

In [None]:
from tqdm import tqdm_notebook

# Classify every file under TEXTS_DIR in batches of CHUNK_SIZE and append the
# scores to one HDF5 table ('df') in toxicity_classifications.h5.
#
# The store is opened through a context manager so the file is flushed and
# closed even if a batch fails part-way through; `STORE` stays bound to the
# (closed) store object afterwards.
#
# NOTE: each batch frame carries its own 0-based index, so index values repeat
# across batches in the stored table.
with pd.HDFStore('toxicity_classifications.h5', mode='w') as STORE:
    with tqdm_notebook(total=TOTAL_ROWS) as pbar:
        for chunk in grouper_it(CHUNK_SIZE, TEXTS_DIR.iterdir()):
            pbar.set_description('Loading texts')
            # Materialise the chunk first: grouper_it chunks share one iterator.
            paths = list(chunk)
            texts = [file.read_text() for file in paths]

            pbar.set_description('Making predictions')
            predictions, outputs = model.predict(texts)

            pbar.set_description('Concatenating predictions to dataframe')
            filenames = [file.name for file in paths]
            df = pd.DataFrame(filenames, columns=['filename'])
            df[LABELS] = pd.DataFrame(outputs)

            pbar.set_description('Appending to HDF file')
            STORE.append('df', df, expectedrows=TOTAL_ROWS, index=False)

            # Advance by the number of files actually processed; the final
            # batch is usually smaller than CHUNK_SIZE.
            pbar.update(len(paths))

# Load data

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Open read-only: this section only inspects the results. The default mode
# ('a') would open the file writable and create an empty one if the path
# were wrong.
store = pd.HDFStore('toxicity_classifications.h5', mode='r')

In [6]:
# Read the table stored under the key 'df' into memory.
df = store.get('df')

In [8]:
# Show the rows ordered from highest to lowest 'toxic' score.
df.sort_values('toxic', ascending=False)

Unnamed: 0,filename,toxic,severe_toxic,obscene,threat,insult,identity_hate
50023,0268200-f3ffb0ce62a38c69a22e5f71a78b25d7.txt,0.999969,7.188264e-01,0.999042,9.984918e-03,0.964930,8.156478e-02
41398,0889736-881ff0021b2c637b077a2f8a3043caad.txt,0.999965,7.398418e-01,0.998936,1.684912e-02,0.954415,5.402331e-02
52083,0928499-a5045f73bef5b35d70bdf62997596936.txt,0.999962,6.638936e-01,0.998649,6.553145e-03,0.975948,2.995005e-01
86371,0994084-522158f188c00b021667c11ce23f5e17.txt,0.999961,6.740288e-01,0.998990,4.759024e-03,0.934879,2.318956e-02
72119,0672031-2e34d061fc3c5e59d8730115bf3ef6b5.txt,0.999959,7.773548e-01,0.998670,9.143099e-03,0.968365,8.965641e-02
...,...,...,...,...,...,...,...
84252,0250312-493662392ffc27a3b4ce1dbf4bf25ef7.txt,0.000069,1.842586e-07,0.000039,1.819052e-07,0.000021,6.871662e-07
24974,0541954-4d98290dc30119365ef44728e1f1c7a3.txt,0.000069,1.912773e-07,0.000040,1.839583e-07,0.000022,6.929150e-07
68984,0218730-595620768bd5238e5b1fa98c50d3c925.txt,0.000069,1.771524e-07,0.000039,1.788764e-07,0.000021,6.764629e-07
98059,0491329-b4b2006a1f439b69013027618bdb1f4b.txt,0.000069,1.779267e-07,0.000039,1.788297e-07,0.000021,6.761495e-07
