In [1]:
import fastText
from script import load_data
import pandas as pd
import subprocess
import numpy as np
import os
import re
from sklearn.metrics import auc, roc_auc_score

hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)


In [17]:
def cleaned(content):
    # First remove inline JavaScript/CSS:
    cleaned_content = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", content)
    # Then remove html comments.
    cleaned_content = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned_content)
    # Next remove the remaining tags:
    cleaned_content = re.sub(r"(?s)<.*?>", " ", cleaned_content)
    # Finally deal with whitespace
    cleaned_content = re.sub(r"&nbsp;", " ", cleaned_content)
    cleaned_content = re.sub(r"^$", "", cleaned_content)
    cleaned_content = re.sub("''|,", "", cleaned_content)
    cleaned_content = re.sub(r" {2}", " ", cleaned_content)
    cleaned_content = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", cleaned_content)
    cleaned_content = re.sub(r"\'s", " 's", cleaned_content)
    cleaned_content = re.sub(r"\'m", " 'm", cleaned_content)
    cleaned_content = re.sub(r"\'ve", " 've", cleaned_content)
    cleaned_content = re.sub(r"n\'t", " n't", cleaned_content)
    cleaned_content = re.sub(r"\'re", " 're", cleaned_content)
    cleaned_content = re.sub(r"\'d", " 'd", cleaned_content)
    cleaned_content = re.sub(r"\'ll", " 'll", cleaned_content)
    cleaned_content = re.sub(r",", " , ", cleaned_content)
    cleaned_content = re.sub(r"!", " ! ", cleaned_content)
    cleaned_content = re.sub(r"\(", " ( ", cleaned_content)
    cleaned_content = re.sub(r"\)", " ) ", cleaned_content)
    cleaned_content = re.sub(r"\?", " ? ", cleaned_content)
    cleaned_content = re.sub(r"\s{2,}", " ", cleaned_content)
    cleaned_content = re.sub(r"\d+", "", cleaned_content)
    cleaned_content = re.sub(r"[\r\n]+", " ", cleaned_content)
    cleaned_content = re.sub(r'^(https|http)?://.*[\r\n]*', '', cleaned_content)
    return cleaned_content.strip()


def to_fast_text_format(labels):
    def _to_fast_text_format(item):
        prepended = ["__label__{}".format(label) for label in labels if item[label] == 1]
        if len(prepended) == 0:
            prepended.append('__label__none')
        return " ".join(prepended)+ " " + str(item.comment_text)
        
    return _to_fast_text_format

def load_and_save_ft_format(input_csv_file, output_ft_file):
    labels = [
        'toxic',
        'obscene',
        'insult',
        'identity_hate',
        'severe_toxic',
        'threat'
    ]
    with open(input_csv_file, 'r') as csvfile:
        df = pd.read_csv(input_csv_file)
        out = df.apply(to_fast_text_format(labels), axis=1)
        
    with open(output_ft_file, 'w+') as ftfile:
        for _, text in out.iteritems():
            ftfile.write(text)
            ftfile.write('\n')
    return len(out)

def eval_model(input_file, ft, labels):
    with open(input_file, 'r') as f:
        ys = []
        y_preds = []
        for line in f.readlines():
            sent = line.split()
            sample_labels, sample = sent[:7], sent[7:]
            sample = " ".join(sample)
            y = [1 if label in sample_labels else 0 for label in labels]
            ys.append(y)
            _, pred = ft.predict(sample, k=7)
            y_preds.append(pred)
        y_preds = np.array(y_preds)
        ys = np.array(ys, dtype=np.float32)
        y_preds = y_preds[:, 1:]
        ys = ys[:, 1:]
        print(input_file, "auc: ", roc_auc_score(ys, y_preds, average="micro"))

def load_and_preprocess_data(data_type):
    df = pd.read_csv(f'data/{data_type}.csv')
    df['comment_text'].fillna("unknown", inplace=True)
    df['comment_text'] = df['comment_text'].apply(lambda x: cleaned(x).lower())
    df.to_csv(f'data/{data_type}.preprocessed.csv', index=False)
    if data_type == 'train':
        lino = load_and_save_ft_format(f'data/{data_type}.preprocessed.csv', f'data/{data_type}.fasttext.txt')
        os.system(f'gshuf -o data/shuffled_{data_type}.txt data/{data_type}.fasttext.txt')
        train_part = int(lino * 0.9)
        dev_part = lino - train_part
        os.system(f'head -n {train_part} data/shuffled_{data_type}.txt > data/fast_text_train')
        os.system(f'tail -n {dev_part} data/shuffled_{data_type}.txt > data/fast_text_dev')
    return df            

In [18]:
df = load_and_preprocess_data('train')

In [40]:
labels = [
    '__label__none',
    '__label__toxic',
    '__label__obscene',
    '__label__insult',
    '__label__identity_hate',
    '__label__severe_toxic',
    '__label__threat'
]

ft = fastText.train_supervised('data/fast_text_train',
                               epoch=2, 
                               dim=100,
                               lr=1.0,
                               loss="softmax",
                               wordNgrams=2, 
                               verbose=2,
                               minCount=1)
ft.save_model('model.bin')
eval_model('data/fast_text_dev', ft, labels)
eval_model('data/fast_text_train', ft, labels)

data/fast_text_dev auc:  0.8775110019227296
data/fast_text_train auc:  0.8958360848684574


In [41]:
def predictions(model, comments, label_names):
    out = []
    target_order = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
    for row in comments.itertuples():
        mapping, preds = model.predict(row.comment_text, k=len(label_names))
        res = {label: pred for label, pred in zip(mapping, preds)}
        prediction = [row.id]
        for label in target_order:
            ft_label = f'__label__{label}'
            prediction.append(res[ft_label])
        out.append(prediction)
    return out

In [28]:
submit_data = load_and_preprocess_data('test')

In [42]:
out = predictions(ft, submit_data, labels)

In [43]:
submit_df = pd.DataFrame(out, columns=['id','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
submit_df.to_csv('data/submition.csv', index=False)

153164