In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '8'

import json







In [None]:
def __flatten_child(trees):
    """Flatten a list of comment nodes (and all of their replies) into records.

    Args:
        trees: list of node dicts. A node that has an "insult" key is treated
            as labelled and must also provide "id" and "text"; a node may hold
            its replies under "children".

    Returns:
        A list of ``{"id", "text", "insult"}`` dicts with "insult" cast to
        int. The labelled nodes of the current level come first, followed by
        the flattened replies of each node in order.
    """
    # First pass: the labelled nodes of this level, in their original order.
    records = [
        {"id": node["id"], "text": node["text"], "insult": int(node["insult"])}
        for node in trees
        if "insult" in node
    ]
    # Second pass: descend into the replies of every node, labelled or not.
    for node in trees:
        if "children" not in node:
            continue
        records += __flatten_child(node["children"])
    return records

def __flatten(trees):
    """Flatten whole discussions into one list of labelled comment records.

    Args:
        trees: list of discussion dicts, each holding its top node under the
            "root" key.

    Returns:
        The concatenation, in discussion order, of the records produced by
        ``__flatten_child`` for each root's "children". The root node itself
        is never emitted, and roots without "children" contribute nothing.
    """
    return [
        record
        for discussion in trees
        if "children" in discussion["root"]
        for record in __flatten_child(discussion["root"]["children"])
    ]

def __split_corp_label(labeled_discussions):
    """Split flattened discussions into parallel text and label lists.

    Args:
        labeled_discussions: list of discussion dicts as accepted by
            ``__flatten``.

    Returns:
        ``(corpus, labels)`` where ``corpus`` holds ``{"id", "text"}`` dicts
        and ``labels`` holds ``{"id", "insult"}`` dicts (insult cast to int),
        both in the order the records are produced by ``__flatten``.
    """
    labelled = [rec for rec in __flatten(labeled_discussions) if "insult" in rec]
    corpus = [{"id": rec["id"], "text": rec["text"]} for rec in labelled]
    labels = [{"id": rec["id"], "insult": int(rec["insult"])} for rec in labelled]
    return corpus, labels

# Pre-trained fastText vectors; this path is loaded further down with gensim's
# load_facebook_model. The plain-text alternative that was used before is
# 'C:/fasttext/cc.ru.300.vec'.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
EMBEDDING_FILE = 'C:/fasttext/fasttext300.bin'

# Read both labelled discussion dumps. The context managers make sure the file
# handles are closed as soon as the JSON has been parsed.
with open("discussions_tpc_2015/modis/discussions.json", encoding="utf8") as discussions_file:
    test_data = json.load(discussions_file)
with open("discussions_tpc_2015/students/discussions.json", encoding="utf8") as discussions_file:
    train_data = json.load(discussions_file)

# One row per labelled comment, with the columns id / text / insult.
train = pd.DataFrame.from_records(__flatten(train_data))
test = pd.DataFrame.from_records(__flatten(test_data))



In [None]:
import re
import logging
import itertools
import unicodedata
# import contractions

from bs4 import BeautifulSoup

class TextCleaningUtils:
    '''
        Stateless text-cleaning helpers, all exposed as static methods.

        The ``remove_*`` helpers that are backed by ``cleaning_regex_map``
        replace every match of their pattern with a single space and strip
        leading/trailing whitespace from the result.
    '''

    cleaning_regex_map = {
        'web_links': r'(?i)(?:(?:http(?:s)?:)|(?:www\.))\S+',
        'special_chars': r'[^a-zA-Zа-яА-ЯёЁ0-9\s\.,!?;:]+',
        'redundant_spaces': r'\s\s+',
        # A character class needs no alternation; the former '[\r|\n|\r\n]+'
        # also matched (and therefore removed) a literal '|'.
        'redundant_newlines': r'[\r\n]+',
        'twitter_handles': r'[#@]\S+',
        'punctuations': r'[\.,!?;:]+'
    }

    # Matches a run of two or more identical signs, capturing the sign.
    _repeated_sign_regex = re.compile(r'([.?!#$/@*()+])\1+')

    # (sign, replacement) pairs applied in order by replace_sign.
    _sign_replacements = (('&', ' and '), ('/', ' or '), ('\xa0', ' '))

    @staticmethod
    def clean_text_from_regex(text, text_clean_regex):
        '''
            Replace every match of the compiled pattern ``text_clean_regex``
            in ``text`` with a single space and strip the result.
        '''

        return text_clean_regex.sub(' ', text).strip()

    @staticmethod
    def _clean_with_named_regex(text, regex_name):
        '''
            Apply the ``cleaning_regex_map`` pattern stored under
            ``regex_name`` to ``text`` via ``clean_text_from_regex``.
        '''

        # The pattern is read from the map on every call so that changes to
        # cleaning_regex_map keep taking effect; re caches compiled patterns.
        named_regex = re.compile(TextCleaningUtils.cleaning_regex_map[regex_name])
        return TextCleaningUtils.clean_text_from_regex(text, named_regex)

    @staticmethod
    def strip_html(text):
        '''
            Parse ``text`` as HTML and return only its text content.
        '''

        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    @staticmethod
    def remove_special_chars(text):
        '''
            Replace every run of characters other than Latin/Cyrillic
            letters, digits, whitespace and ``. , ! ? ; :`` with a space.
        '''

        return TextCleaningUtils._clean_with_named_regex(text, 'special_chars')

    @staticmethod
    def remove_redundant_spaces(text):
        '''
            Replace every run of two or more whitespace characters with a
            single space.
        '''

        return TextCleaningUtils._clean_with_named_regex(text, 'redundant_spaces')

    @staticmethod
    def remove_web_links(text):
        '''
            Replace web links (tokens starting with ``http:``, ``https:`` or
            ``www.``, case-insensitive) with a space.
        '''

        return TextCleaningUtils._clean_with_named_regex(text, 'web_links')

    @staticmethod
    def remove_twitter_handles(text):
        '''
            Replace ``@handle`` and ``#hashtag`` tokens with a space.
        '''

        return TextCleaningUtils._clean_with_named_regex(text, 'twitter_handles')

    @staticmethod
    def remove_redundant_newlines(text):
        '''
            Replace every run of carriage returns / line feeds with a space.
        '''

        return TextCleaningUtils._clean_with_named_regex(text, 'redundant_newlines')

    @staticmethod
    def remove_punctuations(text):
        '''
            Replace every run of ``. , ! ? ; :`` with a space.
        '''

        return TextCleaningUtils._clean_with_named_regex(text, 'punctuations')

    @staticmethod
    def remove_exaggerated_words(text):
        '''
            Shorten every run of one repeated character to at most two
            occurrences (e.g. 'coooool!!!' -> 'cool!!'). No word is removed.
        '''

        return ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    @staticmethod
    def replace_multiple_chars(text):
        '''
            Collapse runs of a repeated sign (one of ``. ? ! # $ / @ * ( ) +``)
            into a single occurrence and normalise whitespace to single
            spaces.

            Text containing none of these signs is returned with only its
            whitespace normalised (it used to come back as an empty string).
        '''

        text = TextCleaningUtils._repeated_sign_regex.sub(r'\1', text)
        return ' '.join(text.split())

    @staticmethod
    def replace_sign(text):
        '''
            Replace '&' with ' and ', '/' with ' or ' and non-breaking spaces
            with regular spaces, then normalise whitespace to single spaces.

            Text containing none of these signs is returned with only its
            whitespace normalised (it used to come back as an empty string).
        '''

        for sign, replacement in TextCleaningUtils._sign_replacements:
            text = text.replace(sign, replacement)
        return ' '.join(text.split())

    @staticmethod
    def remove_accented_char(text):
        '''
            Decompose ``text`` (NFD) and drop everything that is not ASCII,
            which turns accented Latin letters into their base letter.

            WARNING: every other non-ASCII character is dropped as well,
            including all Cyrillic letters, so this must not be applied to
            Russian text.
        '''

        text = unicodedata.normalize('NFD', text) \
            .encode('ascii', 'ignore') \
            .decode("utf-8")
        return str(text)

    @staticmethod
    def replace_characters(text, replace_map):
        '''
            Apply the plain substring replacements of ``replace_map``
            (``{old: new}``) to ``text``, in the map's iteration order.
        '''

        for char, replace_val in replace_map.items():
            text = text.replace(char, replace_val)
        return text

In [None]:
def clean_data(df, col_to_clean):
    """Clean the text column ``col_to_clean`` of ``df`` and lower-case it.

    The column is overwritten in place and the same DataFrame is returned.

    Order matters: web links and @/# handles are removed first, while the
    characters that identify them ('/', ':', '.', '@', '#') are still in the
    text; stripping special characters and punctuation first would leave
    nothing for those patterns to match. Whitespace is collapsed last because
    every earlier step substitutes spaces for what it removes.

    Args:
        df: DataFrame holding the text column.
        col_to_clean: name of the column to clean; its values must be strings.

    Returns:
        ``df`` itself, with the cleaned column.
    """
    cleaning_steps = (
        TextCleaningUtils.remove_web_links,
        TextCleaningUtils.remove_twitter_handles,
        TextCleaningUtils.remove_redundant_newlines,
        TextCleaningUtils.remove_special_chars,
        TextCleaningUtils.remove_punctuations,
        TextCleaningUtils.remove_exaggerated_words,
        TextCleaningUtils.remove_redundant_spaces,
    )
    for step in cleaning_steps:
        df[col_to_clean] = df[col_to_clean].apply(step)
    df[col_to_clean] = df[col_to_clean].astype(str)
    df[col_to_clean] = df[col_to_clean].str.lower()
    return df

In [None]:
# Clean the comment text of both splits. clean_data overwrites the 'text'
# column of the frame it is given and returns that same frame.
train = clean_data(train,'text')
test = clean_data(test,'text')

In [None]:

# Raw model inputs and targets as NumPy arrays. Missing texts are replaced by
# the literal string "fillna"; selecting ["insult"] with a list keeps the
# targets two-dimensional, one column wide.
X_train = train["text"].fillna("fillna").values
y_train = train[["insult"]].values
X_test = test["text"].fillna("fillna").values
y_test = test[["insult"]].values

In [None]:
max_features = 30000  # vocabulary cap handed to the tokenizer as num_words
maxlen = 100  # every sequence is padded/truncated to this many tokens
embed_size = 300  # width of one embedding vector

tokenizer = text.Tokenizer(num_words=max_features)
# NOTE(review): the vocabulary is fitted on train AND test texts, so test
# vocabulary leaks into preprocessing - confirm this is intended.
tokenizer.fit_on_texts(list(X_train) + list(X_test))
# NOTE(review): X_train / X_test are rebound from raw texts to integer
# sequences here, so re-running this cell alone would fit the tokenizer on
# sequences instead of texts.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [None]:
%%time
from gensim.models.fasttext import load_facebook_model

# Load the fastText model stored at EMBEDDING_FILE; the embedding-matrix cell
# further down reads word vectors through its `.wv` attribute.
embeddings_index = load_facebook_model(EMBEDDING_FILE)


In [None]:
%%time
def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the remaining fields parsed as float32."""
    return word, np.asarray(arr, dtype='float32')


# This loader parses space-separated text lines ("word v1 v2 ..."). A binary
# fastText model (.bin) cannot be read as UTF-8 text, and parsing it here
# would also replace the gensim model loaded above, so the cell is skipped
# for .bin files.
if not EMBEDDING_FILE.lower().endswith('.bin'):
    with open(EMBEDDING_FILE, encoding="utf8") as vectors_file:
        embeddings_index = dict(
            get_coefs(*line.rstrip().rsplit(' ')) for line in vectors_file
        )
               

In [None]:
%%time


word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding index), so a vocabulary of
# n words needs n + 1 rows.
nb_words = min(max_features, len(word_index) + 1)
# The matrix always has max_features rows so that it matches the
# Embedding(max_features, embed_size) layer it initialises, even when the
# vocabulary is smaller than max_features; rows without a vector stay zero.
embedding_matrix = np.zeros((max_features, embed_size))
# Accept either a gensim model (vectors live under .wv) or a plain
# {word: vector} mapping.
word_vectors = getattr(embeddings_index, "wv", embeddings_index)
for word, i in word_index.items():
    if i >= max_features:
        # Words beyond the cap are never emitted by the tokenizer.
        continue
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

In [None]:
from keras.metrics import Precision, Recall, AUC

def f1(y_true, y_pred):
    """Compute precision, recall and F1 for binary predictions.

    Precision and recall come from fresh Keras ``Precision`` / ``Recall``
    metrics created with their default arguments.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores, as returned by ``model.predict``.

    Returns:
        ``(precision, recall, f1)``. When precision + recall is 0 the F1
        score is reported as 0.0 instead of the nan that 0/0 would produce.
    """
    precision_metric = Precision()
    precision_metric.update_state(y_true, y_pred)
    precision = precision_metric.result().numpy()

    recall_metric = Recall()
    recall_metric.update_state(y_true, y_pred)
    recall = recall_metric.result().numpy()

    denominator = precision + recall
    if denominator == 0:
        # Neither metric found a true positive; avoid dividing by zero.
        return precision, recall, 0.0
    return precision, recall, 2 * ((precision * recall) / denominator)


class F1Evaluation(Callback):
    """Keras callback that prints (precision, recall, F1) on held-out data.

    Args:
        validation_data: ``(X_val, y_val)`` pair to evaluate on. It must
            contain exactly two items; the empty default raises ValueError
            when unpacked.
        interval: evaluate on every epoch whose index is a multiple of this
            value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__; the previous
        # super(Callback, self) started the lookup after Callback and
        # therefore skipped the base-class initialisation.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation scores at the end of every ``interval``-th epoch."""
        # `logs` is unused; None replaces the shared mutable default `{}`.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = f1(self.y_val, y_pred)
            print(score)


def get_model():
    """Build and compile the bidirectional-GRU insult classifier.

    Reads the notebook-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``. The network embeds the token ids (initialised from
    ``embedding_matrix``), applies spatial dropout and a bidirectional GRU,
    concatenates average- and max-pooling over the sequence and ends in a
    single sigmoid unit.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam and the
        Precision / Recall metrics.
    """
    token_ids = Input(shape=(maxlen, ))

    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)
    encoded = Bidirectional(GRU(80, return_sequences=True))(embedded)

    # Summarise the encoded sequence with both its mean and its maximum.
    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probability = Dense(1, activation="sigmoid")(pooled)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[Precision(), Recall()],
    )
    return classifier

In [None]:
# Notebook-level model; this is the instance the final test evaluation uses.
model = get_model()


batch_size = 32
epochs = 6

# Hold out 20% of the training sequences for validation.
# NOTE(review): no random_state is passed, so the split depends on the state
# of NumPy's global RNG at the time this cell runs.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.8)
F1_e = F1Evaluation(validation_data=(X_val, y_val), interval=1)



In [None]:
%%time
# Train on the 80% split; F1_e additionally prints (precision, recall, F1) on
# the held-out 20% after each epoch.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[F1_e], verbose=1)




In [None]:
batch_size = 32
epochs = 4


def cross_val(X, Y):
    """Train and evaluate five fresh models on random 80/20 splits of (X, Y).

    This is repeated random hold-out rather than k-fold cross-validation:
    every round draws a new split, builds a new model with ``get_model`` and
    fits it with the notebook-level ``batch_size`` and ``epochs``. Scores are
    printed by Keras and by the ``F1Evaluation`` callback; nothing is
    returned.

    Args:
        X: padded input sequences.
        Y: labels aligned with ``X``.
    """
    for i in range(5):

        print(i)
        # Split the data that was passed in; the parameters used to be
        # ignored in favour of the global x_train / y_train.
        X_tra, X_val, y_tra, y_val = train_test_split(X, Y, train_size=0.8)
        F1_e = F1Evaluation(validation_data=(X_val, y_val), interval=1)
        model = get_model()
        model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[F1_e], verbose=1)
        

In [None]:
batch_size = 32
epochs = 4


def cross_val(X, Y):
    """Train and evaluate five fresh models on random 80/20 splits of (X, Y).

    This cell defines ``cross_val`` again; being the later definition, it is
    the one that calls further down resolve to. It performs repeated random
    hold-out rather than k-fold cross-validation: every round draws a new
    split, builds a new model with ``get_model`` and fits it with the
    notebook-level ``batch_size`` and ``epochs``. Scores are printed by Keras
    and by the ``F1Evaluation`` callback; nothing is returned.

    Args:
        X: padded input sequences.
        Y: labels aligned with ``X``.
    """
    for i in range(5):

        print(i)
        # Split the data that was passed in; the parameters used to be
        # ignored in favour of the global x_test / y_test, so every call
        # silently trained on the test set.
        X_tra, X_val, y_tra, y_val = train_test_split(X, Y, train_size=0.8)
        F1_e = F1Evaluation(validation_data=(X_val, y_val), interval=1)
        model = get_model()
        model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[F1_e], verbose=1)

In [None]:
%%time
# Hand the padded training sequences and their labels to cross_val.
cross_val(x_train, y_train)

In [None]:
# `model` is the notebook-level model fitted earlier; the models built inside
# cross_val are local to that function. Prints (precision, recall, F1) on the
# test sequences.
y_pred = model.predict(x_test, batch_size=1024)
print(f1(y_test, y_pred))