In [1]:
import os
import gc
import copy
import time
import random
import pickle
import joblib
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm.notebook import tqdm
tqdm.pandas()
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from scipy import sparse

# ML Imports
from xgboost import XGBRegressor

# NLP Imports
import re
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('wordnet')
import itertools
from string import ascii_lowercase
import fasttext as ft
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors, FastText

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW, AutoConfig

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [2]:
# Master switch: almost every cell below only does work when PRIVATE is True.
PRIVATE = False
# When True (and PRIVATE is True) the validation-pair cells also run.
VAL = False
# Fold count passed to create_folds and used to size the per-fold prediction arrays.
n_folds =7

In [3]:
if PRIVATE:
    def create_folds(data, num_splits):
        """Return a copy of ``data`` with an integer ``fold`` column.

        The continuous target ``data["y"]`` is cut into ``floor(1 + log2(n))``
        equal-width bins, and the bin ids are used as strata for
        ``StratifiedKFold``. A row's ``fold`` is the index of the split in
        which it appears as a validation row.

        Folds are assigned by position, so ``data`` may carry any index
        (filtered, shuffled, non-unique). The caller's frame is not modified.

        Args:
            data: DataFrame with a numeric ``y`` column.
            num_splits: number of folds.

        Returns:
            A new DataFrame with the same rows and index plus ``fold``.
        """
        num_bins = int(np.floor(1 + np.log2(len(data))))
        bins = pd.cut(data["y"], bins=num_bins, labels=False)

        folds = np.full(len(data), -1, dtype=int)
        kf = StratifiedKFold(n_splits=num_splits)
        # kf.split yields positional indices, so write them into a plain array
        # instead of going through label-based .loc.
        for f, (_, v_) in enumerate(kf.split(X=data, y=bins.values)):
            folds[v_] = f

        data = data.copy()
        data["fold"] = folds
        return data

In [4]:
# Validation comment pairs; later cells read the "less_toxic" / "more_toxic" columns.
df_val = pd.read_csv("../input/js-cleaned-validation-data/Validation_data_clean_no_duplicates.csv")
# Comments to score for the submission; later cells read the "text" column.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [5]:
# A 7537-row comments_to_score.csv forces PRIVATE off, so the heavy pipeline is skipped.
# NOTE(review): presumably 7537 is the row count of the public test file — confirm.
if df_sub.shape[0] == 7537:
    PRIVATE = False

In [6]:
# Maps a canonical replacement string (the key) to the list of patterns that
# clean_text rewrites to it. The patterns are written in regular-expression
# syntax; every literal containing a backslash is a raw string so the escapes
# reach the pattern untouched.
RE_PATTERNS = {
    ' american ':
        [
            'amerikan'
        ],

    ' adolf ':
        [
            'adolf'
        ],

    ' hitler ':
        [
            'hitler'
        ],

    ' fuck':
        [
            '(f)(u|[^a-z0-9 ])(c|[^a-z0-9 ])(k|[^a-z0-9 ])([^ ])*',
            '(f)([^a-z]*)(u)([^a-z]*)(c)([^a-z]*)(k)',
            r' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*k', 'f u u c',
            '(f)(c|[^a-z ])(u|[^a-z ])(k)', r'f\*',
            'feck ', ' fux ', r'f\*\*',
            r'f\-ing', r'f\.u\.', 'f###', ' fu ', 'f@ck', 'f u c k', 'f uck', 'f ck', 'fuk', 'wtf', 'fucck', 'f cking'
        ],

    ' ass ':
        [
            # The comma after r'@\$\$' was missing, which silently fused it
            # with '[^a-z]anus' into a single pattern.
            '[^a-z]ass ', '[^a-z]azz ', 'arrse', ' arse ', r'@\$\$',
            '[^a-z]anus', r' a\*s\*s', '[^a-z]ass[^a-z ]',
            r'a[@#\$%\^&\*][@#\$%\^&\*]', '[^a-z]anal ', 'a s s'
        ],

    ' asshole ':
        [
            ' a[s|z]*wipe', 'a[s|z]*[w]*h[o|0]+[l]*e', r'@\$\$hole', 'ass hole'
        ],

    ' bitch ':
        [
            'b[w]*i[t]*ch', 'b!tch',
            r'bi\+ch', r'b!\+ch', '(b)([^a-z]*)(i)([^a-z]*)(t)([^a-z]*)(c)([^a-z]*)(h)',
            'biatch', r'bi\*\*h', 'bytch', 'b i t c h', 'beetch'
        ],

    ' bastard ':
        [
            'ba[s|z]+t[e|a]+rd'
        ],

    ' transgender':
        [
            'transgender', 'trans gender'
        ],

    ' gay ':
        [
            'gay'
        ],

    ' cock ':
        [
            '[^a-z]cock', 'c0ck', '[^a-z]cok ', 'c0k', '[^a-z]cok[^aeiou]', ' cawk',
            '(c)([^a-z ])(o)([^a-z ]*)(c)([^a-z ]*)(k)', 'c o c k'
        ],

    ' dick ':
        [
            ' dick[^aeiou]', 'deek', 'd i c k', 'diick '
        ],

    ' suck ':
        [
            'sucker', '(s)([^a-z ]*)(u)([^a-z ]*)(c)([^a-z ]*)(k)', 'sucks', '5uck', 's u c k'
        ],

    ' cunt ':
        [
            'cunt', 'c u n t'
        ],

    ' bullshit ':
        [
            r'bullsh\*t', r'bull\$hit', 'bs'
        ],

    ' homosexual':
        [
            'homo sexual', 'homosex'
        ],

    ' jerk ':
        [
            'jerk'
        ],

    ' idiot ':
        [
            'i[d]+io[t]+', '(i)([^a-z ]*)(d)([^a-z ]*)(i)([^a-z ]*)(o)([^a-z ]*)(t)', 'idiots', 'i d i o t'
        ],

    ' dumb ':
        [
            '(d)([^a-z ]*)(u)([^a-z ]*)(m)([^a-z ]*)(b)'
        ],

    ' shit ':
        [
            'shitty', '(s)([^a-z ]*)(h)([^a-z ]*)(i)([^a-z ]*)(t)', 'shite', r'\$hit', 's h i t'
        ],

    ' shithole ':
        [
            'shythole', 'shit hole'
        ],

    ' retard ':
        [
            'returd', 'retad', 'retard', 'wiktard', 'wikitud'
        ],

    ' rape ':
        [
            ' raped'
        ],

    ' dumbass':
        [
            'dumb ass', 'dubass'
        ],

    ' asshead':
        [
            'butthead', 'ass head'
        ],

    ' sex ':
        [
            's3x', 'sexuality',
        ],

    ' nigger ':
        [
            'nigger', 'ni[g]+a', ' nigr ', 'negrito', 'niguh', 'n3gr', 'n i g g e r'
        ],

    ' shut the fuck up':
        [
            'stfu'
        ],

    ' pussy ':
        [
            'pussy[^c]', 'pusy', 'pussi[^l]', 'pusses'
        ],

    ' faggot ':
        [
            'faggot', ' fa[g]+[s]*[^a-z ]', 'fagot', 'f a g g o t', 'faggit',
            '(f)([^a-z ]*)(a)([^a-z ]*)([g]+)([^a-z ]*)(o)([^a-z ]*)(t)', 'fau[g]+ot', 'fae[g]+ot',
        ],

    ' motherfucker':
        [
            ' motha ', ' motha f', ' mother f', 'motherucker', 'mother fucker'
        ],

    ' whore ':
        [
            r'wh\*\*\*', 'w h o r e'
        ],
}

In [7]:
#Text Normalization

def clean_text(text, remove_repeat_text=True, remove_patterns_text=True, is_lower=True):
    """Normalise one raw comment.

    Steps, in order: remove URLs, strip HTML with BeautifulSoup, remove
    emoji, replace every character outside ``[a-zA-Z\\d]`` with a space,
    collapse spaces and strip, optionally lowercase, optionally rewrite
    RE_PATTERNS matches to their canonical key, optionally squeeze runs of
    three or more identical characters down to one, then drop digits and
    collapse spaces again.

    Args:
        text: raw comment string.
        remove_repeat_text: squeeze runs of 3+ identical characters.
        remove_patterns_text: apply the RE_PATTERNS substitutions.
        is_lower: lowercase the text before pattern matching.

    Returns:
        The cleaned string (not stripped after the final passes).
    """
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)  # Removes website links

    text = BeautifulSoup(text, 'lxml').get_text()  # Removes HTML tags

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    text = re.sub(r"[^a-zA-Z\d]", " ", text)  # Remove special characters
    text = re.sub(' +', ' ', text)  # Remove extra spaces
    text = text.strip()
    if is_lower:
        text = text.lower()

    if remove_patterns_text:
        for target, patterns in RE_PATTERNS.items():
            for pat in patterns:
                # The patterns are regular expressions; str.replace treated
                # them as literal text, so character classes and groups could
                # never match.
                text = re.sub(pat, target, text)

    if remove_repeat_text:
        text = re.sub(r'(.)\1{2,}', r'\1', text)

    text = str(text).replace("\n", " ")
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub('[0-9]', "", text)
    text = re.sub(" +", " ", text)
    text = re.sub("([^\x00-\x7F])+", " ", text)
    return text

def clean(text):
    """Expand common English contractions and tidy punctuation and repeats.

    Every rule is a regular-expression substitution. The previous
    ``str.replace`` calls matched the patterns as literal text, so rules
    such as ``\\'ve`` or ``[ ]{2,}`` never fired.

    Rules, in order: expand contractions, pad newlines with spaces, space
    out ``word<punct>word``, cap runs of ``* ! ? '`` at three, pad those
    punctuation runs with spaces, shorten trailing letter runs of 3+ to two,
    cap in-word letter runs of 4+ at three, collapse multiple spaces, strip.

    Args:
        text: input string.

    Returns:
        The rewritten string.
    """
    substitutions = (
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        ('\n', ' \n '),
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        (r'([*!?\']+)', r' \1 '),
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        (r'[ ]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [8]:
# Shared WordNet lemmatizer instance used by lemma() below.
lemmatizer= WordNetLemmatizer()
def lemma(text, lemmatization=True):
    """Lemmatize every space-separated token of ``text``.

    Each token is passed through the module-level ``lemmatizer`` four times
    in a row, as noun, verb, adjective and adverb, each pass feeding the
    next. With ``lemmatization=False`` the text is returned unchanged apart
    from stripping.

    Returns:
        The tokens re-joined with single spaces, stripped at both ends.
    """
    if not lemmatization:
        return str(text.strip())

    lemmas = []
    for token in text.split(' '):
        for pos in ("n", "v", "a", "r"):  # noun, verb, adjective, adverb
            token = lemmatizer.lemmatize(token, pos=pos)
        lemmas.append(token)
    return str(" ".join(lemmas).strip())

In [9]:
##Stopwords Removal

stopword_list=[]
def iter_all_strings():
    """Yield 'a', 'b', ..., 'z', 'aa', 'ab', ... without end, shortest first."""
    for size in itertools.count(1):
        for s in itertools.product(ascii_lowercase, repeat=size):
            yield "".join(s)

# Every one- and two-letter lowercase string, 'a' through 'zz'.
dual_alpha_list = list(itertools.takewhile(lambda s: len(s) <= 2, iter_all_strings()))

# Short tokens that are real words and must not be treated as stopwords.
kept_short_words = [
    'i', 'a', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', 'hi',
    'if', 'is', 'in', 'me', 'my', 'no', 'of', 'on', 'or', 'ok', 'so', 'to',
    'up', 'us', 'we',
]
for word in kept_short_words:
    dual_alpha_list.remove(word)

stopword_list.extend(dual_alpha_list)

potential_stopwords=['editor', 'reference', 'thank', 'work','find', 'good', 'know', 'like', 'look', 'thing', 'want', 'time', 'list', 'section','wikipedia', 'doe', 'add','new', 'try', 'think', 'write','use', 'user', 'way', 'page']

# Add the words above to the stopword list (no progress bar needed for 25 items).
stopword_list.extend(potential_stopwords)

def remove_stopwords(text, remove_stop=True):
    """Drop every space-separated token of ``text`` found in ``stopword_list``.

    With ``remove_stop=False`` the text is returned unchanged apart from
    stripping.

    Returns:
        The surviving tokens joined with single spaces, stripped at both ends.
    """
    if not remove_stop:
        return str(text.strip())
    # Built per call so later edits to stopword_list are honoured, while each
    # token lookup is a hash probe instead of a scan of the whole list.
    stopwords_now = set(stopword_list)
    kept = [word for word in text.split(" ") if word not in stopwords_now]
    return str(" ".join(kept).strip())

  0%|          | 0/25 [00:00<?, ?it/s]

In [10]:
if PRIVATE:
    # Jigsaw classification data turned into one severity target: the six
    # binary labels are summed (severe_toxic counted twice) and divided by
    # the largest sum, then the comment text goes through the four cleaning
    # stages in order.
    df_jigsaw = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    df_jigsaw['severe_toxic'] = df_jigsaw.severe_toxic * 2
    df_jigsaw['y'] = (df_jigsaw[label_cols].sum(axis = 1)).astype(int)
    df_jigsaw['y'] = df_jigsaw['y'] / df_jigsaw['y'].max()
    df_jigsaw = df_jigsaw[['comment_text', 'y']].rename(columns = {'comment_text': 'text'})
    for text_step in (clean_text, clean, lemma, remove_stopwords):
        df_jigsaw["text"] = df_jigsaw["text"].progress_apply(text_step)
    df_jigsaw.head()

In [11]:
if PRIVATE and VAL:
    # Push the less_toxic side of each validation pair through the four
    # cleaning stages, in order.
    for text_step in (clean_text, clean, lemma, remove_stopwords):
        df_val["less_toxic"] = df_val["less_toxic"].progress_apply(text_step)

In [12]:
if PRIVATE and VAL:
    # Push the more_toxic side of each validation pair through the four
    # cleaning stages, in order.
    for text_step in (clean_text, clean, lemma, remove_stopwords):
        df_val["more_toxic"] = df_val["more_toxic"].progress_apply(text_step)

In [13]:
if PRIVATE:
    # Push the comments to score through the four cleaning stages, in order.
    for text_step in (clean_text, clean, lemma, remove_stopwords):
        df_sub["text"] = df_sub["text"].progress_apply(text_step)

In [14]:
if PRIVATE:
    # Attach the "fold" column produced by create_folds.
    df_jigsaw = create_folds(df_jigsaw, n_folds)
    df_jigsaw.head()

In [15]:
if PRIVATE:
    # Ruddit comments: keep text and offensiveness score, min-max scale the
    # score into y, then assign folds.
    # NOTE(review): unlike df_jigsaw and df_sub, this text is not passed through
    # clean_text / clean / lemma / remove_stopwords — confirm that is intended.
    df_ruddit = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
    df_ruddit = df_ruddit[['txt', 'offensiveness_score']].rename(columns = {'txt': 'text', 'offensiveness_score': 'y'})
    df_ruddit['y'] = (df_ruddit['y'] - df_ruddit.y.min()) / (df_ruddit.y.max() - df_ruddit.y.min())
    df_ruddit = create_folds(df_ruddit, n_folds)

In [16]:
if PRIVATE:
    # Prediction buffers for the Jigsaw-trained models: one row per
    # df_val / df_sub row, one column per fold model.
    val_preds_jigsaw1 = np.zeros((df_val.shape[0], n_folds))
    val_preds_jigsaw2 = np.zeros((df_val.shape[0], n_folds))
    test_preds_jigsaw = np.zeros((df_sub.shape[0], n_folds))

In [17]:
if PRIVATE:
    # For each fold id, fit a char_wb (3-5 gram) TF-IDF + Ridge pipeline on ONLY
    # the rows whose fold equals fld, then store its predictions in column fld.
    for fld in tqdm(range(n_folds)):
        df = df_jigsaw.loc[df_jigsaw.fold == fld]
        features = FeatureUnion([("vect3", TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5)))])
        pipeline = Pipeline([("features", features), ("clf", Ridge())])
        pipeline.fit(df['text'], df['y'])
        # Buffer "1" receives less_toxic predictions, buffer "2" more_toxic.
        val_preds_jigsaw1[:, fld] = pipeline.predict(df_val['less_toxic'])
        val_preds_jigsaw2[:, fld] = pipeline.predict(df_val['more_toxic'])
        test_preds_jigsaw[:, fld] = pipeline.predict(df_sub['text'])

In [18]:
if PRIVATE:
    # Drop the last fold's training slice and pipeline left over from the loop.
    del df, pipeline
    gc.collect()

In [19]:
if PRIVATE:
    # Prediction buffers for the Ruddit-trained models: one row per
    # df_val / df_sub row, one column per fold model.
    val_preds_ruddit1 = np.zeros((df_val.shape[0], n_folds))
    val_preds_ruddit2 = np.zeros((df_val.shape[0], n_folds))
    test_preds_ruddit = np.zeros((df_sub.shape[0], n_folds))

In [20]:
if PRIVATE:
    # For each fold id, fit a char_wb (3-5 gram) TF-IDF + Ridge pipeline on ONLY
    # the Ruddit rows whose fold equals fld, then store its predictions in column fld.
    for fld in tqdm(range(n_folds)):
        df = df_ruddit.loc[df_ruddit.fold == fld]
        features = FeatureUnion([("vect3", TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5)))])
        pipeline = Pipeline([("features", features), ("clf", Ridge())])
        pipeline.fit(df['text'], df['y'])
        # Buffer "1" receives less_toxic predictions, buffer "2" more_toxic.
        val_preds_ruddit1[:, fld] = pipeline.predict(df_val['less_toxic'])
        val_preds_ruddit2[:, fld] = pipeline.predict(df_val['more_toxic'])
        test_preds_ruddit[:, fld] = pipeline.predict(df_sub['text'])

In [21]:
if PRIVATE:
    # Drop the last fold's training slice and pipeline left over from the loop.
    del df, pipeline
    gc.collect()

In [22]:
if PRIVATE:
    # Average each buffer across fold models:
    # p1 / p2 = Jigsaw models on less_toxic / more_toxic,
    # p3 / p4 = Ruddit models on less_toxic / more_toxic.
    p1 = val_preds_jigsaw1.mean(axis = 1)
    p2 = val_preds_jigsaw2.mean(axis = 1)
    p3 = val_preds_ruddit1.mean(axis = 1)
    p4 = val_preds_ruddit2.mean(axis = 1)
    # p5 = val_preds_arr1c.mean(axis = 1)
    # p6 = val_preds_arr2c.mean(axis = 1)

In [23]:
if PRIVATE:
    # Grid-search the Jigsaw/Ruddit blend: w1 runs over 0.01..0.99 with w2 = 1 - w1.
    # Each entry is (w1, w2, percent of validation pairs where the blended
    # less_toxic score is below the blended more_toxic score).
    wts_acc = []
    for i in range(1, 100, 1):
        w1 = i / 100
        w2 = (100 - i) / 100
        p1_wt = w1 * p1 + w2 * p3
        p2_wt = w1 * p2 + w2 * p4
        wts_acc.append((w1, w2, np.round((p1_wt < p2_wt).mean() * 100, 2)))

In [24]:
if PRIVATE:
    # Take the weight pair with the highest pair accuracy.
    w1, w2, score = sorted(wts_acc, key = lambda x: x[2], reverse = True)[0]
    print(score)

In [25]:
if PRIVATE:
    # Show the selected Jigsaw (w1) and Ruddit (w2) blend weights.
    print(w1)
    print(w2)

In [26]:
if PRIVATE:
    # Recompute the blended validation scores with the selected weights,
    # record per-pair diagnostics on df_val, then blend the fold-averaged test
    # predictions into df_sub['score'] and write submission_tfidf.csv.
    p1_wt = w1 * p1 + w2 * p3
    p2_wt = w1 * p2 + w2 * p4
    df_val['p1'] = p1_wt
    df_val['p2'] = p2_wt
    df_val['diff'] = np.abs(p2_wt - p1_wt)
    # 1 where the less_toxic score is below the more_toxic score.
    df_val['correct'] = (p1_wt < p2_wt).astype('int')
    df_sub['score'] = w1 * test_preds_jigsaw.mean(axis = 1) + w2 * test_preds_ruddit.mean(axis = 1)
    df_sub.to_csv("submission_tfidf.csv", index = None)
    # df_sub['score'].count() - df_sub['score'].nunique()

    # same_score = df_sub['score'].value_counts().reset_index()[:10]
    # df_sub[df_sub['score'].isin(same_score['index'].tolist())]

### *DEEP LEARNING* - Ranking Loss

In [27]:
# Shared settings for the transformer inference section.
# inference_multimodel overwrites "tokenizer", "config_name" and "size" per model.
CONFIG = dict(
    seed = 42,
    config_name = '../input/js-mpnet/mpnet-base/mpnet-base/config.json',
    model_name = '../input/js-mpnet/mpnet-base/mpnet-base',
    test_batch_size = 32,
    max_length = 128,
    num_classes = 1,  # output width of JigsawModel.fc2
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    debug= False,
    size = 768  # input width of JigsawModel.fc1
)

# Tokenizer for the default model_name above.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

In [28]:
# Separate copy of the validation pairs (same file as df_val) for the transformer / FastText sections.
val_df = pd.read_csv("../input/js-cleaned-validation-data/Validation_data_clean_no_duplicates.csv")

In [29]:
# Separate copy of the comments to score (same file as df_sub) for the transformer section.
sub_df2 = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [30]:
def set_seed(seed = 42):
    '''Seed every random number source the notebook uses, for REPRODUCIBILITY.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), switches
    cuDNN to deterministic mode with benchmarking off, and stores the seed
    in the PYTHONHASHSEED environment variable.'''
    # Fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    # The cuDNN backend needs these two options as well
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Seed everything once, using the seed stored in CONFIG.
set_seed(CONFIG['seed'])

In [31]:
# Registry of the transformer ensembles used by inference_multimodel.
# Per entry:
#   model_name  - directory passed to AutoTokenizer.from_pretrained
#   config_name - config file passed to the model class constructor
#   model_paths - one checkpoint per fold (5 folds)
#   version     - selects the architecture in inference():
#                 0 -> JigsawModel, 1 -> JigsawModel2, 2 -> JigsawModel3
#   size        - copied into CONFIG["size"] before the models are built
# NOTE(review): "deberta-base2" loads deberta-v3-base_* checkpoints with the
# deberta-base tokenizer/config — confirm the two are compatible.
model_dict = {
    "roberta-base":{
        "model_name": "../input/js-roberta/roberta-base/roberta-base",
        "config_name": "../input/js-roberta/roberta-base/roberta-base/config.json",
        "model_paths": [f"../input/js-roberta/JS_roberta-base_RL/models/roberta-base_{fold}_model.bin" for fold in range(5)],
        "version": 0,
        "size": 768
    },
    "roberta-base2":{
        "model_name": "../input/js-roberta/roberta-base/roberta-base",
        "config_name": "../input/js-roberta/roberta-base/roberta-base/config.json",
        "model_paths": [f"../input/js-roberta/JS_roBERTa-base-cat-multi/JS_roBERTa-base-cat-multi/roberta-base_toxic_{fold}_model.bin" for fold in range(5)],
        "version": 0,
        "size": 768    
    },
    "deberta-base":{
        "model_name": "../input/js-deberta/deberta-base/deberta-base",
        "config_name": "../input/js-deberta/deberta-base/deberta-base/config.json",
        "model_paths": [f"../input/js-deberta/JS_deberta-base_RL/models/deberta-base_{fold}_model.bin" for fold in range(5)],
        "version": 1,
        "size": 768
    },
    "deberta-base2":{
        "model_name": "../input/js-deberta/deberta-base/deberta-base",
        "config_name": "../input/js-deberta/deberta-base/deberta-base/config.json",
        "model_paths": [f"../input/js-deberta/JS_deberta-base-multi-cat/JS_deberta-base-multi-cat/deberta-v3-base_toxic_{fold}_model.bin" for fold in range(5)],
        "version": 2,
        "size": 768
    },
    "mpnet-base":{
        "model_name": "../input/js-mpnet/mpnet-base/mpnet-base",
        "config_name": "../input/js-mpnet/mpnet-base/mpnet-base/config.json",
        "model_paths": [f"../input/js-mpnet/JS_mpnet-base_RL (1)/models/mpnet-base_{fold}_model.bin" for fold in range(5)],
        "version": 0,
        "size": 768
    },
    "mpnetv2-base":{
        "model_name": "../input/js-mpnet/all-mpnet-base-v2/all-mpnet-base-v2",
        "config_name": "../input/js-mpnet/all-mpnet-base-v2/all-mpnet-base-v2/config.json",
        "model_paths": [f"../input/js-mpnet/JS_mpnetv2-base_RL/models/all-mpnet-base-v2_{fold}_model.bin" for fold in range(5)],
        "version": 0,
        "size": 768
    },
    "mpnetv2-base2":{
        "model_name": "../input/js-mpnet/all-mpnet-base-v2/all-mpnet-base-v2",
        "config_name": "../input/js-mpnet/all-mpnet-base-v2/all-mpnet-base-v2/config.json",
        "model_paths": [f"../input/js-mpnet/JS_p0 (2)/content/models/all-mpnet-base-v2_toxic_{fold}_model.bin" for fold in range(5)],
        "version": 0,
        "size": 768
    },
    "albert-base":{
        "model_name": "../input/js-albert/albert-base-v2/albert-base-v2",
        "config_name": "../input/js-albert/albert-base-v2/albert-base-v2/config.json",
        "model_paths": [f"../input/js-albert/JS_albert-base_RL/models/albert-base-v2_{fold}_model.bin" for fold in range(5)],
        "version": 0,
        "size": 768
    },    
}

In [32]:
class JigsawDataset(Dataset):
    """Dataset that tokenises one text column of a DataFrame on access.

    Each item is a dict with ``'ids'`` and ``'mask'`` long tensors, built
    from the tokenizer's ``input_ids`` and ``attention_mask`` for the text
    at that position, truncated and padded to ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length, col):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.text = self.df[col].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        field_map = (('ids', 'input_ids'), ('mask', 'attention_mask'))
        return {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in field_map
        }

In [33]:
class JigsawModel(nn.Module):
    """Transformer backbone followed by dropout and two stacked linear layers.

    The backbone is built from a config file only (no pretrained weights
    are loaded here). The head widths come from the global ``CONFIG``
    (``size`` -> 256 -> ``num_classes``).
    """

    def __init__(self, config_name):
        super().__init__()
        backbone_config = AutoConfig.from_pretrained(config_name)
        self.model = AutoModel.from_config(config = backbone_config)
        self.drop = nn.Dropout(p=0.2)
        self.fc1 = nn.Linear(CONFIG['size'],256)
        self.fc2 = nn.Linear(256,CONFIG['num_classes'])

    def forward(self, ids, mask):
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the head
        # (presumably the pooled output — confirm per backbone).
        pooled = self.drop(backbone_out[1])
        hidden = self.fc1(pooled)
        return self.fc2(hidden)

In [34]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    A score ``V(tanh(W(features)))`` is computed per position, softmaxed
    along dim 1, and used to form a weighted sum of ``features`` along
    that same dimension.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

class JigsawModel2(nn.Module):
    """Backbone -> AttentionHead pooling -> dropout -> single linear output.

    ``config_path`` and ``pretrained`` are accepted but not used; the
    backbone is built from the config found at ``model_name``.
    """

    def __init__(self, model_name, config_path=None, pretrained=False):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.model = AutoModel.from_config(config=self.config)
        hidden = self.config.hidden_size
        self.head = AttentionHead(hidden, hidden)
        self.dropout = nn.Dropout(0.4)
        self.linear = nn.Linear(hidden, 1)

    def forward(self, ids, mask):
        # Element 0 of the backbone output is what the attention head pools.
        sequence_out = self.model(input_ids=ids, attention_mask=mask)[0]
        pooled = self.dropout(self.head(sequence_out))
        return self.linear(pooled)

class JigsawModel3(nn.Module):
    """Backbone -> AttentionHead pooling -> dropout -> fc1 -> sigmoid(fc2).

    ``config_path`` and ``pretrained`` are accepted but not used; the
    backbone is built from the config found at ``model_name``.
    """

    def __init__(self, model_name, config_path=None, pretrained=False):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.model = AutoModel.from_config(config=self.config)
        hidden = self.config.hidden_size
        self.head = AttentionHead(hidden, hidden)
        self.dropout = nn.Dropout(0.4)
        self.fc1 = nn.Linear(hidden, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, ids, mask):
        # Element 0 of the backbone output is what the attention head pools.
        sequence_out = self.model(input_ids=ids, attention_mask=mask)[0]
        pooled = self.dropout(self.head(sequence_out))
        logits = self.fc2(self.fc1(pooled))
        return torch.sigmoid(logits)

In [35]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and return its scores.

    Each batch's ``'ids'`` and ``'mask'`` tensors are moved to ``device``;
    the model output is passed through a sigmoid, flattened, and collected.

    NOTE(review): JigsawModel3.forward already applies a sigmoid, so its
    outputs are squashed twice here — confirm that is intended.

    Returns:
        1-D NumPy array with one score per sample, in loader order.
    """
    model.eval()

    batch_scores = []
    for batch in tqdm(dataloader, total=len(dataloader)):
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)

        scores = torch.sigmoid(model(ids, mask)).view(-1)
        batch_scores.append(scores.cpu().detach().numpy())

    all_scores = np.concatenate(batch_scores)
    gc.collect()

    return all_scores

In [36]:
def inference(model_paths, dataloader, device, version):
    """Average the predictions of several checkpoints of one architecture.

    For every path in ``model_paths`` a fresh model is built from
    ``CONFIG['config_name']``, the checkpoint is loaded, and ``valid_fn``
    scores ``dataloader``; the per-checkpoint score vectors are averaged.

    Args:
        model_paths: checkpoint files (state dicts), one per fold.
        dataloader: loader yielding ``'ids'`` / ``'mask'`` batches.
        device: device used for the model, the checkpoint and the batches.
        version: 0 -> JigsawModel, 1 -> JigsawModel2, 2 -> JigsawModel3.

    Returns:
        1-D NumPy array of mean scores, one per sample.

    Raises:
        ValueError: if ``version`` is not 0, 1 or 2 (previously this
            surfaced as an unbound ``model`` variable).
    """
    model_classes = {0: JigsawModel, 1: JigsawModel2, 2: JigsawModel3}
    if version not in model_classes:
        raise ValueError(f"Unknown model version {version!r}; expected one of {sorted(model_classes)}")
    model_cls = model_classes[version]

    final_preds = []
    for i, path in enumerate(model_paths):
        model = model_cls(CONFIG['config_name'])
        # Use the `device` argument throughout: valid_fn sends the batches to
        # `device`, so the model must live there too, not on CONFIG['device'].
        model.to(device)
        model.load_state_dict(torch.load(path, map_location = device))

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)
        # Release this fold's weights before the next model is built.
        del model

    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    return final_preds

In [37]:
def inference_multimodel(model_dict, device, col = 'text', df = sub_df2):
    """Score ``df[col]`` with every model family in ``model_dict``.

    For each entry the global ``CONFIG`` is pointed at that family's
    tokenizer, config file and hidden size, a loader over ``df[col]`` is
    built, and ``inference`` averages the family's fold checkpoints.

    Returns:
        DataFrame with one column of scores per key of ``model_dict``.
    """
    pred = pd.DataFrame()
    for key, item in model_dict.items():
        print(key)
        # The model classes and the dataset read these from CONFIG.
        CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(item['model_name'])
        CONFIG["config_name"] = item["config_name"]
        CONFIG["size"] = item["size"]

        dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'], col = col)
        loader = DataLoader(
            dataset,
            batch_size=CONFIG['test_batch_size'],
            num_workers=2,
            shuffle=False,
            pin_memory=True,
        )
        pred[key] = inference(item["model_paths"], loader, device, item["version"])
    return pred

In [38]:
if PRIVATE:
    if VAL:
        # Score both sides of every validation pair with all transformer families.
        preds_more_toxic_ = inference_multimodel(model_dict, CONFIG['device'], col = "more_toxic", df = val_df)
        preds_less_toxic_ = inference_multimodel(model_dict, CONFIG['device'], col = "less_toxic", df = val_df)
        # Here p1 is the more_toxic mean and p2 the less_toxic mean across families.
        p1 = preds_more_toxic_[list(model_dict.keys())].mean(axis = 1)
        p2 = preds_less_toxic_[list(model_dict.keys())].mean(axis = 1)
        # Percent of pairs where less_toxic scores below more_toxic.
        print(f"Accuracy: {np.round((p2 < p1).mean() * 100, 2)}")

In [39]:
if PRIVATE:
    # Score the comments to submit with every transformer family.
    preds = inference_multimodel(model_dict, CONFIG['device'], col = "text", df = sub_df2)

In [40]:
if PRIVATE:
    # Unweighted mean across the model families.
    preds["score"] = preds[list(model_dict.keys())].mean(axis = 1)

In [41]:
if PRIVATE:
    # Attach the ensemble score and write the transformer-only submission file.
    sub_df2["score"] = preds["score"]
    sub_df2.to_csv("submission_DL.csv", index=None)

### *FASTTEXT*

In [42]:
# Ensemble size and alpha step for the FastText section; both equal the
# RidgeEnsemble constructor defaults.
N_MODELS = 4
# Width of the embedding part appended to the TF-IDF features when reshaping.
EXTRA_DIM = 256
ALPHA_STEP_SIZE = 0.5

In [43]:
def text_cleaning_fasttext(text):
    """Light cleaning used by the FastText section.

    Removes URLs, strips HTML with BeautifulSoup, removes emoji, replaces
    every character outside ``[a-zA-Z\\d]`` with a space, collapses runs of
    spaces and strips the ends. Case is left untouched.
    """
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)
    text = BeautifulSoup(text, 'lxml').get_text()

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags = re.UNICODE,
    )
    text = emoji_pattern.sub(r'', text)

    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    return re.sub(' +', ' ', text).strip()

In [44]:
if PRIVATE:
    # Training data for the FastText + TF-IDF Ridge ensemble; rows with any missing value are dropped.
    df = pd.read_csv('../input/jigsaw-regression-based-data/train_data_version3.csv')
    df = df.dropna(axis = 0)

In [45]:
if PRIVATE:
    # Apply the light FastText cleaning to the training text.
    df["text"] = df["text"].progress_apply(text_cleaning_fasttext)

In [46]:
if PRIVATE:
    # Char n-gram TF-IDF vocabulary (capped at 46000 features), fitted on the training text;
    # vectorizer() reads this global.
    vec = TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5), max_features = 46000)
    vec.fit(df['text'])

In [47]:
if PRIVATE:
    # Saved gensim FastText model; vectorizer() reads its word vectors via fmodel.wv.
    fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')

In [48]:
def splitter(text):
    """Split ``text`` on single spaces (empty strings between consecutive spaces are kept)."""
    return text.split(' ')

def vectorizer(text):
    """Build the feature row for one comment.

    Concatenates the TF-IDF vector from the global ``vec`` with the mean of
    the global ``fmodel`` word vectors of the comment's tokens, and casts
    the result to float16.

    Returns:
        Array with a leading dimension of 1.
    """
    tokens = splitter(text)
    tfidf_part = vec.transform([text]).toarray()
    embedding_part = np.mean(fmodel.wv[tokens], axis = 0).reshape(1, -1)
    return np.concatenate([tfidf_part, embedding_part], axis = -1).astype(np.float16)

In [49]:
if PRIVATE:
    # Featurise every training comment, flatten to (rows, tfidf vocab + EXTRA_DIM),
    # and keep only the sparse copy.
    X_np = np.array([vectorizer(text) for text in df.text]).reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
    X = sparse.csr_matrix(X_np)
    del X_np

In [50]:
class RidgeEnsemble():
    """Average of several Ridge regressors that differ only in ``alpha``.

    Model ``k`` (1-based) uses ``alpha = alpha_step_size * k``.
    """

    def __init__(self, n_models = 4, alpha_step_size = 0.5):
        alphas = [alpha_step_size * k for k in range(1, n_models + 1)]
        self.models = [Ridge(alpha = alpha) for alpha in alphas]

    def fit(self, X, y):
        """Fit every member on the same data. Returns None."""
        fitted = []
        for model in self.models:
            fitted.append(model.fit(X, y))
        self.models = fitted

    def predict(self, X):
        """Return the element-wise mean of the members' predictions."""
        member_preds = np.stack([model.predict(X) for model in self.models], axis = 0)
        return np.mean(member_preds, axis = 0)

In [51]:
if PRIVATE:
    # Build the ensemble from the section constants instead of relying on
    # RidgeEnsemble's defaults, so editing N_MODELS / ALPHA_STEP_SIZE takes effect.
    model = RidgeEnsemble(n_models = N_MODELS, alpha_step_size = ALPHA_STEP_SIZE)
    model.fit(X, df['y'])

In [52]:
if PRIVATE and VAL:
    # Reload the validation pairs, clean both sides with the FastText
    # cleaner, featurise them like the training text, and report how often
    # the ensemble ranks less_toxic below more_toxic.
    val_df = pd.read_csv("../input/js-cleaned-validation-data/Validation_data_clean_no_duplicates.csv")

    val_df["more_toxic"] = val_df["more_toxic"].progress_apply(text_cleaning_fasttext)
    val_df["less_toxic"] = val_df["less_toxic"].progress_apply(text_cleaning_fasttext)

    X_less_toxic_temp = np.array([vectorizer(text) for text in val_df.less_toxic])
    X_less_toxic_temp = X_less_toxic_temp.reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
    X_less_toxic = sparse.csr_matrix(X_less_toxic_temp)
    del X_less_toxic_temp

    X_more_toxic_temp = np.array([vectorizer(text) for text in val_df.more_toxic])
    X_more_toxic_temp = X_more_toxic_temp.reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
    X_more_toxic = sparse.csr_matrix(X_more_toxic_temp)
    del X_more_toxic_temp

    preds_more_toxic = model.predict(X_more_toxic)
    preds_less_toxic = model.predict(X_less_toxic)

    print(f"Accuracy: {np.round((preds_less_toxic < preds_more_toxic).mean() * 100, 2)}")

In [53]:
if PRIVATE:
    # Clean and featurise the comments to score, predict with the Ridge
    # ensemble, and write the FastText-section submission file.
    df_sub3 = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
    df_sub3['text'] = df_sub3['text'].progress_apply(text_cleaning_fasttext)
    X_sub_temp = []
    for text in df_sub3.text:
        X_sub_temp.append(vectorizer(text))
    # EXTRA_DIM rather than a literal 256 keeps this width in step with the
    # training and validation reshapes.
    X_sub_temp = np.array(X_sub_temp).reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
    X_test = sparse.csr_matrix(X_sub_temp)
    del X_sub_temp

    df_sub3['score'] = model.predict(X_test)
    df_sub3[['comment_id', 'score']].to_csv("submission_fasttext.csv", index = False)

In [54]:
if PRIVATE:
    if VAL:
        # Validation scores of the three sub-models, more_toxic / less_toxic:
        # TF-IDF blend, transformer mean, FastText ridge ensemble.
        tfidf_more, tfidf_less = p2_wt, p1_wt
        dl_more = preds_more_toxic_[list(model_dict.keys())].mean(axis = 1)
        dl_less = preds_less_toxic_[list(model_dict.keys())].mean(axis = 1)
        ft_more, ft_less = preds_more_toxic, preds_less_toxic

        # Search every (a, b, c) on a 1% grid with a + b + c = 1 and all three
        # weights positive. The previous inner loop started at i, so weightings
        # with the second weight below the first were never tried.
        # Loop-local names are used so p1_wt / p2_wt / w1 / w2 from the TF-IDF
        # cells are not overwritten and this cell can be re-run safely.
        wts_acc_ = []
        for i in range(1, 99):
            for j in range(1, 100 - i):
                wa = i / 100
                wb = j / 100
                wc = (100 - i - j) / 100
                blend_more = wa * tfidf_more + wb * dl_more + wc * ft_more
                blend_less = wa * tfidf_less + wb * dl_less + wc * ft_less
                wts_acc_.append((wa, wb, wc, np.round((blend_less < blend_more).mean() * 100, 2)))
        # First entry with the highest pair accuracy (same tie-break as a
        # stable descending sort).
        w1_, w2_, w3_, score = max(wts_acc_, key = lambda x: x[3])

In [55]:
# Hand-set final blend weights (TF-IDF, transformers, FastText).
# This cell runs unconditionally, so it replaces any weights found by the VAL search cell.
w1_ = 0.7
w2_ = 0.15
w3_ = 0.15

In [56]:
if PRIVATE:
    # Reload the three per-section submission files written earlier.
    sub1 = pd.read_csv("./submission_tfidf.csv")
    sub2 = pd.read_csv("./submission_DL.csv")
    sub3 = pd.read_csv("./submission_fasttext.csv")

In [57]:
if PRIVATE:
    # Final score = weighted sum of the TF-IDF, transformer and FastText scores,
    # combined row by row (all three files are assumed to share row order — confirm).
    sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
    sub["score"] = sub1["score"]*w1_ + sub2["score"]*w2_ + sub3["score"]*w3_
    sub[["comment_id", "score"]].to_csv("submission.csv", index = None)

In [58]:
if not PRIVATE:
    # Pipeline skipped: write a placeholder submission with every score set to 0.
    sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
    sub["score"] = 0
    sub[["comment_id", "score"]].to_csv("submission.csv", index = None)