In [1]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
# Run-wide settings. The code visible in this notebook reads "seed",
# "valid_batch_size", "max_length", "device", "model_name" (for the tokenizer)
# and the "tokenizer" entry added below. The remaining optimiser / scheduler /
# fold keys are not referenced here — presumably carried over from the
# training notebook; confirm before removing them.
CONFIG = {
    "seed": 2021,
    "epochs": 3,
    "model_name": "../input/roberta-base",
    "valid_batch_size": 16,
    "max_length": 128,
    "learning_rate": 2e-5,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    "n_fold": 5,
    "n_accumulate": 1,
    "num_classes": 1,
    "margin": 0.5,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

# The tokenizer is loaded from the local directory named by CONFIG['model_name'].
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

In [3]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook can draw from.

    Covers Python's ``random`` module, NumPy's global generator, and PyTorch
    on the CPU and on all CUDA devices, and switches cuDNN to deterministic
    kernels so repeated runs give the same results.

    Args:
        seed: integer seed applied to every generator (default 42).
    '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU, not only the current one, and is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Hash randomisation is fixed at interpreter start-up, so this only takes
    # effect for Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed all generators once, up front, with the value configured in CONFIG.
set_seed(CONFIG['seed'])

In [4]:
# Comments to be scored; the preview below shows the `comment_id` and `text` columns.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


In [5]:
import joblib

# Punctuation / symbol characters. `sentence_fetures` counts a whitespace-split
# token as punctuation only when the whole token equals one of these entries.
# NOTE(review): '\n' and '\r' can never equal a token produced by str.split(),
# so those two entries have no effect on that count.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', '$', '&',
          '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›', 
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', 
          '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯',
          '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔',
          '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '\n', '\r']

# Embedding lookup; in the visible code it is only used for membership tests
# when counting out-of-vocabulary tokens.
# NOTE: joblib.load unpickles the file, so it must come from a trusted source.
with open('../input/reducing-oov-of-crawl300d2m-no-appos-result/jigsaw-crawl-300d-2M.joblib', 'rb') as emb_file:
    crawl_emb_dict = joblib.load(emb_file)

# One raw line of the profanity list per entry, newline still attached.
with open('../input/googleprofanitywords/google-profanity-words/profanity.js', 'r') as profanity_file:
    p_words = profanity_file.readlines()

# Sets give constant-time membership checks in the per-token feature counts.
set_puncts = set(puncts)
p_word_set = {line.replace('\n', '') for line in p_words}

In [6]:
def sentence_fetures(text):
    """Compute token-level count features for one comment.

    The text is split on whitespace and every count is taken over the
    resulting tokens. Membership checks read the module-level ``set_puncts``,
    ``p_word_set`` and ``crawl_emb_dict``.

    Args:
        text: the comment as a string.

    Returns:
        Tuple of ints in the order of ``sentence_feature_cols``:
        (word_count, n_upper, n_unique, n_ex, n_que, n_puncts, n_prof, n_oov).
    """
    tokens = text.split()

    def count_if(predicate):
        # Number of tokens for which `predicate` holds.
        return sum(1 for token in tokens if predicate(token))

    return (
        len(tokens),
        # tokens containing at least one upper-case character
        count_if(lambda token: any(ch.isupper() for ch in token)),
        len(set(tokens)),
        # only stand-alone '!' / '?' tokens are counted
        tokens.count('!'),
        tokens.count('?'),
        count_if(lambda token: token in set_puncts),
        count_if(lambda token: token in p_word_set),
        # tokens missing from the embedding lookup
        count_if(lambda token: token not in crawl_emb_dict),
    )


# Column names for the tuple returned by sentence_fetures, in the same order.
sentence_feature_cols = ['word_count', 'n_upper', 'n_unique', 'n_ex', 'n_que', 'n_puncts', 'n_prof', 'n_oov']

In [7]:
from collections import defaultdict

# Per-comment token statistics, accumulated as one list per feature column.
feature_dict = defaultdict(list)

for text in tqdm(df['text']):
    # sentence_fetures returns its values in the order of sentence_feature_cols,
    # so zip pairs every value with its column name.
    for feature_name, feature_value in zip(sentence_feature_cols, sentence_fetures(text)):
        feature_dict[feature_name].append(feature_value)

sentence_df = pd.DataFrame.from_dict(feature_dict)

# Share of tokens falling into each category. Every count is at most
# word_count, so the only possible division by zero is 0/0 for a comment with
# no tokens; that yields NaN, which would otherwise flow into the model inputs
# and produce a NaN score. Such ratios are defined as 0.
for col in ['n_upper', 'n_unique', 'n_ex', 'n_que', 'n_puncts', 'n_prof', 'n_oov']:
    sentence_df[col + '_ratio'] = (sentence_df[col] / sentence_df['word_count']).fillna(0.0)

sentence_df.head()

100%|██████████| 7537/7537 [00:00<00:00, 12163.56it/s]


Unnamed: 0,word_count,n_upper,n_unique,n_ex,n_que,n_puncts,n_prof,n_oov,n_upper_ratio,n_unique_ratio,n_ex_ratio,n_que_ratio,n_puncts_ratio,n_prof_ratio,n_oov_ratio
0,117,10,93,0,0,2,0,9,0.08547,0.794872,0.0,0.0,0.017094,0.0,0.076923
1,14,1,14,0,0,1,0,1,0.071429,1.0,0.0,0.0,0.071429,0.0,0.071429
2,76,9,62,0,0,0,0,7,0.118421,0.815789,0.0,0.0,0.0,0.0,0.092105
3,284,40,191,0,0,3,0,26,0.140845,0.672535,0.0,0.0,0.010563,0.0,0.091549
4,26,3,24,0,0,0,0,1,0.115385,0.923077,0.0,0.0,0.0,0.0,0.038462


In [8]:
# Both objects were only needed to build sentence_df; drop the references and
# run a collection pass before the model is loaded.
del feature_dict
del crawl_emb_dict
gc.collect()

21

In [9]:
# Feature columns passed to the scaler, in this exact order: the eight raw
# counts first, then one `<count>_ratio` column for every count except
# `word_count`.
_COUNT_FEATURES = [
    'word_count', 'n_upper', 'n_unique', 'n_ex',
    'n_que', 'n_puncts', 'n_prof', 'n_oov',
]
SENTENCE_FEATURE_USED = _COUNT_FEATURES + [
    count_name + '_ratio' for count_name in _COUNT_FEATURES[1:]
]

In [10]:
class JigsawDataset(Dataset):
    """Pairs each comment's tokenized text with its precomputed feature row.

    Args:
        df: frame with a ``text`` column; its length is the dataset size.
        data: per-row features, indexed by position in step with ``df``.
        tokenizer: object exposing ``encode_plus``.
        max_length: every sequence is truncated / padded to this many tokens.
    """

    def __init__(self, df, data, tokenizer, max_length):
        self.df, self.data = df, data
        self.tokenizer, self.max_len = tokenizer, max_length
        # Plain array of strings so __getitem__ can index by position.
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return a dict with ``ids`` and ``mask`` (long) and ``data`` (float) tensors."""
        comment, feature_row = self.text[index], self.data[index]

        encoded = self.tokenizer.encode_plus(
            comment,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        sample = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'data': torch.tensor(feature_row, dtype=torch.float),
        }
        return sample

In [11]:
from transformers import RobertaModel
import torch.nn.functional as F

# Dropout probability applied to the encoder's pooled output in JigsawModel.
OUT_DROPOUT = 0.3
# NOTE(review): not referenced anywhere in the visible notebook code.
BERT_N_LAST_LAYER = 4
# Width of the encoder's pooled output; JigsawModel adds the sentence-feature
# count to this to size its linear layers (768 for roberta-base).
BERT_HIDDEN_SIZE = 768

# Local directory the encoder weights are loaded from; same path as
# CONFIG['model_name'].
BERT_MODEL_PATH = '../input/roberta-base'

class JigsawModel(nn.Module):
    """RoBERTa encoder fused with hand-crafted sentence features.

    The encoder's pooled output (after dropout) is concatenated with a linear
    projection of the sentence features, passed through one residual
    linear + ReLU layer, and fed to two linear heads.

    Args:
        num_aux_targets: width of the auxiliary head.
        num_sentence_features: number of sentence-feature inputs per sample.
    """

    def __init__(self, num_aux_targets, num_sentence_features):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay as they
        # are for saved checkpoints to load.
        self.bert_model = RobertaModel.from_pretrained(BERT_MODEL_PATH)
        self.dropout = nn.Dropout(OUT_DROPOUT)

        self.linear_sentence1 = nn.Linear(num_sentence_features, num_sentence_features)

        fused_width = BERT_HIDDEN_SIZE + num_sentence_features
        self.linear1 = nn.Linear(fused_width, fused_width)

        self.linear_out = nn.Linear(fused_width, 1)
        self.linear_aux_out = nn.Linear(fused_width, num_aux_targets)

    def forward(self, ids, attention, sentence_features):
        """Return a (batch, 1 + num_aux_targets) tensor; column 0 is the main output."""
        # Element 1 of the encoder output is used as the pooled representation.
        pooled = self.dropout(self.bert_model(ids, attention)[1])

        fused = torch.cat((pooled, self.linear_sentence1(sentence_features)), dim=1)

        # Residual connection around the linear + ReLU layer.
        hidden = fused + F.relu(self.linear1(fused))

        heads = (self.linear_out(hidden), self.linear_aux_out(hidden))
        return torch.cat(heads, dim=1)

In [12]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device):
    """Run the model over every batch in inference mode.

    Args:
        model: module called as ``model(ids, mask, data)``.
        dataloader: iterable of dicts with ``ids``, ``mask`` and ``data`` tensors.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        List with one NumPy array of model outputs per batch, in loader order.
    """
    model.eval()

    batch_outputs = []
    progress = tqdm(enumerate(dataloader), total=len(dataloader))

    for _, batch in progress:
        input_ids = batch['ids'].to(device, dtype=torch.long)
        attention_mask = batch['mask'].to(device, dtype=torch.long)
        features = batch['data'].to(device)

        logits = model(input_ids, attention_mask, features)
        # Move to host memory so results do not accumulate on the device.
        batch_outputs.append(logits.detach().cpu().numpy())

    gc.collect()

    return batch_outputs

In [13]:
from sklearn.preprocessing import StandardScaler
import pickle
import joblib

def prepare_loaders():
    """Build the inference DataLoader over the comments in ``df``.

    Reads the module-level ``df``, ``sentence_df``, ``SENTENCE_FEATURE_USED``
    and ``CONFIG``. The sentence features are passed through a scaler loaded
    from disk before being handed to the dataset.

    Returns:
        DataLoader yielding batches of ``CONFIG['valid_batch_size']`` samples
        in row order.
    """
    # NOTE: joblib.load unpickles the file, so it must come from a trusted source.
    # NOTE(review): this is the seed0/fold0 scaler, while the checkpoint loaded
    # later in the notebook is named Fold-4 — confirm they belong together.
    scaler = joblib.load('../input/jigsawrobertaunintended/scaler-seed0-fold0.joblib')
    scaled_features = scaler.transform(sentence_df[SENTENCE_FEATURE_USED].values)

    dataset = JigsawDataset(
        df,
        scaled_features,
        tokenizer=CONFIG['tokenizer'],
        max_length=CONFIG['max_length'],
    )

    # shuffle=False keeps the outputs aligned with the rows of df.
    return DataLoader(dataset, batch_size=CONFIG['valid_batch_size'], shuffle=False)

In [14]:
# Create Dataloaders
test_loader = prepare_loaders()

# First argument is the width of the auxiliary head (6); the second is the
# number of sentence-feature inputs, which is tied to the feature list instead
# of a literal so the two cannot drift apart.
model = JigsawModel(6, len(SENTENCE_FEATURE_USED))
model.load_state_dict(
    torch.load(
        '../input/jigsawrobertaunintended/Loss-Fold-4_roberta_base_exp.bin',
        # Deserialize onto the CPU so a checkpoint saved from a GPU still loads
        # when CUDA is unavailable (CONFIG['device'] falls back to "cpu"); the
        # model is moved to the target device right after.
        map_location='cpu',
    )
)
model.to(CONFIG['device']);

Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Inference over the whole test set; `preds` is a list with one output array per batch.
preds = valid_one_epoch(model, test_loader, CONFIG['device'])

100%|██████████| 472/472 [00:40<00:00, 11.70it/s]


In [16]:
# Stack the per-batch arrays along the row axis and keep column 0, which
# JigsawModel.forward fills from `linear_out`; the remaining columns come from
# the auxiliary head and are not used here.
out_preds = np.concatenate(preds)[:, 0]

In [17]:
# Build the submission in its own frame and leave `df` untouched, so this cell
# can be re-run without failing on an already-dropped `text` column.
# Constructing the Series against df.index raises if the number of predictions
# does not match the number of comments.
raw_scores = pd.Series(out_preds, index=df.index)

submission = pd.DataFrame({
    'comment_id': df['comment_id'],
    # Scores are replaced by their rank; method='first' breaks ties by row
    # order, so every comment gets a distinct rank.
    'score': raw_scores.rank(method='first'),
})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,comment_id,score
0,114890,643.0
1,732895,302.0
2,1139051,2503.0
3,1434512,204.0
4,2084821,5162.0
