In [3]:
import sys
# Add the parent directory to the module search path — presumably so the
# project-local `jigsaw` package imported in the next cell can be found when the
# notebook is run from its own folder (TODO confirm the repository layout).
sys.path.append('../')

In [4]:
from jigsaw.models.linear_models.base_model import LinearModel, KernelModel, SVRModel
from jigsaw.models.cnn_models.base_model import CnnModel
from jigsaw.models.rnn_models.base_model import RnnModel
from transformers import AutoTokenizer
from jigsaw.utils.tokenizer import Tokenizer
import optuna
from box import Box
from jigsaw.utils.glove import load_glove
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from jigsaw.scripts.training import rnn_train
from jigsaw.scripts.inference import linear_predict
from jigsaw.utils.cleaning import *
import nltk
import pandas as pd
# Download the NLTK 'punkt' resource at notebook start-up; presumably required
# by `word_tokenize` imported above (TODO confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/alexander/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alexander/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alexander/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Experiment configuration, wrapped in a Box so entries can be read and written
# both as attributes (cfg.seed) and as items (cfg['seed']).
cfg = Box(dict(
    seed=42,
    # Logger settings.
    logger=dict(
        save_dir='models',
        project='Jigsaw',
        log_model=True,
    ),
    # Dataset description: which data set is used and which columns hold the
    # text and the target.
    dataset=dict(
        type='regression',  # paired, regression
        name='toxic-comment-preprocessed',
        text_col='comment_text',
        target_col='y',
    ),
    # Model / tokenizer identification and architecture switches.
    model_name='roberta-base',
    load_embeddings=False,
    freeze_embeddings=False,
    rnn_type='lstm',  # only lstm, gru doesn't work for some reasons
    emb_size=100,
    hidden_size=100,
    num_layers=1,
    bidirectional=False,
    max_length=256,
    bucket_seq=True,
    rnn_embeddings=False,
    # Training-loop settings.
    margin=0.5,
    batch_size=16,
    acc_step=1,
    epoch=5,
    num_classes=1,
    optimizer=dict(
        name='optim.AdamW',
        params=dict(
            lr=1e-2,
            weight_decay=1e-5,
        ),
    ),
    scheduler=dict(
        name='get_cosine_schedule_with_warmup',
        params=dict(
            # NOTE(review): a fractional value — presumably interpreted as a
            # share of the total steps by the training script; confirm.
            num_warmup_steps=0.06,
        ),
    ),
    trainer=dict(
        progress_bar_refresh_rate=3,
        num_sanity_val_steps=2,
    ),
))

In [5]:
# Read the three CSV inputs in order: the lemmatized training set, the
# lemmatized validation set and the comments-to-score file.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed_data/jigsaw-toxic-comment_lemmatize.csv',
        '../data/preprocessed_data/validation_data_lemmatize.csv',
        '../data/jigsaw-rate-severity/comments_to_score.csv',
    )
)

In [5]:
# Build a class-balanced, shuffled training subset: rows with a non-zero target
# (`ne`) and an equal number of rows whose target is exactly zero (`ye`).
#
# Every draw is seeded from cfg['seed'] so that Restart & Run All reproduces the
# same subset, and the per-class size is capped by the rows actually available
# so `.sample()` cannot raise on a smaller data set.
nonzero_rows = train_df[train_df['y'] != 0]
zero_rows = train_df[train_df['y'] == 0]
n_per_class = min(10_000, len(nonzero_rows), len(zero_rows))

ne = nonzero_rows.sample(n_per_class, random_state=cfg['seed'])
ye = zero_rows.sample(n_per_class, random_state=cfg['seed'])

# frac=1 returns every row in a (seeded) random order, i.e. a shuffle.
train_df = pd.concat([ne, ye]).sample(frac=1, random_state=cfg['seed'])

In [None]:
def objective(trial):
    """Optuna objective: configure one RNN run from sampled choices and train it.

    Samples the embedding family ('glove' or 'fasttext'), one pretrained vector
    file of that family and the RNN hidden size, writes the choices into the
    module-level ``cfg`` and delegates to ``rnn_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value returned by ``rnn_train``; the study treats it as the score
        to optimize.

    Notes:
        * ``cfg`` is mutated in place, so the settings of the last trial remain
          in ``cfg`` after the study finishes.
        * The vectors root directory defaults to the original absolute path and
          can be overridden with the ``JIGSAW_VECTORS_DIR`` environment
          variable.
    """
    import os  # local import: only needed to resolve the vectors directory

    vectors_dir = os.environ.get(
        'JIGSAW_VECTORS_DIR', '/Users/alexander/Documents/jigsaw/vectors'
    )
    # Candidate vector files per embedding family, relative to `vectors_dir`.
    embedding_files = {
        'glove': {
            'twitter_27B_25d': 'glove/glove.twitter.27B.25d.txt',
            'twitter_27B_50d': 'glove/glove.twitter.27B.50d.txt',
            'twitter_27B_100d': 'glove/glove.twitter.27B.100d.txt',
            'twitter_27B_200d': 'glove/glove.twitter.27B.200d.txt',
            '840B_300d': 'glove/glove.840B.300d.txt',
        },
        'fasttext': {
            'model_100_5': 'fast_text/model_100_5.bin',
            'model_200_5': 'fast_text/model_200_5.bin',
            'model_300_5': 'fast_text/model_300_5.bin',
        },
    }

    emb_type = trial.suggest_categorical('emb_type', list(embedding_files))
    # Registered as 'hidden_size' (it used to be labelled 'emb_size') so that
    # study.best_params reports the value under the setting it actually drives.
    hidden_size = trial.suggest_categorical('hidden_size', [100, 200, 300])
    # Which tokenizer to use: 'own' fits the project Tokenizer on train_df,
    # anything else loads the pretrained tokenizer named by cfg['model_name'].
    tokenizer = 'pretrained'

    cfg.emb_type = emb_type
    cfg.load_embeddings = True
    cfg.freeze_embeddings = True
    cfg.hidden_size = hidden_size
    cfg.num_layers = 2
    cfg.bidirectional = True

    train_kwargs = {'checkpoint_args': ['optuna']}
    if tokenizer == 'own':
        own_tokenizer = Tokenizer()
        own_tokenizer.fit(train_df)
        cfg['tokenizer'] = own_tokenizer
        # The original code limits the number of training batches only for the
        # self-fitted tokenizer.
        train_kwargs['limit_train_batches'] = 1245
    else:
        cfg['tokenizer'] = AutoTokenizer.from_pretrained(cfg['model_name'])

    # The parameter name depends on the family ('emb_path_glove' or
    # 'emb_path_fasttext') because each family has its own set of choices.
    family_files = embedding_files[emb_type]
    emb_path_key = trial.suggest_categorical(
        f'emb_path_{emb_type}', list(family_files)
    )
    cfg.emb_path = f'{vectors_dir}/{family_files[emb_path_key]}'
    # NOTE(review): cfg.emb_size keeps its configured value although the file
    # names suggest vector sizes from 25 to 300 — confirm that rnn_train takes
    # the embedding size from the loaded vectors rather than from cfg.emb_size.

    return rnn_train(cfg, train_df, val_df, **train_kwargs)

# Run the hyperparameter search, maximizing the value returned by `objective`.
# TPE is the sampler Optuna uses by default for single-objective studies; it is
# created explicitly here only so it can be seeded from cfg['seed'], which makes
# the sampler's random draws repeatable across notebook runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=cfg['seed']),
)
study.optimize(objective, n_trials=100)