In [None]:
import errno
import os
import json
import numpy as np
import torch

def assert_exits(path):
    """Assert that ``path`` exists on disk.

    Raises:
        AssertionError: with the message 'Does not exist : <path>' when
            ``os.path.exists(path)`` is false.
    """
    found = os.path.exists(path)
    assert found, 'Does not exist : {}'.format(path)
    
def equal_info(a,b):
    """Assert that ``a`` and ``b`` have the same length.

    Raises:
        AssertionError: with the message 'File info not equal!' otherwise.
    """
    same_length = len(a) == len(b)
    assert same_length, 'File info not equal!'
    
def same_question(a,b):
    """Assert that ``a`` and ``b`` compare equal.

    Raises:
        AssertionError: with the message 'Not the same question!' otherwise.
    """
    identical = a == b
    assert identical, 'Not the same question!'
    
class Logger(object):
    """Collects named scalar values and writes their means to a log file."""

    def __init__(self, output_dir):
        """Open the log file for writing, truncating any existing content.

        Args:
            output_dir: despite the name, this is the path of the log *file*.
                Its parent directory is created when it does not exist yet.
        """
        dirname = os.path.dirname(output_dir)
        # dirname is '' for a bare file name; there is nothing to create then
        # (os.mkdir('') would raise). makedirs also handles nested parents.
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        self.log_file = open(output_dir, 'w')
        self.infos = {}

    def append(self, key, val):
        """Record ``val`` under ``key`` until the next call to ``log``."""
        self.infos.setdefault(key, []).append(val)

    def log(self, extra_msg=''):
        """Write ``extra_msg`` followed by one '<key> <mean>' line per key.

        The accumulated values are cleared afterwards.

        Returns:
            The message that was written (without the trailing newline).
        """
        msgs = [extra_msg]
        # dict.iteritems() was Python 2 only; items() is the Python 3 spelling.
        for key, vals in self.infos.items():
            msgs.append('%s %.6f' % (key, np.mean(vals)))
        msg = '\n'.join(msgs)
        self.log_file.write(msg + '\n')
        self.log_file.flush()
        self.infos = {}
        return msg

    def write(self, msg):
        """Write ``msg`` to the log file and echo it to stdout."""
        self.log_file.write(msg + '\n')
        self.log_file.flush()
        print(msg)

    def close(self):
        """Close the underlying log file."""
        self.log_file.close()

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to plain Python values."""

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float`` and
        arrays become (nested) lists; anything else is delegated to the base
        encoder, which raises ``TypeError``.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)


def attention_dump(opt,model,test_set, idx):
    """Run every task head over ``test_set`` and dump attentions to JSON.

    For each of ``len(test_set)`` batches, the tensors of the batch are moved
    to the GPU (``.cuda()``) and ``model`` is called once per task index; it
    is expected to return a ``(pred, weights)`` pair. The collected words,
    weights, predictions and labels are handed to ``write_json`` together with
    the first task name and ``idx``.

    Args:
        opt: options object; ``opt.TASKS`` is a comma-separated task list.
        model: callable ``model(tokens, task_idx, bert_tokens, masks, att_masks)``.
        test_set: batch loader providing ``__len__`` and ``next_batch()``;
            each batch is a dict with the keys 'tokens', 'label',
            'bert_tokens', 'masks', 'att_masks' and 'words'.
        idx: cross-validation index, used in the output file name.
    """
    tasks = opt.TASKS.split(',')
    total = len(test_set)
    words_list = []

    # One bucket per task. Never fewer than the four buckets this function has
    # always produced, so consumers reading the keys "0".."3" keep working,
    # and no KeyError when more than four tasks are configured.
    bucket_count = max(4, len(tasks))
    weights_info = {str(k): [] for k in range(bucket_count)}
    preds_info = {str(k): [] for k in range(bucket_count)}

    # Labels are gathered per batch and concatenated once at the end instead
    # of re-concatenating the running tensor on every iteration.
    label_batches = []

    with torch.no_grad():
        for _ in range(total):
            batch_info = test_set.next_batch()
            tokens = batch_info['tokens'].cuda()
            labels = batch_info['label'].float().cuda()
            bert_tokens = batch_info['bert_tokens'].cuda()
            masks = batch_info['masks'].cuda()
            att_masks = batch_info['att_masks'].cuda()
            words_list.extend(batch_info['words'])
            for task_idx in range(len(tasks)):
                pred, weights = model(tokens, task_idx, bert_tokens, masks, att_masks)
                weights_info[str(task_idx)].extend(weights.cpu().numpy())
                preds_info[str(task_idx)].extend(pred.cpu().numpy())
            label_batches.append(labels)

    if label_batches:
        t1_labels = torch.cat(label_batches, 0).cpu().numpy()
    else:
        # Empty loader: previously t1_labels was never bound (NameError).
        t1_labels = np.empty((0,), dtype=np.float32)
    write_json(words_list, weights_info, preds_info, t1_labels, tasks[0], idx)


def write_json(sent_tokens, weights, preds, labels, source,cv_idx):
    """Write one JSON record per sentence to ``weights/<source><cv_idx>.attentions.json``.

    Args:
        sent_tokens: list with the words of every sentence.
        weights: dict mapping task keys ("0", "1", ...) to per-sentence
            attention weights.
        preds: dict with the same keys holding per-sentence predictions.
        labels: per-sentence label vectors; the argmax of each is stored.
        source: task name used as the file-name prefix.
        cv_idx: cross-validation index appended to ``source``.

    Each record has the keys "words", "weights", "predictions" and "label".
    "weights" and "predictions" contain one entry per task, in task order.
    """
    # Use the tasks that were actually run instead of the fixed keys "0".."3":
    # buckets that are completely empty belong to tasks that do not exist in
    # the current configuration and would otherwise raise IndexError.
    # Sorting by (length, text) keeps "2" before "10".
    task_keys = [key for key in sorted(weights, key=lambda k: (len(k), k))
                 if len(weights[key]) > 0]

    items = []
    for idx in range(len(sent_tokens)):
        weights_tuple = tuple(weights[key][idx] for key in task_keys)
        preds_tuple = tuple(preds[key][idx] for key in task_keys)
        items.append({
            "words": sent_tokens[idx],
            "weights": weights_tuple,
            "predictions": preds_tuple,
            "label": np.argmax(labels[idx]),
        })

    # The output directory is not guaranteed to exist on a fresh runtime.
    os.makedirs("weights", exist_ok=True)
    with open("weights/{}.attentions.json".format(source+str(cv_idx)), "w") as f:
        json.dump(items, f, cls=NpEncoder)

In [None]:
# Download the master-branch archive of the angrybert GitLab repository and save it as repo.zip in the current directory.
!curl 'https://gitlab.com/bottle_shop/safe/angrybert/-/archive/master/angrybert-master.zip' --output 'repo.zip'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  254M    0  254M    0     0  5190k      0 --:--:--  0:00:50 --:--:-- 7311k


In [None]:
# Download the 300-dimensional GloVe 6B vectors. The file is stored under the
# path that GLOVE_PATH in parse_opt() defaults to; it was previously saved as
# 'glove.6B.100d.txt', which mislabels the dimension and is never read.
!curl 'https://ia803006.us.archive.org/1/items/glove.6B.50d-300d/glove.6B.300d.txt' --output '/content/glove.6B.300d.txt'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  989M  100  989M    0     0  37.4M      0  0:00:26  0:00:26 --:--:-- 39.3M


In [None]:
# Extract the downloaded repository archive; the listing below shows it unpacks into angrybert-master/.
!unzip '/content/repo.zip'

Archive:  /content/repo.zip
4812983eace58fd7e2f3b5dce115e3967dea4ba4
   creating: angrybert-master/
 extracting: angrybert-master/.gitignore  
   creating: angrybert-master/Bert-MTL/
   creating: angrybert-master/Bert-MTL/.ipynb_checkpoints/
  inflating: angrybert-master/Bert-MTL/.ipynb_checkpoints/samples-checkpoint.ipynb  
  inflating: angrybert-master/Bert-MTL/attention.py  
  inflating: angrybert-master/Bert-MTL/baseline.py  
  inflating: angrybert-master/Bert-MTL/classifier.py  
  inflating: angrybert-master/Bert-MTL/config.py  
  inflating: angrybert-master/Bert-MTL/dataset.py  
   creating: angrybert-master/Bert-MTL/dictionary/
  inflating: angrybert-master/Bert-MTL/dictionary/dictionary.pkl  
  inflating: angrybert-master/Bert-MTL/dictionary/glove_embedding.npy  
   creating: angrybert-master/Bert-MTL/dt/
  inflating: angrybert-master/Bert-MTL/dt/dictionary.pkl  
  inflating: angrybert-master/Bert-MTL/dt/final_0.txt  
  inflating: angrybert-master/Bert-MTL/dt/final_1.txt  
  in

In [None]:
import argparse 

def parse_opt(args=None):
    """Build the experiment options.

    Args:
        args: optional list of command-line style strings, e.g.
            ``['--EPOCHS', '3']``. When omitted, an empty list is parsed so
            that every option takes its default; this is required inside a
            notebook, where ``sys.argv`` belongs to the kernel.

    Returns:
        argparse.Namespace holding all options.
    """

    def _str2bool(value):
        # argparse's ``type=bool`` calls bool() on the raw string, so any
        # non-empty text, including "False", would be parsed as True.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('Expected a boolean value, got {!r}'.format(value))

    parser = argparse.ArgumentParser()

    # Supported values of --MODEL:
    #   basic: shared and private rnns, concatenation of both to the fc layer
    #   dnn: shared and private embeddings, private rnn (mt-dnn similar)
    #   cnn: shared and private embeddings, private cnn
    #   uniform: different embeddings, shared rnn, 2016 IJCAI first baseline
    #   local: 2016 IJCAI, shared layer in the paper
    #   sp-mtl: 2016 IJCAI, for the implementation of global fusion
    #   mtl-gatedencoder: joint modeling network, Rajamanickam et al. 20
    #   shared-bert: bert baseline model for multitask shared learning
    #   angrybert: AngryBERT model, this is our proposed model
    #   angrybert-attn: AngryBERT model with attention on top, visualization purpose
    parser.add_argument('--MODEL', type=str, default='angrybert')

    # ---- path configuration ----
    parser.add_argument('--GLOVE_PATH', type=str, default='/content/glove.6B.300d.txt')
    # paths for pre-processing and result saving
    parser.add_argument('--DT', type=str, default='./dt')
    parser.add_argument('--WZ_RESULT', type=str, default='./wz')
    parser.add_argument('--FOUNTA_RESULT', type=str, default='./founta')
    parser.add_argument('--HATELINGO_RESULT', type=str, default='./hatelingo')
    parser.add_argument('--DICT_INFO', type=str, default='./dictionary')
    # path for the split dataset
    parser.add_argument('--SPLIT_DATASET', type=str, default='/content/angrybert-master/resource')

    # ---- hyper-parameter configuration ----
    parser.add_argument('--EMB_DROPOUT', type=float, default=0.5)
    parser.add_argument('--FC_DROPOUT', type=float, default=0.2)
    parser.add_argument('--MIN_OCC', type=int, default=3)
    parser.add_argument('--BATCH_SIZE', type=int, default=60)
    parser.add_argument('--EMB_DIM', type=int, default=300)
    parser.add_argument('--MID_DIM', type=int, default=256)
    parser.add_argument('--PROJ_DIM', type=int, default=32)
    parser.add_argument('--NUM_HIDDEN', type=int, default=128)
    parser.add_argument('--NUM_FILTER', type=int, default=150)
    parser.add_argument('--FILTER_SIZE', type=str, default="2,3,4")
    parser.add_argument('--NUM_LAYER', type=int, default=1)
    parser.add_argument('--BIDIRECT', type=_str2bool, default=True)
    parser.add_argument('--L_RNN_DROPOUT', type=float, default=0.1)

    # --TASKS is a comma-separated string drawn from:
    #   wz, dt, founta, hatelingo, offenseval_c, semeval_a
    # The first entry is the hate speech detection dataset.
    parser.add_argument('--TASKS', type=str, default='hatelingo,offenseval_c')
    parser.add_argument('--LENGTH', type=int, default=30)

    parser.add_argument('--CREATE_DICT', type=_str2bool, default=True)
    parser.add_argument('--CREATE_EMB', type=_str2bool, default=True)
    parser.add_argument('--SAVE_NUM', type=int, default=1)
    parser.add_argument('--EPOCHS', type=int, default=6)
    parser.add_argument('--CROSS_VAL', type=int, default=5)

    parser.add_argument('--SEED', type=int, default=1112, help='random seed')
    parser.add_argument('--CUDA_DEVICE', type=int, default=0)

    parser.add_argument('--EVAL_ITERS', type=int, default=180)
    parser.add_argument('--TOTAL_ITERS', type=int, default=1800)

    return parser.parse_args(args=[] if args is None else args)

In [None]:
# Refresh the package index, then install the system Enchant library
# (the pyenchant Python package is installed in the next cell).
!apt update
!apt install enchant --fix-missing
!apt install -qq enchant

[33m0% [Working][0m            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.1[0m[33m0% [1 InRelease gpgv 1,581 B] [Connecting to archive.ubuntu.com (185.125.190.39[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
[33m0% [1 InRelease gpgv 1,581 B] [Connecting to archive.ubuntu.com (185.125.190.39[0m                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
[33m0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [3 InRelease 14.2 kB/88.7 k[0m                                                                               Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
[33m0% [1 InRelease gpg

In [None]:
# Python packages used by the text-cleaning cell: wordninja, nltk and
# (presumably the provider of the `splitter` module) compound-word-splitter; plus pyenchant.
!pip install wordninja compound-word-splitter nltk pyenchant

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[K     |████████████████████████████████| 541 kB 4.3 MB/s 
[?25hCollecting compound-word-splitter
  Downloading compound-word-splitter-0.4.tar.gz (2.1 kB)
Collecting pyenchant
  Downloading pyenchant-3.2.2-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.9 MB/s 
Building wheels for collected packages: wordninja, compound-word-splitter
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541551 sha256=d711e3c2a755ab43f950406bd0df469b835c3769bb9f3d3fb2998c3a5da5ab56
  Stored in directory: /root/.cache/pip/wheels/dd/3f/eb/a2692e3d2b9deb1487b09ba4967dd6920bd5032bfd9ff7acfc
  Building wheel for compound-word-splitter (setup.py) ... [?25l[?25hdone
  Created wheel for compound-word-splitter: filename=compound_wor

In [None]:
import json
import pickle
import re
import string

import wordninja
import splitter
from nltk.tokenize.treebank import TreebankWordTokenizer
import numpy as np

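# Characters that handle_punctuation() deletes from the text (punctuation, symbols, emoji, non-Latin letters, control characters)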
symbols_to_delete  = '@＼・ω+=”“[]^>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁.,?!;*"…:—()%$&/\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლ
ڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'
# Treebank-style word tokenizer; used by handle_contractions().
tokenizer = TreebankWordTokenizer()
#isolate_dict = {ord(c): ' {} '.format(c) for c in symbols_to_isolate}
# Translation table for str.translate(): maps the code point of every
# character in symbols_to_delete to '', i.e. the character is dropped.
remove_dict = {ord(c): '' for c in symbols_to_delete}
def handle_contractions(x):
    """Tokenize the string ``x`` with the module-level Treebank tokenizer.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize``.
    """
    return tokenizer.tokenize(x)

def handle_punctuation(x):
    """Return ``x`` with every character listed in ``remove_dict`` deleted."""
    return x.translate(remove_dict)

def fix_quote(x):
    """Strip one leading apostrophe from each token and join with spaces.

    Args:
        x: iterable of string tokens.

    Returns:
        A single space-joined string.
    """
    cleaned = []
    for piece in x:
        if piece.startswith("'"):
            piece = piece[1:]
        cleaned.append(piece)
    return ' '.join(cleaned)

def preprocess(x):
    """Apply the character-level clean-up to ``x``.

    Only punctuation/symbol removal is active; tokenization and quote fixing
    are not applied.
    """
    return handle_punctuation(x)

fill = {"ain't": "is not", "aren't": "are not", "can't": "cannot",
        "can't've": "cannot have", "'cause": "because", "could've": "could have",
        "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
        "doesn't": "does not", "don't": "do not", "hadn't": "had not",
        "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
        "he'll've": "he he will have", "he's": "he is", "how'd": "how did",
        "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
        "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
        "I'll've": "I will have", "I'm": "I am", "I've": "I have",
        "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
        "i'll've": "i will have", "i'm": "i am", "i've": "i have",
        "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
        "it'll": "it will", "it'll've": "it will have", "it's": "it is",
        "let's": "let us", "ma'am": "madam", "mayn't": "may not",
        "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
        "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
        "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
        "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
        "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
        "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
        "she's": "she is", "should've": "should have", "shouldn't": "should not",
        "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
        "this's": "this is",
        "that'd": "that would", "that'd've": "that would have", "that's": "that is",
        "there'd": "there would", "there'd've": "there would have", "there's": "there is",
        "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are", "they've": "they have",
        "to've": "to have", "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
        "we're": "we are", "we've": "we have", "weren't": "were not",
        "what'll": "what will", "what'll've": "what will have", "what're": "what are",
        "what's": "what is", "what've": "what have", "when's": "when is",
        "when've": "when have", "where'd": "where did", "where's": "where is",
        "where've": "where have", "who'll": "who will", "who'll've": "who will have",
        "who's": "who is", "who've": "who have", "why's": "why is",
        "why've": "why have", "will've": "will have", "won't": "will not",
        "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
        "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
        "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
        "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
        "you'll've": "you will have", "you're": "you are", "you've": "you have", "u.s.": "united states",
        "#lol": "laughing out loud", "#lamo": "laughing my ass off", "#rof": "rolling on the floor laughing",
        "#covfefe": "ironic", "wtf": "what the fuck", "#wtf": "what the fuck",
        "tbh": "to be honest"}

# Expansions for slang and chat abbreviations. clean_text() looks tokens up
# here first verbatim and then lower-cased, after the `fill` replacements.
slang = {
    "4ward": "forward",
    "brb": "be right back",
    "b4": "before",
    "bfn": "bye for now",
    "bgd": "background",
    "btw": "by the way",
    "br": "best regards",
    "clk": "click",
    "da": "the",
    "deet": "detail",
    "deets": "details",
    "dm": "direct message",
    "f2f": "face to face",
    "ftl": "for the loss",  # was " for the loss" (stray leading space)
    "ftw": "for the win",
    "f**k": "fuck",
    "f**ked": "fucked",
    "b***ch": "bitch",
    "kk": "cool cool",
    "kewl": "cool",
    "smh": "so much hate",
    "yaass": "yes",
    "a$$": "ass",
    "bby": "baby",
    "bc": "because",
    "coz": "because",
    "cuz": "because",
    "cause": "because",
    "cmon": "come on",
    "cmonn": "come on",
    "dafuq": "what the fuck",
    "dafuk": "what the fuck",
    "dis": "this",
    "diss": "this",
    "ma": "my",
    "dono": "do not know",
    "donno": "do not know",
    "dunno": "do not know",
    "fb": "facebook",
    "couldnt": "could not",
    "n": "and",
    "gtg": "got to go",
    "yep": "yes",
    "yw": "you are welcome",
    "im": "i am",
    "youre": "you are",
    "hes": "he is",
    "shes": "she is",
    "theyre": "they are",
    "af": "as fuck",
    "fam": "family",
    "fwd": "forward",
    "ffs": "for fuck sake",
    "fml": "fuck my life",
    "lol": "laugh out loud",
    "lel": "laugh out loud",
    "lool": "laugh out loud",
    "lmao": "laugh my ass off",
    "lmaoo": "laugh my ass off",
    "omg": "oh my god",
    "oomg": "oh my god",
    "omgg": "oh my god",
    "omfg": "oh my fucking god",
    "stfu": "shut the fuck up",
    "awsome": "awesome",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "ily": "i love you",
    "ilyy": "i love you",
    "ikr": "i know right",
    "ikrr": "i know right",
    "idk": "i do not know",
    "jk": "joking",
    "lmk": "let me know",
    "nsfw": "not safe for work",
    "hehe": "haha",
    "tmrw": "tomorrow",
    "yt": "youtube",
    "hahaha": "haha",
    "hihi": "haha",
    "pls": "please",
    "ppl": "people",
    "wtf": "what the fuck",
    "wth": "what the hell",  # was "what teh hell"
    "obv": "obviously",
    "nomore": "no more",
    "u": "you",
    "ur": "your",
    "wanna": "want to",
    "luv": "love",
    "imma": "i am",
    "&": "and",
    "thanx": "thanks",
    "til": "until",
    "till": "until",
    "thx": "thanks",
    "pic": "picture",
    "pics": "pictures",
    "gp": "doctor",
    "xmas": "christmas",
    "rlly": "really",
    "boi": "boy",
    "boii": "boy",
    "rly": "really",
    "whch": "which",
    "awee": "awesome",  # was the misspelling "awsome"; lookups are single-pass, so it was never corrected
    "sux": "sucks",
    "nd": "and",
    "fav": "favourite",
    "frnds": "friends",
    "info": "information",
    "loml": "love of my life",
    "bffl": "best friend for life",
    "gg": "good game",  # was "goog game"
    "xx": "love",
    "xoxo": "love",
    "thats": "that is",
    "homie": "best friend",
    "homies": "best friends"
}


def word_splitter(token):
    """Split a one-word hashtag into its component words.

    The leading character of ``token`` (the '#') is dropped and the rest is
    segmented with both ``splitter.split`` and ``wordninja.split``; the
    segmentation with fewer pieces is used (wordninja's on a tie).

    Returns:
        The pieces joined by single spaces, or ``token`` unchanged when no
        segmentation was found or the segmentation is implausibly fine.
    """
    body = token[1:]
    _splits = splitter.split(body)
    _ninjas = wordninja.split(body)

    word_splits = _splits if len(_ninjas) > len(_splits) else _ninjas

    # Reject segmentations with more pieces than half the token length
    # (mostly one-letter fragments). The original `len(token) + 1 / 2`
    # evaluated as `len(token) + 0.5`, which a segmentation of token[1:]
    # can never exceed, so this guard never fired.
    if (len(token) + 1) / 2 < len(word_splits):
        return token

    if word_splits:
        return ' '.join(word_splits)
    return token


def hashtag_solver(token):
    """Turn a hashtag token into plain words.

    * Tokens not starting with '#' are returned unchanged.
    * An all-uppercase hashtag just loses its '#'.
    * A CamelCase hashtag is split before its capitals.
    * Anything else is passed (including the '#') to ``word_splitter``.
    """
    if not token.startswith('#'):
        return token

    body = token[1:]
    if body.isupper():
        return body

    # Insert a space before a capital that follows a lowercase letter, or
    # before a non-initial capital that is followed by a lowercase letter.
    spaced = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', body)
    pieces = spaced.split()
    if len(pieces) > 1:
        return ' '.join(pieces)
    return word_splitter(token)

def normalize_word(word):
    """Collapse runs of three or more identical ASCII letters down to two.

    The triple-to-double substitution is repeated until the word stops
    changing, e.g. 'cooool' -> 'cool'.
    """
    triple = re.compile(r"([a-zA-Z])\1\1")
    current = word
    collapsed = triple.sub(r"\1\1", current)
    while collapsed != current:
        current = collapsed
        collapsed = triple.sub(r"\1\1", current)
    return collapsed


def remove_emoji(string):
    """Delete every character of ``string`` that falls in the listed code-point ranges.

    Note that the last range (U+24C2 to U+1F251) is very wide and covers far
    more than emoji.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', string)


def refer_normalize(tokens):
    """Collapse runs of consecutive '@' tokens, keeping only the last of each run.

    A token is dropped when it starts with '@' and the token right after it
    also starts with '@'.
    """
    last = len(tokens) - 1
    return [
        tok for pos, tok in enumerate(tokens)
        if not (pos != last and tok.startswith("@") and tokens[pos + 1].startswith("@"))
    ]


def clean_text(text):
    """Normalise a raw tweet into lower-cased, mostly punctuation-free text.

    Steps, in order: drop URLs, normalise apostrophes, expand '&amp;', turn
    newlines into spaces, drop IPv4-looking numbers, collapse runs of
    mentions, expand contractions (``fill``) and slang (``slang``), split
    hashtags, shorten letter runs, drop tokens that are a single punctuation
    mark, remove "'s" and apostrophes, replace mentions by '<USER>', strip
    emoji and HTML numeric entities, remove 'RT :', delete the characters in
    ``symbols_to_delete`` and lower-case the result.

    Args:
        text: the raw tweet.

    Returns:
        The cleaned string.
    """
    # remove url
    text = re.sub(r'http\S+', '', text)

    # fixing apostrophe
    text = text.replace("’", "'")

    # replace &amp;
    text = text.replace('&amp;', 'and ')

    # Replace newlines by a space. Deleting them outright glued the last word
    # of one line to the first word of the next.
    text = re.sub(r"\n", " ", text)

    # remove leaky elements like ip,user
    # (raw string: "\d" and "\." are invalid escapes in a normal literal)
    text = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", text)

    # apostrophe replacement, e.g. you're --> you are, followed by slang
    # expansion; every dictionary is tried verbatim and then lower-cased.
    tokens = refer_normalize(text.split())
    tokens = [fill[word] if word in fill else word for word in tokens]
    tokens = [fill[word.lower()] if word.lower() in fill else word for word in tokens]
    tokens = [slang[word] if word in slang else word for word in tokens]
    tokens = [slang[word.lower()] if word.lower() in slang else word for word in tokens]
    tokens = [hashtag_solver(word) for word in tokens]
    tokens = [normalize_word(word) for word in tokens]
    # Only tokens that ARE a single punctuation character are dropped here.
    exclude = set(string.punctuation)
    text = ' '.join(ch for ch in tokens if ch not in exclude)

    # remove possessive "'s" and any remaining apostrophes
    text = re.sub("'s", "", text)
    text = re.sub("'", "", text)
    # replace usernames by a placeholder
    mention_pattern = re.compile(r'@\w*')
    text = re.sub(pattern=mention_pattern, repl='<USER>', string=text)
    # emoji remover
    text = remove_emoji(text)

    # remove HTML numeric entities such as &#128512;
    text = re.sub(r"&#\S+", "", text)
    text = re.sub("RT :", "", text)
    # strip unwanted symbols, then lower-case
    text = preprocess(text).lower()
    return text


def dump(data):
    """Serialise ``data`` to a JSON string, converting NumPy values on the way.

    NumPy arrays become lists and NumPy scalars become the matching Python
    scalar.

    Returns:
        The JSON text. (The original computed it and threw it away, always
        returning None.)

    Raises:
        TypeError: if ``data`` contains an object that is neither natively
            JSON-serialisable nor a NumPy value.
    """
    def default(obj):
        # Types defined by NumPy report 'numpy' as their module.
        if type(obj).__module__ == np.__name__:
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            else:
                return obj.item()
        raise TypeError('Unknown type:', type(obj))

    return json.dumps(data, default=default)


if __name__ == '__main__':
    # Smoke test of clean_text() on a sample tweet. The result is bound to
    # `cleaned` rather than `re`: rebinding `re` replaced the regex module
    # with a string, breaking every later call to clean_text().
    txt = "@Jeremy_Hunt @yttr trying to sneak Junior doctors contracts through while media focussed on Chilcott?? You utter cunt. #Fuckyouandyourtoryparty"
    cleaned = clean_text(txt)
    print(cleaned)

user trying to sneak junior doctors contracts through while media focussed on chilcott you utter cunt fuck you and your tory party


In [None]:
import os
import pandas as pd
import re
import json
import pickle as pkl
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import itertools
import random
import math
import string

def load_pkl(path):
    """Load and return the object pickled in the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    # Context manager so the handle is closed even if unpickling fails.
    with open(path,'rb') as f:
        return pkl.load(f)

def read_hdf5(path):
    """Open the HDF5 file at ``path`` read-only and return the ``h5py.File``.

    The caller is responsible for closing the returned file object.
    """
    # h5py.File accepts the modes r, r+, w, w-/x and a; 'rb' is rejected.
    data=h5py.File(path,'r')
    return data

def read_csv(path):
    """Read a comma-separated file into a pandas DataFrame."""
    return pd.read_csv(path)

def read_csv_sep(path):
    """Read a tab-separated file into a pandas DataFrame."""
    return pd.read_csv(path, sep='\t')

def dump_pkl(path,info):
    """Pickle ``info`` into the file at ``path`` (overwriting it)."""
    # Context manager so the data is flushed and the handle closed on return.
    with open(path,'wb') as f:
        pkl.dump(info,f)

def read_json(path):
    """Parse the JSON file at ``path`` and return its content.

    Original author's note: in anet-qa this returns a list.

    Raises:
        AssertionError: if ``path`` does not exist (via ``assert_exits``).
    """
    assert_exits(path)
    # Context manager so the handle is closed after parsing.
    with open(path,'rb') as f:
        data=json.load(f)
    return data

def pd_pkl(path):
    """Load a pickled pandas object from ``path`` via ``pd.read_pickle``."""
    return pd.read_pickle(path)

def read_jsonl(path):
    """Read a JSON-lines file and return the list of decoded records.

    Every line of the file is parsed as one JSON document, in file order.
    """
    with open(path,'rb') as handle:
        raw_lines = handle.readlines()
    return [json.loads(raw) for raw in raw_lines]

class Base_Op(object):
    """Vocabulary and embedding helper shared by the dataset classes.

    Builds (or loads) the word <-> index dictionary for the configured tasks
    and creates the matching GloVe embedding matrix. Both artefacts are
    stored in a directory named after the first task.
    """

    def __init__(self):
        self.opt = parse_opt()
        self.tasks = self.opt.TASKS.split(',')

    def tokenize(self, x):
        """Lower-case ``x`` and split it on whitespace."""
        return x.lower().split()

    def get_tokens(self, sent):
        """Map a sentence to vocabulary indices.

        Unknown words map to the index of 'UNK'; an empty sentence yields a
        single 'UNK' index.
        """
        token_num = [
            self.word2idx[t] if t in self.word2idx else self.word2idx['UNK']
            for t in self.tokenize(sent)
        ]
        if not token_num:
            token_num.append(self.word2idx['UNK'])
        return token_num

    def get_words(self, sent):
        """Return the tokens of ``sent`` with unknown words replaced by 'UNK'."""
        return [t if t in self.word2idx else 'UNK' for t in self.tokenize(sent)]

    def token_sent(self):
        """Count words over all task datasets and build the dictionary.

        Words occurring at least ``opt.MIN_OCC`` times receive consecutive
        indices; the result is pickled to ``<first task>/dictionary.pkl``.
        """
        datasets = []
        for dataset in self.tasks:
            name = os.path.join(self.opt.SPLIT_DATASET, dataset) + '.pkl'
            # Context manager so the handle is not leaked.
            with open(name, 'rb') as f:
                datasets.append(pkl.load(f))
        print ('Total number of datasets:',len(self.tasks))

        for i in range(6):
            cur_total = []
            for data in datasets:
                try:
                    cur_total.extend(data[str(i)])
                except (KeyError, IndexError):
                    # A dataset simply has no split with this number; any
                    # other error is no longer swallowed silently.
                    continue

            for info in cur_total:
                for t in self.tokenize(info['sent']):
                    self.word_count[t] = self.word_count.get(t, 0) + 1
            print ('Length of current sentences:',len(cur_total))

        cur = 0
        for word, count in self.word_count.items():
            if count >= self.opt.MIN_OCC:
                self.word2idx[word] = cur
                self.idx2word.append(word)
                cur += 1

        # NOTE(review): this branch tests for 'PAD' but registers 'UNK', and it
        # gives 'UNK' index 0, which is already used by the first vocabulary
        # word, while appending 'UNK' at the END of idx2word. Left untouched
        # because changing it alters the index layout of saved dictionaries
        # and trained models; confirm the intent before fixing.
        if 'PAD' not in self.word2idx:
            self.idx2word.append('UNK')
            self.word2idx['UNK'] = 0

        if 'UNK' not in self.word2idx:
            self.idx2word.append('UNK')
            self.word2idx['UNK'] = len(self.idx2word) - 1

        # The task directory may not exist yet on a fresh runtime.
        os.makedirs(self.tasks[0], exist_ok=True)
        dump_pkl(os.path.join(self.tasks[0],'dictionary.pkl'),[self.word2idx,self.idx2word])

    def create_dict(self):
        """Reset the vocabulary state and rebuild it from the datasets."""
        self.word_count = {}
        self.word2idx = {}
        self.idx2word = []
        self.token_sent()

    def create_embedding(self):
        """Build the embedding matrix for ``idx2word`` from the GloVe file.

        Row ``i`` holds the GloVe vector of ``idx2word[i]``; words without a
        GloVe entry keep an all-zero row. The matrix is saved to
        ``<first task>/glove_embedding.npy``.

        Returns:
            float32 array of shape ``(len(idx2word), emb_dim)``.

        Raises:
            ValueError: if the GloVe file is empty.
        """
        # The file is streamed and only vectors of vocabulary words are kept,
        # instead of holding all lines plus every vector in memory at once.
        wanted = set(self.idx2word)
        word2emb = {}
        emb_dim = None
        with open(self.opt.GLOVE_PATH, 'r', encoding='utf-8') as f:
            for entry in f:
                fields = entry.split(' ')
                if emb_dim is None:
                    # The dimension is taken from the first line.
                    emb_dim = len(fields) - 1
                if fields[0] in wanted:
                    word2emb[fields[0]] = np.array(list(map(float, fields[1:])))
        if emb_dim is None:
            raise ValueError('Empty embedding file: {}'.format(self.opt.GLOVE_PATH))

        weights = np.zeros((len(self.idx2word), emb_dim), dtype=np.float32)
        for idx, word in enumerate(self.idx2word):
            if word in word2emb:
                weights[idx] = word2emb[word]

        os.makedirs(self.tasks[0], exist_ok=True)
        np.save(os.path.join(self.tasks[0],'glove_embedding.npy'),weights)
        return weights

    def init_dict(self):
        """Create or load the dictionary, optionally create the embedding."""
        if self.opt.CREATE_DICT:
            print ('Creating Dictionary...')
            self.create_dict()
        else:
            created_dict = load_pkl(os.path.join(self.tasks[0],'dictionary.pkl'))
            self.word2idx = created_dict[0]
            self.idx2word = created_dict[1]

        if self.opt.CREATE_EMB:
            print ('Creating Embedding...;')
            self.create_embedding()

        self.ntoken()

    def ntoken(self):
        """Store, print and return the number of entries in ``word2idx``."""
        self.ntokens = len(self.word2idx)
        print ('Number of Tokens:',self.ntokens)
        return self.ntokens

    def __len__(self):
        return len(self.word2idx)

class Wraped_Data(Base_Op):
    """Mini-batch iterator over one cross-validation split of a single task.

    ``split_data`` maps fold indices (as strings) to lists of records with
    the keys ``'sent'``, ``'label'`` and ``'bert_token'``.  In ``'training'``
    mode every fold except ``test_num`` is used; in any other mode only fold
    ``test_num`` is used.  ``next_batch`` returns the padded tensors of one
    batch and, for the modes ``'training'``, ``'test'`` and ``'val'``, wraps
    around to the first batch after the last one (reshuffling the entries in
    ``'training'`` mode).
    """

    # Fixed width of the BERT token / mask tensors produced by next_batch.
    BERT_LENGTH=64

    def __init__(self,opt,dictionary,split_data,test_num,mode='training',source='dt'):
        #hate for hate speech detection dataset and lingo for hatelingo
        super(Wraped_Data,self).__init__()
        # Keep the options object handed in by the caller: every other setting
        # below (SEED, BATCH_SIZE, LENGTH) is read from it, so CROSS_VAL must
        # come from the same object rather than from a fresh parse.
        self.opt=opt
        random.seed(opt.SEED)
        self.dictionary=dictionary
        self.split_data=split_data
        self.test_num=test_num
        self.mode=mode
        self.source=source
        # Number of label columns per task; unknown sources default to 2.
        self.class_size_dict ={
                'wz':3,'dt':3,'founta':4,'hatelingo':5,
                'offenseval_c':3,'semeval_a':11
                }
        self.batch_size=opt.BATCH_SIZE
        self.classes = self.class_size_dict.get(source, 2)
        #loading the data: later used for batch iteration
        self.entries=self.load_tr_val_entries()
        self.num_iters=int(math.ceil(len(self.entries) * 1.0 / self.batch_size))
        # Size of the final, shorter batch (0 when the data divides evenly).
        self.last_batch=len(self.entries) % self.batch_size
        self.cur_iter=0
        print(mode)
        print("Task name {} Batch size {} Class size {}".format(self.source, self.batch_size, self.classes))
        print('The length of all entries is:',len(self.entries))
        print ('Information about batch loader: number of iteration:',self.num_iters,'number of last batch:',self.last_batch)

        # Overwritten by next_batch with the longest sentence of each batch.
        self.length=opt.LENGTH

    def load_tr_val_entries(self):
        """Collect the records of the folds selected by ``mode`` and shuffle them.

        Returns:
            list of dicts with keys ``'sent'``, ``'answer'`` (the record's
            ``'label'``) and ``'bert_token'``.
        """
        if self.mode=='training':
            folds=[i for i in range(self.opt.CROSS_VAL) if i!=self.test_num]
        else:
            folds=[self.test_num]
        entries=[]
        for fold in folds:
            for info in self.split_data[str(fold)]:
                entries.append({
                    'sent':info['sent'],
                    'answer':info['label'],
                    'bert_token':info['bert_token']
                    })
        #shuffle the dataset
        random.shuffle(entries)
        return entries

    @staticmethod
    def _pad_or_truncate(tokens,length):
        """Return ``tokens`` right-padded with 0 or cut to exactly ``length`` items."""
        if len(tokens)<length:
            return tokens+[0]*(length-len(tokens))
        return tokens[:length]

    def padding_bert(self,tokens,length):
        """Pad/truncate a list of BERT token ids to ``length`` (pad value 0)."""
        return self._pad_or_truncate(tokens,length)

    def padding_sent(self,tokens,length):
        """Pad/truncate a list of dictionary token ids to ``length`` (pad value 0)."""
        return self._pad_or_truncate(tokens,length)

    def get_masks(self,tokens,length):
        """Return a 0/1 list of ``length`` items: 1 for real tokens, 0 for padding."""
        masks = [1]*(len(tokens)) + [0]*(length-len(tokens))
        masks = masks[:length]
        return masks

    def next_batch(self):
        """Return the next batch as a dict of tensors, or ``None`` if it is empty.

        Keys of the returned dict:
            ``'tokens'``      int64 tensor (batch, longest sentence in batch)
            ``'label'``       int64 tensor (batch, self.classes)
            ``'bert_tokens'`` int64 tensor (batch, BERT_LENGTH)
            ``'words'``       list of word lists padded with ``'PAD'``
            ``'masks'``       int64 tensor (batch, BERT_LENGTH), 1 where the BERT id is > 0
            ``'att_masks'``   int64 tensor (batch, longest sentence in batch)

        Entries inside a batch are ordered by descending whitespace word count.
        """
        batch_info={}
        start=self.cur_iter*self.batch_size
        # Slicing past the end truncates, so this also yields the short final
        # batch; it is taken before any reshuffle of self.entries below.
        cur_entry=self.entries[start:start+self.batch_size]
        if self.cur_iter==self.num_iters-1 and self.mode in ('training','test','val'):
            # Wrap around: the increment at the end brings this back to 0.
            self.cur_iter=-1
            if self.mode=='training':
                random.shuffle(self.entries)
        if len(cur_entry)==0:
            return None

        cur_entry = sorted(cur_entry, key=lambda k:len(k['sent'].split()), reverse=True)
        self.length = len(cur_entry[0]['sent'].split())
        batch_size=len(cur_entry)
        batch_tokens=np.zeros([batch_size,self.length],dtype=np.int64)
        batch_label=np.zeros([batch_size,self.classes],dtype=np.int64)
        batch_bert=np.zeros([batch_size,self.BERT_LENGTH],dtype=np.int64)
        batch_masks=np.zeros([batch_size,self.BERT_LENGTH],dtype=np.int64)
        batch_att_masks = np.zeros([batch_size, self.length], dtype=np.int64)
        batch_words=[]
        for k,chunck in enumerate(cur_entry):
            #get batch tokens
            sent=chunck['sent']
            tokens=self.dictionary.get_tokens(sent)
            bert_pad=self.padding_bert(chunck['bert_token'],self.BERT_LENGTH)
            batch_tokens[k,:]=np.asarray(self.padding_sent(tokens,self.length),dtype=np.int64)
            batch_bert[k,:]=np.asarray(bert_pad,dtype=np.int64)
            # BERT padding id is 0, so any positive id marks a real token.
            batch_masks[k,:]=np.asarray([int(num>0) for num in bert_pad],dtype=np.int64)
            batch_att_masks[k,:]=np.asarray(self.get_masks(tokens,self.length),dtype=np.int64)

            words=self.dictionary.get_words(sent)
            words = words + ['PAD']*(self.length-len(words))
            batch_words.append(words[:self.length])

            #get batch labels
            label=chunck['answer']
            if self.source == 'semeval_a':
                # Label is stored as a full vector for this task.
                target=np.array(label)
            else:
                # One-hot encode the class index.
                target=np.zeros((self.classes),dtype=np.float32)
                target[label]=1.0
            batch_label[k,:]=target

        batch_info['tokens']=torch.from_numpy(batch_tokens)
        batch_info['label']=torch.from_numpy(batch_label)
        batch_info['bert_tokens']=torch.from_numpy(batch_bert)
        batch_info['words']=batch_words
        batch_info['masks']=torch.from_numpy(batch_masks)
        batch_info['att_masks']=torch.from_numpy(batch_att_masks)
        self.cur_iter+=1
        return batch_info

    def __len__(self):
        """Number of batches in one pass over the entries."""
        return self.num_iters


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence

from torch.autograd import Variable
import torch.nn.functional as F
import copy 
from transformers import BertForSequenceClassification,BertConfig

def clones(module,N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies=nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

class LSTM_Update(nn.Module):
    """One LSTM step whose candidate cell value is supplied by the caller.

    ``forward`` computes input, forget and output gates from the current
    word, previous hidden state and previous cell state, then mixes the
    previous cell state with the externally computed candidate ``c_hate``.
    """

    def __init__(self,in_dim,hidden_dim,dropout):
        super().__init__()
        self.in_dim,self.hidden_dim=in_dim,hidden_dim
        self.dropout=nn.Dropout(dropout)

        # forward() only uses indices 0-2 (input, forget, output gate).
        self.in_proj=clones(nn.Linear(in_dim,hidden_dim),4)
        self.hidden_proj=clones(nn.Linear(hidden_dim,hidden_dim),4)
        self.context_proj=clones(nn.Linear(hidden_dim,hidden_dim),3)

    def forward(self,word,hidden,context,c_hate):
        """Return ``(h_t, c_t)`` for one time step."""
        def gate(g):
            return torch.sigmoid(self.in_proj[g](word)+self.hidden_proj[g](hidden)+self.context_proj[g](context))

        in_gate,forget_gate,out_gate=gate(0),gate(1),gate(2)
        new_context=forget_gate * context + in_gate * c_hate
        new_hidden=out_gate * torch.tanh(new_context)
        return new_hidden,new_context
    
class Bi_LSTM_Update(nn.Module):
    """One self-contained LSTM step (gates plus its own candidate cell value).

    Indices 0-2 of the projection lists serve the input, forget and output
    gates; index 3 of ``in_proj`` / ``hidden_proj`` produces the candidate.
    """

    def __init__(self,in_dim,hidden_dim):
        super().__init__()
        self.in_dim,self.hidden_dim=in_dim,hidden_dim

        self.in_proj=clones(nn.Linear(in_dim,hidden_dim),4)
        self.hidden_proj=clones(nn.Linear(hidden_dim,hidden_dim),4)
        self.context_proj=clones(nn.Linear(hidden_dim,hidden_dim),3)

    def forward(self,word,hidden,context):
        """Return ``(h_t, c_t)`` for one time step."""
        def gate(g):
            return torch.sigmoid(self.in_proj[g](word)+self.hidden_proj[g](hidden)+self.context_proj[g](context))

        in_gate,forget_gate,out_gate=gate(0),gate(1),gate(2)
        candidate=torch.tanh(self.in_proj[3](word) + self.hidden_proj[3](hidden))
        new_context=forget_gate * context + in_gate * candidate
        new_hidden=out_gate * torch.tanh(new_context)
        return new_hidden,new_context
    
class Bi_LSTM(nn.Module):
    """Bidirectional LSTM unrolled step by step with two ``Bi_LSTM_Update`` cells.

    ``forward`` returns a tensor of shape ``(batch, seq_len+1, 2*hidden_dim)``.
    The first ``hidden_dim`` features hold the forward states
    ``[h_init, h_1, ..., h_T]``; the remaining features hold the backward
    states, most recent first, with the zero initial state in the last slot.
    """

    def __init__(self,in_dim,hidden_dim,dropout):
        super(Bi_LSTM,self).__init__()
        # NOTE: ``dropout`` is accepted for interface compatibility but unused.
        self.lstms=clones(Bi_LSTM_Update(in_dim,hidden_dim),2)
        self.hidden_dim=hidden_dim

    def forward(self,seq):
        """Run both directions over ``seq`` (batch, seq_len, in_dim)."""
        batch,steps=seq.size(0),seq.size(1)
        # Allocate the zero states next to the input (device and dtype)
        # instead of hard-coding CUDA.
        fwd_h=seq.new_zeros(batch,self.hidden_dim)
        fwd_c=seq.new_zeros(batch,self.hidden_dim)
        bwd_h=seq.new_zeros(batch,self.hidden_dim)
        bwd_c=seq.new_zeros(batch,self.hidden_dim)
        fwd_states=[fwd_h.unsqueeze(1)]
        bwd_states=[bwd_h.unsqueeze(1)]
        for idx in range(steps):
            # Each direction continues from ITS latest state.  Reading list
            # slot 0 for the forward direction would always return the zero
            # initial state and drop the recurrence.
            fwd_h,fwd_c=self.lstms[0](seq[:,idx,:],fwd_h,fwd_c)
            fwd_states.append(fwd_h.unsqueeze(1))
            bwd_h,bwd_c=self.lstms[1](seq[:,-(idx+1),:],bwd_h,bwd_c)
            bwd_states.append(bwd_h.unsqueeze(1))
        forward_h=torch.cat(fwd_states,dim=1)
        # Reverse so the most recent backward state comes first and the zero
        # initial state last (same layout as prepending every new state).
        backward_h=torch.cat(bwd_states[::-1],dim=1)
        final_h=torch.cat((forward_h,backward_h),dim=2)
        return final_h

'''class Coupled_Layer(nn.Module):
    def __init__(self,in_dim,hidden_dim,dropout=0.3):
        super(Coupled_Layer,self).__init__()
        """
        two LSTMs sharing information with each other
        """
        self.lstm=clones(LSTM_Update(in_dim,hidden_dim,dropout),2)
        
        self.proj_word=clones(nn.Linear(in_dim,hidden_dim),2)
        
        self.gate_proj_w=clones(nn.Linear(in_dim,1),2)#W_g
        self.gate_proj_u=clones(nn.Linear(hidden_dim,1),2)#U_g
        
        self.proj_hidden=clones(nn.Linear(hidden_dim,hidden_dim),2)
        
        self.hidden_dim=hidden_dim
        
    def forward(self,seq,num_task=0):
        h0=[Variable(torch.zeros(seq.size()[0],self.hidden_dim).cuda()),Variable(torch.zeros(seq.size()[0],self.hidden_dim).cuda())]
        c0=[Variable(torch.zeros(seq.size()[0],self.hidden_dim).cuda()),Variable(torch.zeros(seq.size()[0],self.hidden_dim).cuda())]
        
        other=1-num_task
        for idx in range(seq.size()[1]):
            word=seq[:,idx,:]
            proj_word=self.proj_word[num_task](word)
            g_own=torch.sigmoid(self.gate_proj_w[num_task](word) + self.gate_proj_u[num_task](h0[num_task]))
            g_other=torch.sigmoid(self.gate_proj_w[num_task](word) + self.gate_proj_u[other](h0[other]))
            c_hate=torch.tanh(proj_word + g_own*self.proj_hidden[num_task](h0[num_task]) + g_other*self.proj_hidden[other](h0[other]))
            h,c=self.lstm[num_task](word,h0[num_task],c0[num_task],c_hate)
            #print(h.shape,c.shape)
            h0[num_task]=h
            c0[num_task]=c
            
            """
           updating the hidden states and context in the other LSTM 
            """
            proj_word=self.proj_word[other](word)
            c_hate=torch.tanh(proj_word + g_own + g_other)
            h,c=self.lstm[other](word,h0[other],c0[other],c_hate)
            h0[other]=h
            c0[other]=c
            
        return h0[num_task],c0[num_task]'''

#extension for the coupling layer
#previously for binary tasks now extend to multiple tasks
class Coupled_Layer(nn.Module):
    """``num_task`` LSTMs that exchange hidden states at every time step.

    For task ``k`` the candidate cell value is
    ``tanh(W_c[k] x + sum_j g_jk * U_c[k,j] h_j)`` with gate
    ``g_jk = sigmoid(W_gc[k] x + U_gc[j] h_j)``; every task's state is then
    advanced by its own ``LSTM_Update`` cell.
    """

    def __init__(self,in_dim,hidden_dim,num_task,dropout=0.3):
        super(Coupled_Layer,self).__init__()
        self.lstm=clones(LSTM_Update(in_dim,hidden_dim,dropout),num_task)

        self.proj_word=clones(nn.Linear(in_dim,hidden_dim),num_task)#W_c

        self.gate_proj_w=clones(nn.Linear(in_dim,1),num_task)#W_gc
        self.gate_proj_u=clones(nn.Linear(hidden_dim,1),num_task)#U_gc

        # One projection per (target task k, source task j) pair, stored at
        # index k*num_task+j.
        self.proj_hidden=clones(nn.Linear(hidden_dim,hidden_dim),num_task*num_task)#U_c

        self.hidden_dim=hidden_dim
        self.tasks=num_task

    def forward(self,seq,num_task=0):
        """Run all coupled LSTMs over ``seq`` and return task ``num_task``'s state.

        Args:
            seq: tensor indexed as ``seq[:, t, :]`` per time step.
            num_task: index of the task whose final state is returned.

        Returns:
            ``(h, c)``: final hidden and cell state of task ``num_task``.
        """
        batch=seq.size(0)
        # Zero states are created next to the input instead of hard-coding CUDA.
        h0=[seq.new_zeros(batch,self.hidden_dim) for _ in range(self.tasks)]
        c0=[seq.new_zeros(batch,self.hidden_dim) for _ in range(self.tasks)]
        for idx in range(seq.size(1)):
            word=seq[:,idx,:]
            # Fresh lists every step: all tasks must read the states of the
            # PREVIOUS step.  Reusing one list for both old and new states
            # lets later tasks see states already updated in this step.
            h_latest=[]
            c_latest=[]
            for k in range(self.tasks):
                total=self.proj_word[k](word)
                word_gate=self.gate_proj_w[k](word)
                for j in range(self.tasks):
                    g_own=torch.sigmoid(word_gate + self.gate_proj_u[j](h0[j]))#g_jk
                    total=total+g_own * self.proj_hidden[k*self.tasks+j](h0[j])
                c_hate=torch.tanh(total)
                # Each task is advanced by its own cell (index k).
                h,c=self.lstm[k](word,h0[k],c0[k],c_hate)
                h_latest.append(h)
                c_latest.append(c)
            h0=h_latest
            c0=c_latest
        return h0[num_task],c0[num_task]

class Local_Layer(nn.Module):
    """Coupled multi-task LSTM with an additional local-feature term ``LF``.

    The layer is unfinished: the computation of ``LF`` was never written.
    ``forward`` therefore obtains it from ``_local_feature``, which raises
    ``NotImplementedError`` until it is implemented or overridden.
    """

    def __init__(self,in_dim,hidden_dim,num_task,dropout=0.3):
        # Must name this class; naming a different class here makes every
        # construction fail with a TypeError.
        super(Local_Layer,self).__init__()
        self.lstm=clones(LSTM_Update(in_dim,hidden_dim,dropout),num_task)

        self.proj_first=clones(nn.Linear(in_dim,hidden_dim),num_task)
        self.proj_word=clones(nn.Linear(in_dim,hidden_dim),num_task)#W_c

        self.gate_proj_w=clones(nn.Linear(in_dim,1),num_task)#W_gc
        self.gate_proj_u=clones(nn.Linear(hidden_dim,1),num_task)#U_gc

        # Three distinct projection sets.  Binding all of them to one
        # attribute name would leave only the last (num_task entries) alive,
        # while forward() indexes proj_hidden with k*num_task+j.
        # NOTE(review): U_cc and U_gf are presumably meant for the missing LF
        # term; nothing uses them yet -- confirm names and shapes.
        self.proj_hidden_cc=clones(nn.Linear(hidden_dim,hidden_dim),num_task*num_task)#U_cc
        self.proj_hidden=clones(nn.Linear(hidden_dim,hidden_dim),num_task*num_task)#U_c
        self.proj_gate_f=clones(nn.Linear(hidden_dim,hidden_dim),num_task)#U_gf
        self.hidden_dim=hidden_dim
        self.tasks=num_task

    def _local_feature(self,k,word,hidden_states):
        """Return the local-feature term ``LF`` for task ``k`` (not implemented).

        Args:
            k: index of the task being updated.
            word: input slice of the current time step.
            hidden_states: previous-step hidden states of all tasks.

        Raises:
            NotImplementedError: always, until the computation is supplied.
        """
        raise NotImplementedError(
            'Local_Layer: the local-feature term LF is not defined; '
            'implement _local_feature() before using this layer')

    def forward(self,seq,num_task=0):
        """Run the coupled LSTMs over ``seq``; return task ``num_task``'s ``(h, c)``.

        Raises:
            NotImplementedError: from ``_local_feature`` (see class docstring).
        """
        batch=seq.size(0)
        # Zero states are created next to the input instead of hard-coding CUDA.
        h0=[seq.new_zeros(batch,self.hidden_dim) for _ in range(self.tasks)]
        c0=[seq.new_zeros(batch,self.hidden_dim) for _ in range(self.tasks)]
        for idx in range(seq.size(1)):
            word=seq[:,idx,:]
            # Fresh lists every step so all tasks read previous-step states.
            h_latest=[]
            c_latest=[]
            for k in range(self.tasks):
                total=self.proj_word[k](word)
                word_gate=self.gate_proj_w[k](word)
                for j in range(self.tasks):
                    g_own=torch.sigmoid(word_gate + self.gate_proj_u[j](h0[j]))#g_jk
                    total=total+g_own * self.proj_hidden[k*self.tasks+j](h0[j])
                c_t=torch.tanh(total)
                proj_x=self.proj_first[k](word)
                #computation of LF
                LF=self._local_feature(k,word,h0)
                c_hate=torch.tanh(proj_x+c_t+LF)
                # Each task is advanced by its own cell (index k).
                h,c=self.lstm[k](word,h0[k],c0[k],c_hate)
                h_latest.append(h)
                c_latest.append(c)
            h0=h_latest
            c0=c_latest
        return h0[num_task],c0[num_task]
    
class Shared_Layer(nn.Module):
    """Task-specific LSTMs that additionally read from one shared Bi-LSTM.

    At each step the candidate cell value of the selected task combines the
    projected word, the task's own previous hidden state (gated by ``g_own``)
    and the shared Bi-LSTM state at that position (gated by ``g_share``).
    """

    def __init__(self,in_dim,hidden_dim,num_task,gate,dropout=0.3):
        super(Shared_Layer,self).__init__()
        self.tasks=num_task
        # Stored but not used by forward() (the fusion call is disabled).
        self.gate=gate

        self.lstm=clones(LSTM_Update(in_dim,hidden_dim,dropout),num_task)
        self.shared_lstm=Bi_LSTM(in_dim,hidden_dim,dropout)
        # self.shared_bert=BertForSequenceClassification.from_pretrained(
        #     'bert-base-uncased',
        #     num_labels=num_task,
        #     output_attentions=False,
        #     output_hidden_states=True
        # )
        self.proj_word=clones(nn.Linear(in_dim,hidden_dim),num_task)
        self.gate_proj_w=clones(nn.Linear(in_dim,1),num_task)#W_g
        self.gate_proj_u=clones(nn.Linear(hidden_dim,1),num_task)#U_g
        self.proj_hidden=clones(nn.Linear(hidden_dim,hidden_dim),num_task)

        self.shared_hidden_proj=nn.Linear(2*hidden_dim,hidden_dim)
        self.shared_gate_w=clones(nn.Linear(in_dim,1),num_task)#W_g
        self.shared_gate_u=clones(nn.Linear(2*hidden_dim,1),num_task)#U_g

        # Projections for the disabled BERT variant; unused by forward().
        self.bert_proj=nn.Linear(768,2*hidden_dim)
        self.bert_proj1=nn.Linear(768,hidden_dim)
        self.hidden_dim=hidden_dim

    def forward(self,seq,num_task=0):
        """Run task ``num_task``'s LSTM over ``seq`` and return its final ``(h, c)``.

        Args:
            seq: tensor indexed as ``seq[:, t, :]`` per time step.
            num_task: index of the task-specific parameters to use.
        """
        batch=seq.size(0)
        # Only the selected task's state evolves; create it next to the
        # input instead of hard-coding CUDA.
        h=seq.new_zeros(batch,self.hidden_dim)
        c=seq.new_zeros(batch,self.hidden_dim)

        shared_hidden=self.shared_lstm(seq)
        for idx in range(seq.size(1)):
            word=seq[:,idx,:]
            hidden=shared_hidden[:,idx,:]
            proj_word=self.proj_word[num_task](word)
            g_own=torch.sigmoid(self.gate_proj_w[num_task](word) + self.gate_proj_u[num_task](h))
            g_share=torch.sigmoid(self.shared_gate_w[num_task](word) + self.shared_gate_u[num_task](hidden))
            c_hate=torch.tanh(proj_word + g_own*self.proj_hidden[num_task](h) + g_share*self.shared_hidden_proj(hidden))
            h,c=self.lstm[num_task](word,h,c,c_hate)
        #global_fusion=self.gate(h,global_bert)
        global_fusion=h
        return global_fusion,c

    
class Full_RNN(nn.Module):
    """LSTM/GRU wrapper returning all outputs plus the top layer's final state.

    ``forward`` returns ``(output, hidden)`` where ``output`` is the RNN's
    per-step output and ``hidden`` is the final hidden state of the LAST
    layer with shape ``(batch, num_hidden * num_directions)``.
    """

    def __init__(self,in_dim,num_hidden,num_layer,bidirect,dropout,rnn_type='LSTM'):
        super(Full_RNN,self).__init__()
        rnn_cls=nn.LSTM if rnn_type=='LSTM' else nn.GRU
        self.rnn=rnn_cls(in_dim,num_hidden,num_layer,bidirectional=bidirect,dropout=dropout,batch_first=True)
        self.in_dim=in_dim
        self.num_hidden=num_hidden
        self.num_layer=num_layer
        self.rnn_type=rnn_type
        self.num_bidirect=1+int(bidirect)
        self.bidirect=bidirect

    def init_hidden(self,batch):
        """Return zero initial state(s) on the same device/dtype as the weights.

        Returns a ``(h_0, c_0)`` tuple for an LSTM and a single tensor otherwise.
        """
        weight=next(self.parameters()).data
        hid_shape=(self.num_layer * self.num_bidirect,batch,self.num_hidden)
        if self.rnn_type =='LSTM':
            return (weight.new_zeros(hid_shape),weight.new_zeros(hid_shape))
        return weight.new_zeros(hid_shape)

    def forward(self,x):
        """Encode ``x`` (batch, seq_len, in_dim); see the class docstring for the result."""
        batch=x.size(0)
        hidden=self.init_hidden(batch)
        self.rnn.flatten_parameters()
        output,hidden=self.rnn(x,hidden)
        # An LSTM returns (h_n, c_n); a GRU returns h_n directly.
        h_n=hidden[0] if self.rnn_type=='LSTM' else hidden
        # h_n stacks (layer, direction) along dim 0; pick the top layer.
        last=h_n.reshape(self.num_layer,self.num_bidirect,batch,self.num_hidden)[-1]
        if self.bidirect:
            hidden=torch.cat((last[0],last[1]),dim=1)
        else:
            # Index instead of squeeze() so a batch of one keeps its batch axis.
            hidden=last[0]
        return output,hidden

class BiLSTMPacked(nn.Module):
    """LSTM/GRU wrapper for packed sequences.

    ``forward`` takes a ``PackedSequence`` and returns ``(output, hidden)``:
    the re-padded per-step outputs and the final hidden state of the last
    layer with shape ``(batch, num_hidden * num_directions)``.
    """

    def __init__(self, in_dim, num_hidden, num_layer, bidirect, dropout, rnn_type='LSTM'):
        super(BiLSTMPacked, self).__init__()
        rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
        self.rnn = rnn_cls(in_dim, num_hidden, num_layer, bidirectional=bidirect, dropout=dropout, batch_first=True)
        self.num_layer=num_layer
        self.num_hidden=num_hidden
        self.rnn_type=rnn_type
        self.num_bidirect=1+int(bidirect)
        self.bidirect=bidirect

    def init_hidden(self,batch):
        """Return zero initial state(s) on the same device/dtype as the weights.

        Returns a ``(h_0, c_0)`` tuple for an LSTM and a single tensor otherwise.
        Not used by ``forward``, which relies on the RNN's default zero state.
        """
        weight=next(self.parameters()).data
        hid_shape=(self.num_layer * self.num_bidirect,batch,self.num_hidden)
        if self.rnn_type =='LSTM':
            return (weight.new_zeros(hid_shape),weight.new_zeros(hid_shape))
        return weight.new_zeros(hid_shape)

    def forward(self, x):
        """Encode the packed batch ``x``; see the class docstring for the result."""
        self.rnn.flatten_parameters()
        # No explicit initial state: the RNN starts from zeros by default.
        pack_output, hidden = self.rnn(x)
        output, _ = pad_packed_sequence(pack_output, batch_first=True)
        batch = output.size(0)
        # An LSTM returns (h_n, c_n); a GRU returns h_n directly.
        h_n = hidden[0] if self.rnn_type == 'LSTM' else hidden
        last = h_n.view(self.num_layer,self.num_bidirect,batch,self.num_hidden)[-1]
        if self.bidirect:
            hidden = torch.cat((last[0], last[1]), dim=1)
        else:
            # Index instead of squeeze() so a batch of one keeps its batch axis.
            hidden = last[0]
        return output, hidden

class CNN_Model(nn.Module):
    """Text CNN: parallel convolutions over time followed by global max pooling.

    Args:
        in_dim: embedding size of each token.
        filter_size: comma-separated kernel heights, e.g. ``'2,3,4'``.
        num_filter: number of output channels per kernel height.
    """

    def __init__(self,in_dim,filter_size,num_filter):
        super(CNN_Model,self).__init__()
        self.in_dim=in_dim
        filter_sizes=[int(fsz) for fsz in filter_size.split(',')]
        self.conv=nn.ModuleList([nn.Conv2d(1,num_filter,(fsz,in_dim)) for fsz in filter_sizes])
        # Not used by forward(); kept so the module's attributes are unchanged.
        self.pool=nn.MaxPool1d(kernel_size=4, stride=4)

    def forward(self,emb):
        """Return features of shape ``(batch, num_filter * number of kernel heights)``.

        Args:
            emb: tensor of shape (batch, seq_len, in_dim).
        """
        emb=emb.unsqueeze(1)#B,1,L,D
        pooled=[]
        for conv in self.conv:
            feature_map=F.relu(conv(emb))
            # Global max over the whole feature map -> B,F,1,1
            peak=F.max_pool2d(input=feature_map,kernel_size=(feature_map.shape[2],feature_map.shape[3]))
            # flatten(1) drops only the two trailing singleton axes; a bare
            # squeeze() would also drop the batch axis for a batch of one.
            pooled.append(peak.flatten(1))
        final=torch.cat(pooled,1)
        return final
    
class Part_RNN(nn.Module):
    """LSTM/GRU wrapper that returns only the output at the last time step."""

    def __init__(self,in_dim,num_hidden,num_layer,bidirect,dropout,rnn_type='LSTM'):
        super(Part_RNN,self).__init__()
        rnn_cls=nn.LSTM if rnn_type=='LSTM' else nn.GRU
        self.rnn=rnn_cls(in_dim,num_hidden,num_layer,bidirectional=bidirect,dropout=dropout,batch_first=True)
        self.in_dim=in_dim
        self.num_hidden=num_hidden
        self.num_layer=num_layer
        self.rnn_type=rnn_type
        self.num_bidirect=1+int(bidirect)

    def init_hidden(self,batch):
        """Return zero initial state(s) on the same device/dtype as the weights.

        Returns a ``(h_0, c_0)`` tuple for an LSTM and a single tensor otherwise.
        """
        weight=next(self.parameters()).data
        hid_shape=(self.num_layer * self.num_bidirect,batch,self.num_hidden)
        if self.rnn_type =='LSTM':
            return (weight.new_zeros(hid_shape),weight.new_zeros(hid_shape))
        return weight.new_zeros(hid_shape)

    def forward(self,x):
        """Return ``output[:, -1, :]`` of the RNN run over ``x`` (batch, seq_len, in_dim)."""
        batch=x.size(0)
        hidden=self.init_hidden(batch)
        self.rnn.flatten_parameters()
        output,hidden=self.rnn(x,hidden)
        return output[:,-1,:]



In [None]:
import torch
import torch.nn as nn
import numpy as np
import os
import torch.nn.functional as F
from torch.autograd import Variable
import math

opt= parse_opt()
class Word_Embedding(nn.Module):
    """Embedding lookup followed by dropout.

    The table has ``ntoken+1`` rows; the extra row at index ``ntoken`` is the
    padding row.
    NOTE(review): confirm that batches fed to this layer pad with index
    ``ntoken`` -- other indices are treated as ordinary tokens here.
    """

    def __init__(self,ntoken,emb_dim,dropout):
        super().__init__()
        self.emb=nn.Embedding(num_embeddings=ntoken+1,embedding_dim=emb_dim,padding_idx=ntoken)
        self.dropout=nn.Dropout(p=dropout)
        self.ntoken,self.emb_dim=ntoken,emb_dim
        # Task names come from the module-level options object.
        self.tasks=opt.TASKS.split(',')

    def init_embedding(self):
        """Copy the saved GloVe matrix of the first task into rows ``0..ntoken-1``."""
        print ('Initializing glove Embedding...')

        weight_path=os.path.join(self.tasks[0],'glove_embedding.npy')
        pretrained=torch.from_numpy(np.load(weight_path))
        # The padding row (index ntoken) is left untouched.
        self.emb.weight.data[:self.ntoken]=pretrained

    def forward(self,x):
        """Return the dropped-out embeddings of the index tensor ``x``."""
        return self.dropout(self.emb(x))

In [None]:
import torch.nn as nn
from torch.nn.utils.weight_norm import weight_norm

class SimpleClassifier(nn.Module):
    """Two-layer classifier head: weight-normed Linear -> ReLU -> Dropout -> weight-normed Linear."""

    def __init__(self,in_dim,hid_dim,out_dim,dropout):
        super().__init__()
        hidden_layer=weight_norm(nn.Linear(in_dim,hid_dim),dim=None)
        output_layer=weight_norm(nn.Linear(hid_dim,out_dim),dim=None)
        self.main=nn.Sequential(
            hidden_layer,
            nn.ReLU(),
            nn.Dropout(dropout,inplace=True),
            output_layer,
        )

    def forward(self,x):
        """Return the unnormalised class scores for ``x``."""
        return self.main(x)
    
    
class SingleClassifier(nn.Module):
    """Single-layer classifier head: weight-normed Linear followed by in-place Dropout."""

    def __init__(self,in_dim,out_dim,dropout):
        super().__init__()
        projection=weight_norm(nn.Linear(in_dim,out_dim),dim=None)
        self.main=nn.Sequential(projection,nn.Dropout(dropout,inplace=True))

    def forward(self,x):
        """Return the (dropped-out) class scores for ``x``."""
        return self.main(x)

class FC(nn.Module):
    """Plain fully connected layer followed by dropout."""

    # The constructor must be spelled ``__init__``; with single underscores
    # it is never run and ``FC(in_dim,out_dim,dropout)`` raises a TypeError.
    def __init__(self,in_dim,out_dim,dropout):
        super(FC,self).__init__()
        self.main = nn.Linear(in_dim,out_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``dropout(linear(x))``."""
        logits = self.main(x)
        logits = self.drop(logits)
        return logits

In [None]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import h5py
import pickle as pkl
import json
import random
from transformers import get_linear_schedule_with_warmup,AdamW
from torch.utils.data import DataLoader
from sklearn.utils.multiclass import type_of_target
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score,classification_report,precision_recall_fscore_support,roc_auc_score


def log_hyperpara(logger,opt):
    """Write every attribute of ``opt`` to ``logger`` as a ``name : value`` line."""
    for name,value in vars(opt).items():
        logger.write(' : '.join((name,str(value))))

def bce_for_loss(logits,labels):
    """Mean binary cross-entropy between raw ``logits`` and target ``labels``."""
    return F.binary_cross_entropy_with_logits(logits,labels)

def compute_auc(logits,labels):
    """Weighted-average ROC AUC of the scores ``logits`` against ``labels``."""
    y_true=labels.cpu().numpy()
    y_score=logits.cpu().numpy()
    return roc_auc_score(y_true,y_score,average='weighted')

def compute_score(logits,labels):
    """Number of rows whose arg-max prediction equals the arg-max label, as a float tensor."""
    predicted=logits.max(dim=1).indices
    expected=labels.max(dim=1).indices
    return predicted.eq(expected).sum().float()

def compute_other(logits,labels,source=None):
    """Compute weighted and first-class precision/recall/F1, all in percent.

    Args:
        logits: tensor of per-class scores, one row per example.
        labels: tensor of per-class targets, one row per example.
        source: unused; kept so existing callers keep working.

    Returns:
        list ``[precision, recall, f1, h_p, h_r, h_f]``.  The first three are
        weighted averages over the classes present in the gold labels; the
        last three belong to the first class (smallest label id occurring in
        the gold labels or the predictions).
    """
    pred=np.argmax(logits.cpu().numpy(),axis=1)
    gold=np.argmax(labels.cpu().numpy(),axis=1)
    present=np.unique(gold)

    f1=f1_score(gold,pred,average='weighted',labels=present)
    recall=recall_score(gold,pred,average='weighted',labels=present)
    precision=precision_score(gold,pred,average='weighted',labels=present)

    # Per-class scores straight from the metric function instead of parsing
    # them out of the text of classification_report().
    per_p,per_r,per_f,_=precision_recall_fscore_support(gold,pred,average=None)
    # Rounded to two decimals so the values match what the report printed.
    h_p,h_r,h_f=[round(float(scores[0]),2) for scores in (per_p,per_r,per_f)]

    return [precision*100,recall*100,f1*100,h_p*100,h_r*100,h_f*100]


def setup_optimizer(model,mtl_loss):
    """Build an AdamW optimizer with separate learning rates per parameter group.

    Parameters whose name starts with ``s_rnn`` or ``proj`` get a small
    learning rate (2e-5 for the first 100 of them, 5e-5 for the rest); all
    other model parameters and the parameters of ``mtl_loss`` use 1e-3.
    """
    slow_prefixes=("s_rnn","proj")
    bert_params_value=[]
    other_params_value=[]
    for name,value in model.named_parameters():
        if name.startswith(slow_prefixes):
            bert_params_value.append(value)
        else:
            other_params_value.append(value)
    other_params_value.extend(value for _,value in mtl_loss.named_parameters())

    param_groups=[
        {"params":bert_params_value[:100],"lr":2e-5},
        {"params":bert_params_value[100:],"lr":5e-5},
        {"params":other_params_value},
    ]
    return torch.optim.AdamW(param_groups,lr=1e-3,eps=1e-9)

def freeze(model):
    """Disable gradients for every parameter whose name starts with ``s_rnn`` or ``proj``."""
    frozen_prefixes=("s_rnn","proj")
    for name,value in model.named_parameters():
        if name.startswith(frozen_prefixes):
            value.requires_grad = False


class MultiTaskLoss(nn.Module):
    """Per-task BCE loss scaled by a learnable factor ``exp(-log_var[task])``.

    Args:
        tasks: sequence of task names; one log-variance is learned per task.
    """

    def __init__(self, tasks):
        super(MultiTaskLoss, self).__init__()
        self.tasks = tasks
        # Same placement as before on CUDA machines, but fall back to the CPU
        # instead of failing when no GPU is available.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.log_vars = nn.Parameter(torch.zeros(len(self.tasks), device=device), requires_grad=True)

    def forward(self, preds, targets, task_idx):
        """Return ``exp(-log_vars[task_idx]) * BCE(preds, targets)``.

        NOTE(review): no ``+ log_vars[task_idx]`` term is added, so nothing in
        this loss keeps the log-variances from growing -- confirm intended.
        """
        loss = bce_for_loss(preds, targets)
        precision_0 = torch.exp(-self.log_vars[task_idx])
        loss_ = precision_0 * loss
        return loss_

def train_for_deep(opt,model,total_train,total_test,total_val):
    """Jointly train ``model`` on every task in ``opt.TASKS``, evaluating each epoch.

    Every epoch runs ``sum(num_iters)`` optimisation steps. Each step draws one
    task uniformly at random among the tasks that have not yet used up their
    own ``num_iters`` batches for this epoch. After the epoch, every task is
    evaluated on its test set; the primary task (index 0) is additionally
    scored on ``total_val`` and its test result is remembered whenever the
    validation F1 is at least as high as the best seen so far.

    Args:
        opt: options object; reads TASKS, EPOCHS, MODEL, SAVE_NUM and the log
            directory for the primary task (DT, WZ_RESULT or FOUNTA_RESULT).
        model: network called as
            ``model(tokens, task_idx, bert_tokens, masks, att_masks)``.
        total_train: per-task training sets exposing ``num_iters`` and
            ``next_batch()``.
        total_test: per-task test sets, indexed like ``total_train``.
        total_val: validation set of the primary task.

    Returns:
        The per-task test results of the last epoch, with entry 0 replaced by
        the primary-task test result from the best validation epoch.

    Raises:
        ValueError: if the primary task is not one of 'dt', 'wz', 'founta'
            (previously this surfaced as an unbound ``logger``).
    """
    tasks=opt.TASKS.split(',')
    total_iters=[total_train[i].num_iters for i in range(len(tasks))]
    max_iters=sum(total_iters)
    if tasks[0]=='dt':
        logger=utils.Logger(os.path.join(opt.DT,'log'+str(opt.SAVE_NUM)+'.txt'))
    elif tasks[0]=='wz':
        logger=utils.Logger(os.path.join(opt.WZ_RESULT,'log'+str(opt.SAVE_NUM)+'.txt'))
    elif tasks[0]=='founta':
        logger=utils.Logger(os.path.join(opt.FOUNTA_RESULT,'log'+str(opt.SAVE_NUM)+'.txt'))
    else:
        raise ValueError('No log directory configured for primary task: {}'.format(tasks[0]))
    log_hyperpara(logger,opt)

    mtl_loss=MultiTaskLoss(tasks)

    # All tasks share the same number of training epochs.
    # The BERT-based models need the grouped learning rates of setup_optimizer().
    if opt.MODEL in ['angrybert','angrybert-attn','shared-bert']:
        optimizer = setup_optimizer(model, mtl_loss)
    else:
        optimizer= torch.optim.AdamW(list(model.parameters())+list(mtl_loss.parameters()),lr=1e-3,eps=1e-8)

    scheduler=get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps=0,
                                              num_training_steps=max_iters*opt.EPOCHS
                                             )
    best_on_epoch = None
    f1_max = 0
    for epoch in range(opt.EPOCHS):
        total_loss = [0.0 for _ in range(len(tasks))]
        cur_iters = [0 for _ in range(len(tasks))]
        # Maps task name -> task index for tasks that still have batches left.
        cur_tasks = {task: i for i, task in enumerate(tasks)}
        for iters in range(max_iters):
            # Choose a task among the remaining tasks.
            choices=[cur_tasks[name] for name in cur_tasks.keys()]
            task_idx=random.choice(choices)
            batch_info=total_train[task_idx].next_batch()
            tokens=batch_info['tokens'].cuda()
            bert_tokens=batch_info['bert_tokens'].cuda()
            labels=batch_info['label'].float().cuda()
            masks=batch_info['masks'].cuda()
            att_masks=batch_info['att_masks'].cuda()
            pred,_=model(tokens,task_idx,bert_tokens,masks,att_masks)
            loss=mtl_loss(pred,labels,task_idx)
            # Accumulate a Python float: adding the tensor itself would keep
            # every step's autograd history referenced until the epoch ends.
            total_loss[task_idx]+=loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(),1.0)
            optimizer.step()
            scheduler.step()#updating the learning rate
            optimizer.zero_grad()

            cur_iters[task_idx]+=1
            upper_bound = total_iters[task_idx]
            if cur_iters[task_idx]>=upper_bound:
                # This task has consumed its share of the epoch.
                cur_tasks.pop(tasks[task_idx])

        logger.write('epoch %d' %(epoch))
        ttl_loss=np.sum(total_loss)
        logger.write('\t multi task train_loss: %.2f' % (ttl_loss))
        total=[]
        print ('Evaluating on each task')
        model.train(False)
        for j,task in enumerate(tasks):
            print(j, task)
            cur_total=evaluate_for_offensive(opt,model,total_test[j],j)
            total.append(cur_total)
            if j == 0:
                # Model selection uses the validation F1 of the primary task;
                # ties go to the later epoch because of the >= comparison.
                eval_score=evaluate_for_offensive(opt,model,total_val,j)
                f1_cur = eval_score[2]

                if f1_cur >= f1_max:
                    f1_max = f1_cur
                    best_on_epoch = cur_total

                    print('\tVALIDATION task %s precision: %.2f recall: %.2f f1: %.2f' % (task, eval_score[0], eval_score[1], eval_score[2]))
            if task in ['dt','founta']:
                logger.write('\teval task %s precision: %.2f recall: %.2f f1: %.2f' % (task, cur_total[0], cur_total[1], cur_total[2]))
                logger.write('\teval task %s hate precision: %.2f recall: %.2f f1: %.2f\n' % (task, cur_total[3],cur_total[4],cur_total[5]))
            elif task == 'semeval_a':
                logger.write('\teval task %s roc auc score for multi label classification: %.2f \n' % (task,  cur_total[0]))
            else:
                logger.write('\teval task %s precision: %.2f recall: %.2f f1: %.2f' % (task, cur_total[0], cur_total[1], cur_total[2]))
        model.train(True)
    # Report the primary task at its best validation epoch, not the last one.
    total[0] = best_on_epoch
    return total

def evaluate_for_offensive(opt,model,test_set,task_idx):
    """Run ``model`` over every batch of ``test_set`` and score the predictions.

    Args:
        opt: options object; ``opt.TASKS`` is split on ',' to resolve the task name.
        model: network called as
            ``model(tokens, task_idx, bert_tokens, masks, att_masks)``.
        test_set: dataset exposing ``len()`` (number of batches) and ``next_batch()``.
        task_idx: index of the task inside ``opt.TASKS``.

    Returns:
        The list produced by ``compute_other``; for task ``semeval_a`` entry 0
        is replaced by ``compute_auc(...) * 100``.
    """
    task=opt.TASKS.split(',')[task_idx]
    num_batches = len(test_set)
    # Collect per-batch tensors and concatenate once at the end; calling
    # torch.cat inside the loop re-copies all previous batches every step.
    label_batches = []
    pred_batches = []
    for _ in range(num_batches):
        with torch.no_grad():
            batch_info=test_set.next_batch()
            tokens=batch_info['tokens'].cuda()
            bert_tokens=batch_info['bert_tokens'].cuda()
            labels=batch_info['label'].float().cuda()
            masks=batch_info['masks'].cuda()
            att_masks=batch_info['att_masks'].cuda()
            pred,_=model(tokens,task_idx,bert_tokens,masks,att_masks)
        label_batches.append(labels)
        pred_batches.append(pred)

    t1_labels=torch.cat(label_batches,0)
    t1_pred=torch.cat(pred_batches,0)
    total=compute_other(t1_pred,t1_labels,task)
    if task == "semeval_a":
        total[0]=compute_auc(t1_pred,t1_labels)*100
    return total


def analysis_dump(opt,model,test_set,idx):
    """Dump every task head's predictions on ``test_set`` to a JSON file.

    For each batch the words and labels are recorded once, then the model is
    run once per task index and the predictions are stored under that task's
    name. The per-task "weights" lists are created but never filled here.
    The result is written via ``write_json(result, tasks[0], idx)``.
    """
    tasks = opt.TASKS.split(',')
    num_batches = len(test_set)

    result = {}
    for task_name in tasks:
        result[task_name] = {"weights":[], "preds":[]}
        result["words"] = []
        result["labels"] = []

    for _ in range(num_batches):
        with torch.no_grad():
            batch = test_set.next_batch()
            tokens = batch['tokens'].cuda()
            labels = batch['label'].float().cuda()
            bert_tokens = batch['bert_tokens'].cuda()
            masks = batch['masks'].cuda()
            att_masks = batch['att_masks'].cuda()
            result["words"].extend(batch['words'])
            result["labels"].extend(labels.cpu().numpy())
            for task_idx, task_name in enumerate(tasks):
                task_pred, _ = model(tokens, task_idx, bert_tokens, masks, att_masks)
                result[task_name]["preds"].extend(task_pred.cpu().numpy())

    write_json(result, tasks[0],idx)

def write_json(result, source, idx):
    """Write ``result`` as JSON to ``weights/<source><idx>.mtddn.analysis.json``.

    NumPy scalars and arrays inside ``result`` are serialized through ``NpEncoder``.
    """
    file_tag = source+str(idx)
    out_path = "weights/{}.mtddn.analysis.json".format(file_tag)
    with open(out_path, "w") as out_file:
        json.dump(result, out_file, cls=NpEncoder)


In [None]:
import torch.nn as nn
from torch.nn.utils.weight_norm import weight_norm

class FCNet(nn.Module):
    """Weight-normalized linear layer followed by dropout."""

    def __init__(self,in_dim,out_dim,dropout):
        super(FCNet,self).__init__()
        self.in_dim=in_dim
        self.out_dim=out_dim
        # Registered for completeness; forward() does not apply it.
        self.relu=nn.ReLU()
        base_linear=nn.Linear(in_dim,out_dim)
        self.linear=weight_norm(base_linear,dim=None)
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        """Return ``dropout(linear(x))``."""
        projected=self.linear(x)
        return self.dropout(projected)

In [None]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class Attention(nn.Module):
    """Additive attention of one query vector over a sequence of value vectors.

    Values and query are projected to ``opt.PROJ_DIM``, summed, passed through
    ReLU and reduced to one score per sequence position; the softmax of those
    scores weights the original values.
    """

    def __init__(self,opt):
        super(Attention,self).__init__()
        self.opt=opt
        self.v_proj=FCNet(self.opt.NUM_HIDDEN,self.opt.PROJ_DIM,self.opt.FC_DROPOUT)
        self.q_proj=FCNet(self.opt.NUM_HIDDEN,self.opt.PROJ_DIM,self.opt.FC_DROPOUT)
        self.att=FCNet(self.opt.PROJ_DIM,1,self.opt.FC_DROPOUT)
        # Normalize over the sequence axis explicitly instead of relying on the
        # deprecated implicit-dim behaviour of nn.Softmax().
        self.softmax=nn.Softmax(dim=1)

    def forward(self,v,q):
        """Return the attention-weighted sum of ``v`` over dim 1.

        Args:
            v: values; assumed (batch, seq, NUM_HIDDEN) -- seq is dim 1 because
                the weights are unsqueezed at dim 2 and the sum runs over dim 1.
            q: query; assumed (batch, NUM_HIDDEN), broadcast over dim 1 of ``v``.
        """
        v_proj=self.v_proj(v)
        q_proj=torch.unsqueeze(self.q_proj(q),1)
        vq_proj=F.relu(v_proj +q_proj)
        # Drop only the trailing score axis: a bare squeeze() would also remove
        # a batch or sequence axis of size 1 and break the unsqueeze below.
        proj=torch.squeeze(self.att(vq_proj),-1)
        w_att=torch.unsqueeze(self.softmax(proj),2)
        vatt=v * w_att
        att=torch.sum(vatt,1)
        return att

class SelfAttention(nn.Module):
  """Scaled dot-product attention of a single query vector over a sequence."""

  def __init__(self, query_dim, key_dim, value_dim):
    super(SelfAttention, self).__init__()
    # key_dim and value_dim are accepted but unused; only query_dim sets the scale.
    self.scale = 1. / math.sqrt(query_dim)

  def forward(self, query, keys, values, masks=None):
    """Attend ``query`` over ``keys`` and return the weighted sum of ``values``.

    Shapes are batch-first, as required by the transpose/bmm calls below:
      query  = [BxQ]
      keys   = [BxTxK]  (Q == K, dot-product attention)
      values = [BxTxV]
      masks  = [BxT], non-zero for positions to keep; optional.

    Returns:
      (linear_combination [BxV], attention weights [BxT]).
    """
    query = query.unsqueeze(1) # [BxQ] -> [Bx1xQ]
    keys = keys.transpose(1,2) # [BxTxK] -> [BxKxT]
    energy = torch.bmm(query, keys) # [Bx1xQ]x[BxKxT] -> [Bx1xT]
    if masks is not None:
      # Push masked positions towards -inf before the softmax. The mask used
      # to be applied unconditionally, so the default masks=None raised.
      additive_mask = ((1.0 - masks) * -10000.0).unsqueeze(1)
      energy = energy + additive_mask
    energy = F.softmax(energy * self.scale, dim=2) # scale, normalize
    linear_combination = torch.bmm(energy, values).squeeze(1) #[Bx1xT]x[BxTxV] -> [BxV]
    energy = energy.squeeze(1)
    return linear_combination, energy


class Bilinear_Att(nn.Module):
    """Pairwise attention map between two sequences.

    Both inputs are projected to ``bilinear_dim``; the element-wise product of
    every (position in a, position in b) pair is reduced to one score, and the
    scores are softmax-normalized jointly over all pairs.
    """

    def __init__(self,in_a,in_b,bilinear_dim,dropout):
        super(Bilinear_Att,self).__init__()
        self.proj_a=SingleClassifier(in_a,bilinear_dim,dropout)
        self.proj_b=SingleClassifier(in_b,bilinear_dim,dropout)
        self.proj=SingleClassifier(bilinear_dim,1,dropout)
        self.softmax=nn.Softmax(dim=1)

    def forward(self,a,b):
        """Return weights of shape (B, N_a, N_b) that sum to 1 per batch item."""
        feat_a=self.proj_a(a).transpose(1,2).unsqueeze(3)  # B, D, N_a, 1
        feat_b=self.proj_b(b).transpose(1,2).unsqueeze(2)  # B, D, 1, N_b
        # B, D, N_a, N_b -> B, N_a, N_b, D
        pairwise=torch.matmul(feat_a,feat_b).permute(0,2,3,1)
        scores=torch.squeeze(self.proj(pairwise)) #B* N_a * N_b
        len_a=a.size(1)
        len_b=b.size(1)
        flat_scores=scores.view(-1,len_a*len_b).contiguous()
        return self.softmax(flat_scores).view(-1,len_a,len_b).contiguous()
    
class MFB(nn.Module):
    '''Factorized bilinear fusion of two vectors (no extra non-linear layer).

    ``x`` has ``NUM_HIDDEN`` features and ``y`` has ``NUM_HIDDEN + 5``; both are
    expanded to ``3 * NUM_HIDDEN``, multiplied element-wise, sum-pooled back to
    ``NUM_HIDDEN``, projected, passed through a signed square root and
    L2-normalized.
    '''
    def __init__(self,opt):
        super(MFB,self).__init__()
        hidden=opt.NUM_HIDDEN
        self.proj_x=nn.Linear(hidden,hidden*3)
        self.proj_y=nn.Linear(hidden+5,hidden*3)
        self.dropout=nn.Dropout(opt.FC_DROPOUT)
        self.opt=opt
        self.final_proj=nn.Linear(hidden,hidden)

    def forward(self,x,y):
        '''Return the normalized fused representation of shape (B, NUM_HIDDEN).'''
        fused=self.dropout(self.proj_x(x)) * self.dropout(self.proj_y(y)) #B,3H
        grouped=fused.view(x.shape[0],self.opt.NUM_HIDDEN,-1)
        pooled=self.final_proj(grouped.sum(dim=2))
        # Signed square root: sqrt of the positive part minus sqrt of the negative part.
        positive_root=torch.sqrt(F.relu(pooled))
        negative_root=torch.sqrt(F.relu(-pooled))
        return F.normalize(positive_root-negative_root)

class Intra(nn.Module):
    '''Residual scaled dot-product attention of sequence a over sequence b
    (no extra non-linear layer).'''
    def __init__(self,opt):
        super(Intra,self).__init__()
        self.opt=opt
        self.softmax=nn.Softmax(dim=2)
        # Created here but not used by forward().
        self.proj=nn.Linear(opt.NUM_HIDDEN * 2,opt.NUM_HIDDEN)

    def forward(self,a,b):
        '''Update a with the information it attends to in b and return a + attended b.'''
        scale=math.sqrt(self.opt.NUM_HIDDEN)
        similarity=torch.bmm(a,b.transpose(1,2))/scale # B*dim_a*dim_b
        attended=torch.bmm(self.softmax(similarity),b)
        return attended + a


class Gate_Attention(nn.Module):
    """Learned sigmoid gate that blends two feature vectors.

    ``gate = sigmoid(a @ w1 + b @ w2 + bias)`` and the output is
    ``gate * a + (1 - gate) * b``.
    """

    def __init__(self,num_hidden_a,num_hidden_b,num_hidden):
        super(Gate_Attention,self).__init__()
        self.hidden=num_hidden
        self.w1=nn.Parameter(torch.Tensor(num_hidden_a,num_hidden))
        self.w2=nn.Parameter(torch.Tensor(num_hidden_b,num_hidden))
        self.bias=nn.Parameter(torch.Tensor(num_hidden))
        self.reset_parameter()

    def reset_parameter(self):
        """Fill w1, w2 and bias uniformly in [-1/sqrt(hidden), 1/sqrt(hidden)]."""
        bound=1. / math.sqrt(self.hidden)
        for param in (self.w1,self.w2,self.bias):
            param.data.uniform_(-bound,bound)

    def forward(self,a,b):
        """Return the gated mixture of ``a`` and ``b``."""
        pre_activation=torch.matmul(a,self.w1)+torch.matmul(b,self.w2)+self.bias
        gate=torch.sigmoid(pre_activation)
        return gate * a + (1-gate) * b

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Function
import torch.nn.functional as F

import copy
from transformers import BertForSequenceClassification,BertConfig

def clones(module,N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies=nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

"""
all previous implementations have been removed 
(since our previous operation is for two inputs in the forward function)
if wanna to check 
please refer to previous codes
"""

class Joint_Multi(nn.Module):
    """Joint model with a shared encoder, a private encoder for task 0, and a
    final encoder followed by per-task attention and classifier heads."""

    def __init__(self,shared_emb,private_emb,shared_rnn,private_rnn,gates,fcs,mode,opt,atts=None,rnn_last=None):
        super(Joint_Multi,self).__init__()
        self.w_emb=shared_emb
        self.embs=private_emb
        self.s_rnn=shared_rnn
        self.p_rnn=private_rnn
        self.gates=gates
        self.fcs=fcs
        self.atts = atts
        self.mode=mode
        self.rnn_last = rnn_last
        self.drop = nn.Dropout(0.20)

    def forward(self,text,task_idx=0,bert_tokens=None,masks=None,att_masks=None):
        """Return ``(logits, attention_output)`` for the given task.

        Task 0 mixes the private and shared encoder outputs with fixed weights
        0.1 / 0.9; every other task uses only the shared encoder output.
        ``bert_tokens`` and ``masks`` are accepted for interface compatibility
        and not used. Sequence lengths are the non-zero counts of each
        ``att_masks`` row.
        """
        shared_emb=self.w_emb(text)
        private_emb=self.embs(text)

        # pack_padded_sequence requires a CPU lengths tensor on current PyTorch;
        # moving the lengths to the GPU (as before) is rejected there.
        lens = (att_masks != 0).sum(dim=1).cpu()

        packed_input = pack_padded_sequence(private_emb, lengths=lens, batch_first=True)
        s_packed_input = pack_padded_sequence(shared_emb, lengths=lens, batch_first=True)
        s_out, shared_rnn=self.s_rnn(s_packed_input)

        if task_idx == 0:
            p_out, private_rnn=self.p_rnn(packed_input)
            p_out = self.drop(p_out)
            # Fixed-weight blend of private and shared sequence outputs.
            gated_private_rnn = p_out*0.1 + s_out*0.9
            gated_private_rnn = pack_padded_sequence(gated_private_rnn, lengths=lens, batch_first=True)
            p_out, rnn_o = self.rnn_last(gated_private_rnn)
        else:
            s_out = self.drop(s_out)
            s_out = pack_padded_sequence(s_out, lengths=lens, batch_first=True)
            p_out, rnn_o = self.rnn_last(s_out)
        result, att_weights = self.atts[task_idx](rnn_o, p_out, p_out, att_masks)

        logits=self.fcs[task_idx](result)
        return logits, att_weights


class Deep_Multi(nn.Module):
    """Multi-task model with one shared encoder and per-task private encoders.

    ``mode`` selects how the shared and private representations are built and
    combined:
      * 'gate', 'dnn', 'cnn', 'uniform': encoders read the concatenation of the
        shared and private embeddings.
      * 'basic', 'angrybert', 'angrybert-attn': the private encoder reads the
        private embedding only.
      * 'shared-bert', 'angrybert', 'angrybert-attn': ``s_rnn`` is called on
        ``bert_tokens`` and its output is projected from 768 to
        ``2 * opt.NUM_HIDDEN`` features.
    """

    def __init__(self,shared_emb,private_emb,shared_rnn,private_rnn,gates,fcs,mode,opt,atts=None):
        super(Deep_Multi,self).__init__()
        self.w_emb=shared_emb
        self.embs=private_emb
        self.s_rnn=shared_rnn
        self.rnns=private_rnn
        self.gates=gates
        self.fcs=fcs
        self.atts = atts
        self.proj=nn.Linear(768,opt.NUM_HIDDEN*2)
        self.mode=mode

    @staticmethod
    def _pack(embedding, att_masks):
        """Pack padded embeddings; lengths are the non-zero counts of each mask row."""
        # pack_padded_sequence requires a CPU lengths tensor on current PyTorch;
        # moving the lengths to the GPU (as before) is rejected there.
        lens = (att_masks != 0).sum(dim=1).cpu()
        return pack_padded_sequence(embedding, lengths=lens, batch_first=True)

    def forward(self,text,task_idx=0,bert_tokens=None,masks=None,att_masks=None):
        """Return ``(logits, weights)``; ``weights`` is None except in 'angrybert-attn'."""
        weights = None
        shared_emb=self.w_emb(text)
        private_emb=self.embs[task_idx](text)
        if self.mode in ['gate','dnn','cnn','uniform']:
            embedding=torch.cat((shared_emb,private_emb),dim=2)
        elif self.mode in ['basic','angrybert','angrybert-attn']:
            embedding = private_emb

        if self.mode=='cnn':
            shared_rnn=self.s_rnn(embedding)
            private_rnn=self.rnns[task_idx](embedding)

        elif self.mode == 'shared-bert':
            shared_rnn = self.s_rnn(bert_tokens, token_type_ids=None, attention_mask=masks)
            # Element 1 of the output, last entry, position 0 of every sequence;
            # presumably the final hidden layer at the [CLS] token -- TODO confirm.
            shared_rnn = shared_rnn[1][-1][:, 0, :]

            result = self.proj(shared_rnn)

        elif self.mode=='angrybert':
            shared_rnn=self.s_rnn(bert_tokens,token_type_ids=None,attention_mask=masks)
            shared_rnn=shared_rnn[1][-1][:,0,:]

            shared_rnn=self.proj(shared_rnn)
            shared_rnn=F.relu(shared_rnn)
            packed_input = self._pack(embedding, att_masks)
            _,private_rnn=self.rnns[task_idx](packed_input)

        elif self.mode=='angrybert-attn':
            shared_rnn=self.s_rnn(bert_tokens,token_type_ids=None,attention_mask=masks)
            shared_rnn=shared_rnn[1][-1][:,0,:]

            shared_rnn=self.proj(shared_rnn)
            packed_input = self._pack(embedding, att_masks)
            p_out,private_rnn=self.rnns[task_idx](packed_input)
            p_attn, weights = self.atts[task_idx](private_rnn, p_out, p_out, att_masks)
            result = self.gates[task_idx](shared_rnn, p_attn)
        else:
            packed_input = self._pack(embedding, att_masks)
            _,shared_rnn=self.s_rnn(packed_input)
            _,private_rnn=self.rnns[task_idx](packed_input)

        # 'shared-bert' and 'angrybert-attn' already set ``result`` above.
        if self.mode in ['gate','angrybert']:
            result=self.gates[task_idx](shared_rnn,private_rnn)
        elif self.mode=='basic':
            result=torch.cat((shared_rnn,private_rnn),dim=1)
        elif self.mode in ['dnn','cnn']:
            result=private_rnn
        elif self.mode=='uniform':
            result=shared_rnn
        logits=self.fcs[task_idx](result)
        return logits, weights
    
class Deep_Coupled(nn.Module):
    """Per-task embedding -> shared ``couple`` layer -> per-task classifier."""

    def __init__(self,shared_emb,private_emb,couple,fcs):
        super(Deep_Coupled,self).__init__()
        self.embs=private_emb
        self.w_emb=shared_emb #not used
        self.couple=couple
        self.fcs=fcs

    def forward(self,text,task_idx=0,bert_tokens=None,masks=None,att_masks=None):
        """Return ``(logits, extra)``; ``extra`` is the second output of ``couple``.

        ``bert_tokens``, ``masks`` and ``att_masks`` are accepted for interface
        compatibility and ignored.
        """
        task_emb=self.embs[task_idx](text)
        result,extra=self.couple(task_emb,task_idx)
        return self.fcs[task_idx](result), extra
    
def build_baseline(dataset,opt):
    """
    Build the multi-task model selected by ``opt.MODEL``.

    Supported models:
    basic: shared and private rnns, concatenation of both to the fc layer
    dnn: shared and private embeddings, private rnn
    cnn: shared and private embeddings, private cnn
    uniform: different embeddings, shared rnn, 2016 IJCAI first baseline
    couple / local: 2016 IJCAI coupled layer / shared layer in the paper
    sp-mtl: 2017 IJCAI, for the implementation of global fusion
    mtl-gatedencoder: joint modeling network, Rajamanickam et al. 20
    shared-bert: bert baseline model for multitask shared learning
    angrybert: AngryBERT model, this is our proposed model
    angrybert-attn: AngryBERT model with attention on top, visualization purpose

    Args:
        dataset: dataset whose ``dictionary.ntoken()`` sizes the word embeddings.
        opt: parsed options; when None, ``parse_opt()`` is called instead.

    Returns:
        A ``Deep_Multi``, ``Joint_Multi`` or ``Deep_Coupled`` instance.

    Raises:
        ValueError: if ``opt.MODEL`` is not handled by this builder (previously
            the function fell through and returned None).
    """
    # Honour the options handed in by the caller; they used to be overwritten
    # unconditionally by a fresh parse_opt() call.
    if opt is None:
        opt= parse_opt()
    mode=opt.MODEL
    em_times=1
    fc_times=1
    fin_times = 1

    # Multipliers for the embedding width, gate/attention width and classifier
    # input width that each mode requires.
    if mode in ['gate','dnn','cnn','uniform']:
        em_times=2
    if mode in ['basic', 'angrybert','angrybert-attn', 'shared-bert']:
        fc_times=2

    if mode in ["dnn", "uniform", "angrybert", "angrybert-attn", 'shared-bert', "mtl-gatedencoder"]:
        fin_times = 2
    elif mode  == "basic":
        fin_times = 4

    datasets=opt.TASKS.split(',')
    num_tasks=len(datasets)
    fcs=nn.ModuleList()
    # Number of output classes per task.
    task2dim={'wz':3,'dt':3,'founta':4,'hatelingo':5,'offenseval_c':3,'semeval_a':11}
    final_dim=opt.NUM_HIDDEN

    if opt.MODEL=='cnn':
        # One block of NUM_FILTER features per filter size.
        final_dim=len(opt.FILTER_SIZE.split(','))*opt.NUM_FILTER
    for task in datasets:
        dim=task2dim[task]
        fc=SimpleClassifier(fin_times*final_dim,opt.MID_DIM,dim,opt.FC_DROPOUT).cuda()
        fcs.append(fc)

    if mode=='cnn':
        #this is actually cnn but named as rnn to arrange the code
        shared_rnn=CNN_Model(em_times*opt.EMB_DIM,opt.FILTER_SIZE,opt.NUM_FILTER)
        private_rnn=clones(CNN_Model(em_times*opt.EMB_DIM,opt.FILTER_SIZE,opt.NUM_FILTER),num_tasks)
    else:
        shared_rnn=BiLSTMPacked(em_times*opt.EMB_DIM,opt.NUM_HIDDEN,opt.NUM_LAYER,opt.BIDIRECT,opt.L_RNN_DROPOUT)
        private_rnn=clones(BiLSTMPacked(em_times*opt.EMB_DIM,opt.NUM_HIDDEN,opt.NUM_LAYER,opt.BIDIRECT,opt.L_RNN_DROPOUT),num_tasks)

    shared_emb=Word_Embedding(dataset.dictionary.ntoken(),opt.EMB_DIM,opt.EMB_DROPOUT)
    private_emb=clones(Word_Embedding(dataset.dictionary.ntoken(),opt.EMB_DIM,opt.EMB_DROPOUT),num_tasks)

    if mode in ['uniform', 'cnn', 'dnn', 'basic', 'angrybert','angrybert-attn', 'shared-bert']:
        gates=clones(Gate_Attention(opt.NUM_HIDDEN*fc_times,opt.NUM_HIDDEN*fc_times,opt.NUM_HIDDEN*fc_times).cuda(),num_tasks)
        atts = clones(SelfAttention(fc_times*opt.NUM_HIDDEN,fc_times*opt.NUM_HIDDEN,fc_times*opt.NUM_HIDDEN), num_tasks)
        if mode in ['angrybert','angrybert-attn','shared-bert']:
            # The shared encoder is replaced by pretrained BERT; hidden states
            # are requested because Deep_Multi reads them from the output.
            shared_rnn=BertForSequenceClassification.from_pretrained(
                'bert-base-uncased',
                num_labels=num_tasks,
                output_attentions=False,
                output_hidden_states=True
            )
        return Deep_Multi(shared_emb,private_emb,shared_rnn,private_rnn,gates,fcs,mode,opt,atts)
    elif mode=="mtl-gatedencoder":
        # This model uses a single private embedding / encoder instead of per-task clones.
        private_emb=Word_Embedding(dataset.dictionary.ntoken(),opt.EMB_DIM,opt.EMB_DROPOUT)
        gates=Gate_Attention(opt.NUM_HIDDEN*2,opt.NUM_HIDDEN*2,opt.NUM_HIDDEN*2).cuda()
        atts = clones(SelfAttention(fc_times*opt.NUM_HIDDEN,fc_times*opt.NUM_HIDDEN,fc_times*opt.NUM_HIDDEN), num_tasks)
        private_rnn = BiLSTMPacked(em_times*opt.EMB_DIM,opt.NUM_HIDDEN,opt.NUM_LAYER,opt.BIDIRECT,opt.L_RNN_DROPOUT)
        rnn_last =BiLSTMPacked(2*opt.NUM_HIDDEN,opt.NUM_HIDDEN,opt.NUM_LAYER,opt.BIDIRECT,opt.L_RNN_DROPOUT)
        return Joint_Multi(shared_emb,private_emb,shared_rnn,private_rnn,gates,fcs,mode,opt,atts,rnn_last)

    elif mode=='couple':
        couple= Coupled_Layer(opt.EMB_DIM,opt.NUM_HIDDEN,num_tasks)
        return Deep_Coupled(shared_emb,private_emb,couple,fcs)
    elif mode=='local':
        couple=Local_Layer(opt.EMB_DIM,opt.NUM_HIDDEN,num_tasks)
        return Deep_Coupled(shared_emb,private_emb,couple,fcs)
    elif mode=='sp-mtl':
        gate=Gate_Attention(opt.NUM_HIDDEN*2,opt.NUM_HIDDEN,opt.NUM_HIDDEN)
        couple=Shared_Layer(opt.EMB_DIM,opt.NUM_HIDDEN,num_tasks,gate)
        return Deep_Coupled(shared_emb,private_emb,couple,fcs)

    raise ValueError('Unsupported model type: {}'.format(mode))

In [None]:
import torch
import torch.nn as nn
import numpy as np

import os
import pickle as pkl

if __name__=='__main__':
    # Entry point: trains and evaluates the model once per cross-validation
    # fold, logs the per-fold results and finally the average over all folds.
    opt= parse_opt()
    # torch.cuda.set_device(opt.CUDA_DEVICE)
    # NOTE(review): only torch is seeded here; the task sampling in training
    # uses the `random` module, which is not seeded in this block.
    torch.manual_seed(opt.SEED)

    tasks=opt.TASKS.split(',')
    hate_dataset=tasks[0]
    # result saving
    if hate_dataset=='wz':
        logger=Logger(os.path.join(opt.WZ_RESULT,'final_'+str(opt.SAVE_NUM)+'.txt'))
    elif hate_dataset=='dt':
        logger=Logger(os.path.join(opt.DT,'final_'+str(opt.SAVE_NUM)+'.txt'))
    elif hate_dataset=='founta':
        logger=Logger(os.path.join(opt.FOUNTA_RESULT,'final_'+str(opt.SAVE_NUM)+'.txt'))
    else:
        # Fail before any training instead of hitting an undefined logger later.
        raise ValueError('No result directory configured for primary task: {}'.format(hate_dataset))

    dictionary=Base_Op()
    dictionary.init_dict()
    # NOTE: pickle.load can execute code embedded in the file; only load
    # dataset splits that come from a trusted source.
    with open(os.path.join(opt.SPLIT_DATASET,hate_dataset+'.pkl'),'rb') as dataset_file:
        hate_dataset=pkl.load(dataset_file)

    constructor='build_baseline'
    #definitions for criteria

    # Records both total F1 and hate F1, for a dynamic number of tasks.
    total=[]
    for i in range(opt.CROSS_VAL):
        # Per-task dataset lists; position 0 is the hate speech detection dataset.
        total_train=[]
        total_test=[]
        for task_idx,task in enumerate(tasks):
            print (task_idx,task)
            # Use a context manager so the pickle file handle is closed
            # right after loading instead of being leaked on every fold.
            with open(os.path.join(opt.SPLIT_DATASET,task)+'.pkl','rb') as dataset_file:
                dataset=pkl.load(dataset_file)
            train_set=Wraped_Data(opt,dictionary,dataset,i, source=task)
            test_set=Wraped_Data(opt,dictionary,dataset,i,'test', source=task)
            total_train.append(train_set)
            total_test.append(test_set)
            if task_idx == 0:
                val_set=Wraped_Data(opt,dictionary,dataset,opt.CROSS_VAL,'val', source=task)
        model=build_baseline(total_test[0],opt)
        model=model.cuda()
        model.w_emb.init_embedding()
        #cur_total is the result for each task
        cur_total=train_for_deep(opt,model,total_train,total_test,val_set)

        total.append(cur_total)

        # The hate-class numbers below are not meaningful for WZ (it has two
        # hate classes); they are intended for DT and FOUNTA.
        logger.write('validation folder %d' %(i+1))
        for j,task in enumerate(tasks):
            if task in ['dt','founta']:
                logger.write('\teval task %s precision: %.2f ' % (task, cur_total[j][0]))
                logger.write('\teval task %s recall: %.2f ' % (task, cur_total[j][1]))
                logger.write('\teval task %s f1: %.2f ' % (task, cur_total[j][2]))
                logger.write('\teval task %s hate precision: %.2f ' % (task, cur_total[j][3]))
                logger.write('\teval task %s hate recall: %.2f ' % (task,  cur_total[j][4]))
                logger.write('\teval task %s hate f1: %.2f \n' % (task,  cur_total[j][5]))
            elif task == 'semeval_a':
                logger.write('\teval task %s roc auc score for multi label classification: %.2f \n' % (task,  cur_total[j][0]))
            else:
                logger.write('\teval task %s precision: %.2f ' % (task, cur_total[j][0]))
                logger.write('\teval task %s recall: %.2f ' % (task, cur_total[j][1]))
                logger.write('\teval task %s f1: %.2f \n' % (task, cur_total[j][2]))

    # Sum over folds; each entry is divided by CROSS_VAL when it is logged.
    total=np.sum(total,axis=0)

    logger.write('\n final result')

    for j,task in enumerate(tasks):
        if task in ['dt','founta']:
            logger.write('\teval task %s precision: %.2f ' % (task,total[j][0]/opt.CROSS_VAL))
            logger.write('\teval task %s recall: %.2f ' % (task, total[j][1]/opt.CROSS_VAL))
            logger.write('\teval task %s f1: %.2f ' % (task, total[j][2]/opt.CROSS_VAL))
            logger.write('\teval task %s hate precision: %.2f ' % (task, total[j][3]/opt.CROSS_VAL))
            logger.write('\teval task %s hate recall: %.2f ' % (task, total[j][4]/opt.CROSS_VAL))
            logger.write('\teval task %s hate f1: %.2f \n' % (task, total[j][5]/opt.CROSS_VAL))
        elif task  == 'semeval_a':
            logger.write('\teval task %s roc auc score for multi label classification: %.2f \n' % (task, total[j][0]/opt.CROSS_VAL))
        else:
            logger.write('\teval task %s precision: %.2f ' % (task,total[j][0]/opt.CROSS_VAL))
            logger.write('\teval task %s recall: %.2f ' % (task, total[j][1]/opt.CROSS_VAL))
            logger.write('\teval task %s f1: %.2f ' % (task, total[j][2]/opt.CROSS_VAL))
    exit(0)