In [1]:
import os
import re
import sys
import time
import math
import string
import random
import collections
from collections import Counter,OrderedDict
from copy import deepcopy
import pickle
import numpy as np
import pandas as pd
import sklearn
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import train_test_split,StratifiedKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as tdist
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 0
#torch.autograd.set_detect_anomaly(False)
import nltk
from nltk.corpus import stopwords
stopword = stopwords.words('english')

cuda
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
datapath = 'mtl/'
vecpath = 'glove.840B.300d.txt'

In [0]:
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip
!rm -rf glove.840B.30d.zip

embedding_index = {}
with open(vecpath,'r',encoding='utf-8') as f:
    for line in f.readlines():
      words = line.split(' ')
      word = words[0]
      v = torch.from_numpy(np.asarray([w for w in words[1:]],'float32'))
      embedding_index[word] = v

In [0]:
domainbatchsize = 2
batchsize = 16
fullyhidden1 = 300

In [0]:
domains = os.listdir(datapath)

In [0]:
  def clean_str(sent):
    sent = str(sent)
    sent = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sent)     
    sent = re.sub(r"\'s", " \'s", sent) 
    sent = re.sub(r"\'ve", " \'ve", sent) 
    sent = re.sub(r"n\'t", " n\'t", sent) 
    sent = re.sub(r"\'re", " \'re", sent) 
    sent = re.sub(r"\'d", " \'d", sent) 
    sent = re.sub(r"\'ll", " \'ll", sent) 
    sent = re.sub(r",", " , ", sent) 
    sent = re.sub(r"!", " ! ", sent) 
    sent = re.sub(r"\(", " \( ", sent) 
    sent = re.sub(r"\)", " \) ", sent) 
    sent = re.sub(r"\?", " \? ", sent) 
    sent = re.sub(r"\s{2,}", " ", sent)    
    sent = re.sub(r"[^a-zA-Z]"," ",sent)
    sents = sent.split()
    return " ".join(word.lower() for word in sents if word not in stopword and len(word)>1)

In [0]:
def preprocess(corpus):
    return [clean_str(sentence) for sentence in corpus]

In [0]:
domains_train_corpus = []
domains_train_labels = []
domains_val_corpus = []
domains_val_labels = []
domains_test_corpus = []
domains_test_labels = []
for domain in domains:
    train = pd.read_csv(datapath+domain+'/train.csv')
    domains_train_labels.append(train['label'].astype(int).values)
    domains_train_corpus.append(preprocess(list(train['text'])))
    
    val = pd.read_csv(datapath+domain+'/val.csv')
    domains_val_labels.append(val['label'].astype(int).values)
    domains_val_corpus.append(preprocess(list(val['text'])))

In [0]:
traindomainsize = [len(d) for d in domains_train_corpus]
valdomainsize = [len(d) for d in domains_val_corpus]

In [0]:
for i in range(len(domains)):
    assert(len(domains_train_corpus[i])==len(domains_train_labels[i]))
    assert(len(domains_val_corpus[i])==len(domains_val_labels[i]))
    assert(len(domains_test_corpus[i])==len(domains_test_labels[i]))

In [0]:
maxlen = 300
embeddim = 300
hiddendim = 128

In [0]:
def get_vectors(sentence,vocabulary):
    temp = [vocabulary[word] for word in sentence.split() if word in vocabulary]
    vector = [0] * maxlen
    curlen = len(temp)
    if(maxlen-curlen<0):
        vector = temp[:maxlen]
    else:
        vector[maxlen-curlen:] = temp

    return torch.from_numpy(np.asarray(vector,dtype='int32'))

In [0]:
def get_vocab(data):
    words = []
    for sentence in data:
        words+=sentence.split()

    counts = Counter(words).most_common()
    counts.insert(0,('<PAD>',0))
    vocabulary = {word:i for i,(word,_) in enumerate(counts)}
    return vocabulary

In [0]:
def get_data(data,vocab):
    vectors = torch.zeros(len(data),maxlen)
    for i in range(len(data)):
        vectors[i] = get_vectors(data[i],vocab)
    return vectors

In [0]:
totaldata = []
for d in domains_train_corpus:
    totaldata.extend(d)
vocabulary = get_vocab(totaldata)

embedding_matrix = torch.zeros(len(vocabulary),embeddim).to(device)
for i,word in enumerate(vocabulary):
    vec = embedding_index.get(word)
    if(vec is not None):
        embedding_matrix[i] = vec[:embeddim]

In [17]:
len(vocabulary)

55521

In [0]:
train_vectors = [get_data(d,vocabulary) for d in domains_train_corpus]
val_vectors = [get_data(d,vocabulary) for d in domains_val_corpus]   

In [0]:
trainlabels = []
for i,d in enumerate(domains_train_corpus):
    trainlabels.append([i]*len(d))

vallabels = []
for i,d in enumerate(domains_val_corpus):
    vallabels.append([i]*len(d))

In [0]:
train_domain_indices = []
for i in range(max(traindomainsize)//domainbatchsize):
    for j in range(len(traindomainsize)):
        if((i+1)*domainbatchsize>traindomainsize[j]):
            train_domain_indices.extend(train_vectors[j][i*domainbatchsize:])
        else:
            train_domain_indices.extend(train_vectors[j][i*domainbatchsize:(i+1)*domainbatchsize])
            
train_domain_labels = []
for i in range(max(traindomainsize)//domainbatchsize):
    for j in range(len(traindomainsize)):
        if((i+1)*domainbatchsize>traindomainsize[j]):
            train_domain_labels.extend(trainlabels[j][i*domainbatchsize:])
        else:
            train_domain_labels.extend(trainlabels[j][i*domainbatchsize:(i+1)*domainbatchsize])
            
train_sentiment_labels = []
for i in range(max(traindomainsize)//domainbatchsize):
    for j in range(len(traindomainsize)):
        if((i+1)*domainbatchsize>traindomainsize[j]):
            train_sentiment_labels.extend(domains_train_labels[j][i*domainbatchsize:])
        else:
            train_sentiment_labels.extend(domains_train_labels[j][i*domainbatchsize:(i+1)*domainbatchsize])
            
train_domain_indices = torch.stack([x for x in train_domain_indices])
train_domain_labels = torch.from_numpy(np.asarray(train_domain_labels,'int32'))
train_sentiment_labels = torch.from_numpy(np.asarray(train_sentiment_labels,'int32'))

In [0]:
val_domain_indices = []
for i in range(max(valdomainsize)//domainbatchsize):
    for j in range(len(valdomainsize)):
        if((i+1)*domainbatchsize>valdomainsize[j]):
            val_domain_indices.extend(val_vectors[j][i*domainbatchsize:])
        else:
            val_domain_indices.extend(val_vectors[j][i*domainbatchsize:(i+1)*domainbatchsize])
            
val_domain_labels = []
for i in range(max(valdomainsize)//domainbatchsize):
    for j in range(len(valdomainsize)):
        if((i+1)*domainbatchsize>valdomainsize[j]):
            val_domain_labels.extend(vallabels[j][i*domainbatchsize:])
        else:
            val_domain_labels.extend(vallabels[j][i*domainbatchsize:(i+1)*domainbatchsize])
            
val_sentiment_labels = []
for i in range(max(valdomainsize)//domainbatchsize):
    for j in range(len(valdomainsize)):
        if((i+1)*domainbatchsize>valdomainsize[j]):
            val_sentiment_labels.extend(domains_val_labels[j][i*domainbatchsize:])
        else:
            val_sentiment_labels.extend(domains_val_labels[j][i*domainbatchsize:(i+1)*domainbatchsize])
            
val_domain_indices = torch.stack([x for x in val_domain_indices])
val_domain_labels = torch.from_numpy(np.asarray(val_domain_labels,'int32'))
val_sentiment_labels = torch.from_numpy(np.asarray(val_sentiment_labels,'int32'))

In [22]:
train_domain_indices.size(),val_domain_indices.size()

(torch.Size([22180, 300]), torch.Size([3200, 300]))

In [0]:
train_array = torch.utils.data.TensorDataset(train_domain_indices,train_domain_labels,train_sentiment_labels)
train_loader = torch.utils.data.DataLoader(train_array,batchsize)

val_array = torch.utils.data.TensorDataset(val_domain_indices,val_domain_labels,val_sentiment_labels)
val_loader = torch.utils.data.DataLoader(val_array,batchsize)

In [0]:
class DomainClassifier(nn.Module):
    def __init__(self):
        super(DomainClassifier,self).__init__()
        self.hiddendim = hiddendim
        self.maxlen = maxlen
        self.embeddim = embeddim
        self.fullyhidden1 = 300
        self.numclasses1 = len(traindomainsize)
        self.embed = nn.Embedding.from_pretrained(embedding_matrix,freeze=True)
        self.lstm = nn.LSTM(self.embeddim,self.hiddendim,batch_first=True,bidirectional=True)
        self.fc1 = nn.Linear(self.hiddendim*2,self.fullyhidden1)
        self.fc2 = nn.Linear(self.fullyhidden1,self.numclasses1)
        self.drop = nn.Dropout(0.5)
    
    def forward(self,x):
        x = self.embed(x)
        lstmout,_ = self.lstm(x,None)
        out = torch.mean(lstmout,1)
        logits = self.drop(F.relu(self.fc1(out)))
        logits = self.fc2(logits)
        return lstmout,out,logits

In [0]:
class Attention(nn.Module):
    def __init__(self,hidden_dim,attention_dim=embeddim):
        super(Attention,self).__init__()
        self.maxlen = maxlen
        self.hidden_dim = hidden_dim
        self.attention_dim = attention_dim
        self.w = nn.Parameter(torch.from_numpy(np.random.normal(loc=0.0,scale=0.1,size=(hidden_dim,attention_dim)))).float().to(device)
        self.b = nn.Parameter(torch.from_numpy(np.random.normal(loc=0.0,scale=0.1,size=attention_dim))).float().to(device)
        self.u = nn.Parameter(torch.from_numpy(np.random.normal(loc=0.0,scale=0.1,size=(attention_dim,1)))).float().to(device)
        
    def forward(self,x):
        xlstm = x.view(-1,self.hidden_dim)
        v = torch.tanh(torch.mm(xlstm,self.w) + self.b)
        vu = torch.mm(v,self.u).view(-1,self.maxlen)
        alpha = F.softmax(vu,dim=1).unsqueeze(2)
        out = torch.bmm(x.transpose(1,2),alpha).squeeze()
        return out

In [0]:
class SentimentClassifier(nn.Module):
    def __init__(self):
        super(SentimentClassifier,self).__init__()
        self.hiddendim = hiddendim
        self.maxlen = maxlen
        self.embeddim = embeddim
        self.fchidden = 300
        self.numclasses2 = 2
        self.embed = nn.Embedding.from_pretrained(embedding_matrix)
        self.lstm = nn.LSTM(self.embeddim+self.hiddendim,self.hiddendim,batch_first=True,bidirectional=True)
        self.fclstm = nn.Linear(self.hiddendim*2,self.hiddendim)
        self.attention = Attention(self.hiddendim*4,self.hiddendim)
        self.fc1 = nn.Linear(self.hiddendim*4,self.fchidden)
        self.fc2 = nn.Linear(self.fchidden,self.numclasses2)
        
    def forward(self,x,out1,out2):
        lstmout1 = out1.contiguous().view(-1,self.hiddendim*2)
        lstmout1 = self.fclstm(lstmout1).view(out2.size(0),-1,self.hiddendim)
        x = self.embed(x)
        x = torch.cat([lstmout1,x],2)
        a1,_ = self.lstm(x,None)
        out2 = out2.unsqueeze(1).repeat(1,self.maxlen,1)
        a1 = torch.cat([a1,out2],2)
        a1 = self.attention(a1)
        a1 = F.dropout(F.relu(self.fc1(a1),1),0.5)
        a1 = self.fc2(a1)
        return a1

In [0]:
def domainevaluate(model1,loader):
    curloss = 0.0
    acc = 0
    total = 0
    model1.eval()
    with torch.no_grad():
        for domind,domlab,senlab in loader:
            domainind,domlab,senlab = domind.long().to(device),domlab.long().to(device),senlab.long().to(device)
            lstmout,out,logits = model1(domainind)
            
            total+=logits.size(0)
            l = F.cross_entropy(logits,domlab,reduction='sum')
            curloss+=l.item()
            output = torch.max(logits,1)[1]
            acc+=torch.sum(domlab==output).item()

        return (curloss/total),((acc/total)*100)

In [0]:
def sentimentevaluate(model1,model2,loader):
    curloss = 0.0
    acc = 0
    total = 0
    model1.eval()
    model2.eval()
    with torch.no_grad():
        for domind,domlab,senlab in loader:
            domainind,domlab,senlab = domind.long().to(device),domlab.long().to(device),senlab.long().to(device)
            lstmout,mpout,_ = model1(domainind)
            output = model2(domainind,lstmout,mpout)
            total+=output.size(0)
            l = F.cross_entropy(output,senlab,reduction='sum')
            curloss+=l.item()
            output = torch.max(output,1)[1]
            acc+=torch.sum(senlab==output).item()

        return (curloss/total),((acc/total)*100)

In [0]:
def get_test_acc(vectors,labels,model1,model2):
    acc = 0.0
    total = 0.0
    bs = vectors.size(0)//testbatchsize
    model1.eval()
    model2.eval()
    with torch.no_grad():
        for i in range(bs):
            if((i+1)*testbatchsize>vectors.size(0)):
                vec = vectors[i*testbatchsize:]
                lbs = labels[i*testbatchsize:]
            else:
                vec = vectors[i*testbatchsize:(i+1)*testbatchsize]
                lbs = labels[i*testbatchsize:(i+1)*testbatchsize]
            total+=vec.size(0)
            vec,lbs = vec.long().to(device),lbs.long().to(device)
            lstmout,out,logits = model1(vec)
            output = model2(vec,lstmout,out)
            total+=output.size(0)
            output = torch.max(output,1)[1]
            acc+=torch.sum(lbs==output).item()

        return round((acc/total)*100,3)

In [43]:
lamda_d = 0.5
lamda_s = 1.0
patience = 10
curpatience = patience

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

domainmodel = DomainClassifier().to(device)
sentimentmodel = SentimentClassifier().to(device)

optim1 = torch.optim.Adam(domainmodel.parameters(),lr=0.004)
optim2 = torch.optim.Adam(sentimentmodel.parameters(),lr=0.004)

scheduler1 = torch.optim.lr_scheduler.StepLR(optim1, 100, gamma=0.1, last_epoch=-1)
scheduler2 = torch.optim.lr_scheduler.StepLR(optim2, 100, gamma=0.1, last_epoch=-1)

best_domainmodel_wts = deepcopy(domainmodel.state_dict())
best_sentimentmodel_wts = deepcopy(sentimentmodel.state_dict())

print("---------Training Domain Classifier-----------------")
epochs1 = 100
domainmodel.train()
sentimentmodel.train()
bestdomainloss = np.Inf
for epoch in range(1,epochs1+1):
    domainmodel.train()
    sentimentmodel.train()
    for domind,domlab,senlab in train_loader:
        domainmodel.zero_grad()
        sentimentmodel.zero_grad()
        domainind,domlab,senlab = domind.long().to(device),domlab.long().to(device),senlab.long().to(device)
        a1,a2,logits = domainmodel(domainind)
        output = sentimentmodel(domainind,a1,a2)
        loss1 = F.cross_entropy(logits,domlab)
        loss2 = F.cross_entropy(output,senlab)
        loss_temp = lamda_d * loss1 + lamda_s * loss2
        loss_temp.backward()
        optim1.step()
        optim2.step()

    scheduler1.step()
    scheduler2.step()

    curdomaintrainloss,curdomaintrainacc = domainevaluate(domainmodel,train_loader)
    curdomainvalloss,curdomainvalacc = domainevaluate(domainmodel,val_loader)
    #if(epoch%10==0):
    print("Epoch {} Train Loss {} Train Accuracy {} ".format(epoch,curdomaintrainloss,curdomaintrainacc))
    print("Validation Loss {} Validation Accuracy {} ".format(curdomainvalloss,curdomainvalacc))
    print("-----------------------------------------------------------------------------------")

    if(curdomainvalloss<bestdomainloss):
        bestdomainloss = curdomainvalloss
        best_domainmodel_wts = deepcopy(domainmodel.state_dict())
        best_sentimentmodel_wts = deepcopy(sentimentmodel.state_dict())
        curpatience = patience
    else:
      curpatience = curpatience - 1
      if(curpatience==0):
        break
 
domainmodel = DomainClassifier().to(device)
sentimentmodel = SentimentClassifier().to(device)
domainmodel.load_state_dict(best_domainmodel_wts)
sentimentmodel.load_state_dict(best_sentimentmodel_wts)

---------Training Domain Classifier-----------------
Epoch 1 Train Loss 0.8895491048922078 Train Accuracy 68.10640216411181 
Validation Loss 0.9768868735432625 Validation Accuracy 66.21875 
-----------------------------------------------------------------------------------
Epoch 2 Train Loss 0.7532550389106688 Train Accuracy 72.15058611361587 
Validation Loss 0.9560525470972061 Validation Accuracy 67.1875 
-----------------------------------------------------------------------------------
Epoch 3 Train Loss 0.6315838692583836 Train Accuracy 76.08205590622183 
Validation Loss 0.9174105374515057 Validation Accuracy 69.25 
-----------------------------------------------------------------------------------
Epoch 4 Train Loss 0.5102433297470736 Train Accuracy 81.28944995491435 
Validation Loss 0.8990827013552188 Validation Accuracy 71.375 
-----------------------------------------------------------------------------------
Epoch 5 Train Loss 0.45764198553142943 Train Accuracy 83.115419296663

<All keys matched successfully>

In [44]:
print("---------Training sentiment Classifier-----------------")
epochs2 = 500

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

optim2 = torch.optim.Adam(sentimentmodel.parameters(),lr=0.004)
scheduler2 = torch.optim.lr_scheduler.StepLR(optim2, 100, gamma=0.1, last_epoch=-1)

patience = 50
curpatience = patience
bestsentimentloss = np.Inf
best_sentimentmodel_wts = deepcopy(sentimentmodel.state_dict())
sentimentmodel.train()
domainmodel.eval()
for epoch in range(1,epochs2+1):
    sentimentmodel.train()
    for domind,domlab,senlab in train_loader:
        domainind,domlab,senlab = domind.long().to(device),domlab.long().to(device),senlab.long().to(device)
        with torch.no_grad():
          lstmoutput,mpoutput,_ = domainmodel(domainind)
        pred = sentimentmodel(domainind,lstmoutput,mpoutput)
        loss2 =  F.cross_entropy(pred,senlab)
        lossnew = lamda_s * loss2
        sentimentmodel.zero_grad()
        lossnew.backward()
        optim2.step()

    scheduler2.step()
    cursentimenttrainloss,cursentimenttrainacc = sentimentevaluate(domainmodel,sentimentmodel,train_loader)
    cursentimentvalloss,cursentimentvalacc = sentimentevaluate(domainmodel,sentimentmodel,val_loader)
    print("Epoch {} Train Loss {} Train Accuracy {} ".format(epoch,cursentimenttrainloss,cursentimenttrainacc))
    print("Validation Loss {} Validation Accuracy {} ".format(cursentimentvalloss,cursentimentvalacc))
    print("-----------------------------------------------------------------------------------------")
    #if(epoch%10==0):
      #print("Epoch {} Loss {} ".format(epoch,cursentimentvalloss))
    if(cursentimentvalloss<bestsentimentloss):
        bestsentimentloss = cursentimentvalloss
        best_sentimentmodel_wts = deepcopy(sentimentmodel.state_dict())
        curpatience = patience
    else:
      curpatience = curpatience - 1
      if(curpatience==0):
        break
  
sentimentmodel = SentimentClassifier().to(device)
sentimentmodel.load_state_dict(best_sentimentmodel_wts)

---------Training sentiment Classifier-----------------
Epoch 1 Train Loss 0.2799194816402533 Train Accuracy 89.66185752930568 
Validation Loss 0.6848115152865648 Validation Accuracy 83.21875 
-----------------------------------------------------------------------------------------
Epoch 2 Train Loss 0.2761978342914173 Train Accuracy 89.7385031559964 
Validation Loss 0.6923473070561886 Validation Accuracy 83.25 
-----------------------------------------------------------------------------------------
Epoch 3 Train Loss 0.27842604858419506 Train Accuracy 89.78358881875563 
Validation Loss 0.7378285152465105 Validation Accuracy 82.75 
-----------------------------------------------------------------------------------------
Epoch 4 Train Loss 0.2781976039542391 Train Accuracy 89.85572587917042 
Validation Loss 0.7466972509585321 Validation Accuracy 82.25 
-----------------------------------------------------------------------------------------
Epoch 5 Train Loss 0.27055331486855894 Train 

<All keys matched successfully>

In [49]:
for domain in domains:
  test = pd.read_csv(datapath+domain+'/test.csv')
  domains_test_labels = torch.from_numpy(test['label'].astype(int).values)
  domains_test_corpus = preprocess(list(test['text']))
  domains_test_corpus = get_data(domains_test_corpus,vocabulary)
  acc = get_test_acc(domains_test_corpus,domains_test_labels,domainmodel,sentimentmodel)
  print("Domain {} ---> Test Accuracy {} ".format(domain[0].upper()+domain[1:],acc))

Domain Music ---> Test Accuracy 40.234 
Domain Mr ---> Test Accuracy 36.849 
Domain Health_personal_care ---> Test Accuracy 40.625 
Domain Kitchen_housewares ---> Test Accuracy 39.844 
Domain Software ---> Test Accuracy 41.406 
Domain Sports_outdoors ---> Test Accuracy 39.974 
Domain Imdb ---> Test Accuracy 43.62 
Domain Toys_games ---> Test Accuracy 41.146 
Domain Video ---> Test Accuracy 43.359 
Domain Magazines ---> Test Accuracy 42.969 
Domain Books ---> Test Accuracy 40.755 
Domain Apparel ---> Test Accuracy 41.276 
Domain Baby ---> Test Accuracy 40.625 
Domain Camera_photo ---> Test Accuracy 42.839 
Domain Electronics ---> Test Accuracy 40.104 
Domain Dvd ---> Test Accuracy 41.927 
