In [1]:
import json
import os.path
import codecs
import re

from keras.preprocessing import sequence, text
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Bidirectional, Input
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import SpatialDropout1D, Concatenate, Maximum, Activation, Reshape, Flatten
from keras.layers import RepeatVector, Permute, Merge, TimeDistributed, Multiply, Lambda, Concatenate
from keras import backend as K

# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input dimension.
max_features = 100
# Fixed sequence length: model input length and the target length for pad_sequences.
maxlen = 80
# Width of each embedding vector.
embedding_size = 32
# Units per direction of the bidirectional LSTM.
lstm_output_size = 32
# Mini-batch size used for fit() and predict().
batch_size = 128
# Number of epochs for the polarity training run.
epochs_polarity = 50
# Number of single-epoch fit/evaluate rounds for the irony model.
epochs_irony= 10

# NOTE(review): kernel_size and filters are not referenced in the visible cells
# (build_model hard-codes 16 filters and kernel widths 3/5/7/9) - confirm before removing.
kernel_size = 5
filters = 64
# Window of every MaxPooling1D layer in build_model.
pool_size = 4

Using TensorFlow backend.


# Datos de ironía

Se cargan los datos de ironía, por cada tarea:

* taskA: Clasificador binario
* taskB: Clasificador multi-clase (4 clases en SemEval-2018 Task 3; el código de este cuaderno reserva 5 etiquetas, '0'–'4')

In [2]:
def load_files(files):
    """Load a list of JSON-lines files.

    Each file is expected to hold one JSON document per line. Progress is
    printed as the files are read (file name, then the number of records).

    Args:
        files: iterable of file paths.

    Returns:
        A list with one entry per input file, each entry being the list of
        decoded JSON documents of that file, in file order.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a non-blank line is not valid JSON.
    """
    json_files=[]
    print("Opening files")
    for filename in files:
        print(filename)
        # Explicit UTF-8 keeps decoding independent of the machine locale, and the
        # context manager guarantees the handle is closed even if parsing fails.
        with open(filename, encoding="utf-8") as handle:
            # Blank lines (typically a trailing newline) carry no record; skip them
            # instead of letting json.loads fail on an empty string.
            data=[json.loads(line) for line in handle if line.strip()]
        json_files.append(data)
        print("Size",len(data))
    return json_files


# Tasks to process; switch to the commented line to include taskB as well.
tasks = ["taskA"]
#tasks=["taskA","taskB"]
dirname = "../SemEval2018-Task3/infotec_train_dev"
basename = "SemEval2018-T3-{0}_{1}.json"

# Build the train/test path of every task from the shared file-name template.
train_files = []
test_files = []
for task in tasks:
    train_files.append(os.path.join(dirname, basename.format(task, 'train')))
    test_files.append(os.path.join(dirname, basename.format(task, 'test')))

# One list of records per task, in the same order as `tasks`.
train_json = load_files(train_files)
test_json = load_files(test_files)

Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_train.json
Size 2683
Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_test.json
Size 1151


## Datos de polaridad

Se cargan los datos de polaridad, que corresponden a una sola tarea.

In [3]:
# The polarity corpus is a single file, so keep only the first (and only) list
# returned by load_files.
polarity_files = ["../extras/En.json"]
train_polarity_json = load_files(polarity_files)[0]

Opening files
../extras/En.json
Size 45748


In [4]:
# Irony data setup

from nltk.tokenize.casual import TweetTokenizer
nltk_tok = TweetTokenizer(reduce_len=True)

# For every task, pair each tweet (tokenized and re-joined with single spaces)
# with its 'klass' label; data_train / data_test hold one list of pairs per task.
data_train = []
data_test = []
for i, t in enumerate(tasks):
    text_train = []
    class_train = []
    for record in train_json[i]:
        text_train.append(" ".join(nltk_tok.tokenize(record['text'])))
        class_train.append(record['klass'])
    data_train.append(list(zip(text_train, class_train)))

    text_test = []
    class_test = []
    for record in test_json[i]:
        text_test.append(" ".join(nltk_tok.tokenize(record['text'])))
        class_test.append(record['klass'])
    data_test.append(list(zip(text_test, class_test)))


# Report the train and test sizes of every task.
for i in range(len(data_train)):
    print("Size:", len(data_train[i]))
    print("Size:", len(data_test[i]))
    
# Polarity data setup

def pol2class(k):
    """One-hot encode a polarity label.

    Args:
        k: one of 'neutral', 'positive' or 'negative'.

    Returns:
        A new 3-element list with a 1 at the label's position
        (neutral, positive, negative) and 0 elsewhere.

    Raises:
        ValueError: if k is not one of the three labels.
    """
    labels = ['neutral','positive','negative']
    position = labels.index(k)
    encoded = [0] * len(labels)
    encoded[position] = 1
    return encoded

# Tokenize every polarity tweet (tokens re-joined with single spaces) and
# one-hot encode its label, then pair them up as (text, one_hot) tuples.
text_polarity_train = []
class_polarity_train = []
for record in train_polarity_json:
    text_polarity_train.append(" ".join(nltk_tok.tokenize(record['text'])))
    class_polarity_train.append(pol2class(record['klass']))
data_polarity_train = list(zip(text_polarity_train, class_polarity_train))

print("Size:", len(data_polarity_train))


Size: 2683
Size: 1151
Size: 45748


In [None]:
from keras.preprocessing.text import Tokenizer
  

def normalizeText(tweet):
    """Collapse links and user mentions in a tweet into short placeholder tokens.

    Replacements, applied in this order:
      * t.co short links  -> 'http'
      * fb.me short links -> 'fb'
      * any other http(s) URL -> 'http'
      * @mentions -> '@'

    Hashtags (including #irony / #sarcasm) are left untouched.

    Args:
        tweet: the tweet text.

    Returns:
        The tweet with the replacements applied.
    """
    tweet = re.sub(r'https?://t\.co/.\w+', 'http', tweet)  # twitter short link
    tweet = re.sub(r'fb\.me/.\w+', 'fb', tweet)  # facebook short link
    # Stop at the first whitespace: a greedy `.+` here would swallow the whole
    # remainder of the tweet after any URL.
    tweet = re.sub(r'https?://\S+', 'http', tweet)
    # Require word characters right after '@' so a lone '@' does not consume the
    # following space and word.
    tweet = re.sub(r'@\w+', '@', tweet)
    return tweet

def buildTokenizer(tweets):
    """Fit a character-level Keras Tokenizer on the given tweets.

    The tokenizer is capped at `max_features` entries, keeps case, and filters
    only tabs and newlines. Each tweet is passed through the NLTK tweet
    tokenizer and re-joined with single spaces before fitting.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        The fitted Tokenizer.
    """
    char_tokenizer = Tokenizer(
        num_words=max_features,
        lower=False,
        filters='\t\n',
        split=" ",
        char_level=True,
    )
    retokenized = []
    for tweet in tweets:
        retokenized.append(" ".join(nltk_tok.tokenize(tweet)))
    char_tokenizer.fit_on_texts(retokenized)
    return char_tokenizer

def text2seq(tok,tweet):
    """Normalize one tweet and convert it to a sequence of token indices.

    Args:
        tok: a fitted tokenizer exposing texts_to_sequences.
        tweet: the tweet text.

    Returns:
        The index sequence produced by `tok` for the normalized tweet.
    """
    cleaned = normalizeText(tweet)
    sequences = tok.texts_to_sequences([cleaned])
    return sequences[0]

In [None]:
# Fit one tokenizer per task on that task's training tweets plus the polarity
# tweets, keep the inverse (index -> token) mapping, and print vocabulary stats.
toks=[]
maps_=[]
for i,t in enumerate(tasks):
    tweets=[tweet for tweet,klass in data_train[i]]
    tweets_=[tweet for tweet,klass in data_polarity_train]
    tok=buildTokenizer([normalizeText(tweet) for tweet in tweets+tweets_])
    index_to_token={index: token for token, index in tok.word_index.items()}
    toks.append(tok)
    maps_.append(index_to_token)

    print("word_counts",len(tok.word_counts))
    print("word_docs",len(tok.word_docs))
    print("word_index",len(tok.word_index))

    # word_index is 1-based (index 0 is reserved), so valid ids are 1..vocab_size.
    # Clamp both previews so vocabularies of 50 entries or fewer do not raise
    # KeyError, and include the true last entry in "Last words".
    vocab_size=len(index_to_token)
    top_ids=range(1,min(50,vocab_size)+1)
    last_ids=range(max(1,vocab_size-49),vocab_size+1)
    print("Top words\n",", ".join(["{0} ({1}) ".format(index_to_token[f],tok.word_counts[index_to_token[f]]) for f in top_ids]))
    print("Last words\n",", ".join(["{0} ({1}) ".format(index_to_token[f],tok.word_counts[index_to_token[f]]) for f in last_ids]))

In [None]:
def ir2class(k):
    """One-hot encode an irony label.

    Args:
        k: one of the strings '0', '1', '2', '3' or '4'.

    Returns:
        A new 5-element list with a 1 at the label's position and 0 elsewhere.

    Raises:
        ValueError: if k is not one of the five label strings.
    """
    labels = ['0','1','2',"3","4"]
    position = labels.index(k)
    encoded = [0] * len(labels)
    encoded[position] = 1
    return encoded

# Per task: index sequences for every tweet plus encoded labels.
# taskA labels become plain ints; any other task gets one-hot vectors.
X_train, y_train = [], []
X_test, y_test = [], []
for i, t in enumerate(tasks):
    encode_label = int if t == "taskA" else ir2class

    X_train.append([text2seq(toks[i], tweet) for tweet, klass in data_train[i]])
    y_train.append([encode_label(klass) for tweet, klass in data_train[i]])
    X_test.append([text2seq(toks[i], tweet) for tweet, klass in data_test[i]])
    y_test.append([encode_label(klass) for tweet, klass in data_test[i]])

    # Show the first 50 training tweets next to their decoded index sequences
    # (index 0 is skipped when decoding), then the first 5 labels.
    print("Example train in", t)
    original = [tweet for tweet, klass in data_train[i][:50]]
    tokenized = []
    for S in X_train[i][:50]:
        tokenized.append(" ".join(maps_[i][w] for w in S if w))

    pairs = ["{0}\n{1}".format(a, b) for a, b in zip(original, tokenized)]
    print("\n>>".join(pairs))
    print()
    print(list(y_train[i][:5]))
    print()




In [None]:
# Encode the polarity corpus once per task, using that task's tokenizer, so
# X_polarity_train[i] is compatible with the model built for tasks[i].
X_polarity_train=[]
y_polarity_train=[]
for i,t in enumerate(tasks):
    X_polarity_train.append([text2seq(toks[i],tweet) for tweet,klass in data_polarity_train])
    y_polarity_train.append([klass for tweet,klass in data_polarity_train])

# Preview the first task only. Decode with that same task's mapping (index 0)
# rather than the loop variable left over from above, which points at the last
# task and would mismatch once more than one task is enabled.
print("Example train")
print("\n".join([" ".join([maps_[0][w] for w in S if w]) for S in X_polarity_train[0][:5]]))
print(y_polarity_train[0][:5])

In [None]:
def build_model(task):
    """Build the polarity and irony models for one task on a shared trunk.

    Both returned models read the same input and share every layer up to the
    flattened attention output; they differ only in the final Dense head, so
    training one updates the trunk used by the other.

    Trunk: Embedding -> SpatialDropout1D -> four parallel Conv1D branches
    (16 filters, kernel widths 3/5/7/9) each followed by MaxPooling1D ->
    concatenation along the time axis -> bidirectional LSTM returning
    sequences -> attention weights (softmax Dense applied across the time
    axis) multiplied element-wise with the LSTM output -> Flatten.

    Args:
        task: "taskA" (single sigmoid unit) or "taskB" (5-way softmax).

    Returns:
        A tuple (model_polarity, model_irony) of uncompiled Keras models.

    Raises:
        ValueError: if task is neither "taskA" nor "taskB".
    """
    # Fail early with a clear message instead of hitting an unbound `irony` later.
    if task not in ("taskA", "taskB"):
        raise ValueError("Unknown task {0!r}; expected 'taskA' or 'taskB'".format(task))

    inputs = Input(shape=(maxlen,))
    embeddings = Embedding(max_features, embedding_size)(inputs)
    dropout = SpatialDropout1D(0.25)(embeddings)

    # Same branch repeated with growing kernel widths; layers are created in the
    # order conv, pool, conv, pool, ... for widths 3, 5, 7, 9.
    pooled_branches = []
    for kernel_width in (3, 5, 7, 9):
        conv = Conv1D(16,
                      kernel_width,
                      padding='same',
                      activation='relu',
                      strides=1)(dropout)
        pooled_branches.append(MaxPooling1D(pool_size=pool_size)(conv))

    concatenate = Concatenate(axis=1)(pooled_branches)
    bidirectional = Bidirectional(LSTM(lstm_output_size, return_sequences=True))(concatenate)
    print(bidirectional.shape)  # diagnostic: shape fed into the attention block

    # The attention weights must have as many entries as the LSTM output has
    # time steps. That count equals maxlen only when
    # 4 * (maxlen // pool_size) == maxlen (true for the defaults 80 and 4), so
    # read it from the tensor instead of assuming maxlen.
    timesteps = K.int_shape(bidirectional)[1]

    a = Permute((2, 1))(bidirectional)
    a = Dense(timesteps, activation='softmax')(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    attention_mul = Multiply()([bidirectional, a_probs])

    attention_mul = Flatten()(attention_mul)

    if task == "taskA":
        irony = Dense(1, activation='sigmoid')(attention_mul)
    else:  # taskB
        irony = Dense(5, activation='softmax')(attention_mul)

    polarity = Dense(3, activation='softmax')(attention_mul)
    model_irony = Model(inputs=inputs, outputs=irony)
    model_polarity = Model(inputs=inputs, outputs=polarity)

    return model_polarity, model_irony

In [None]:
# One (polarity model, irony model) pair per task, in the same order as `tasks`.
models = []
for t in tasks:
    print("Creating model task", t)
    model_pair = build_model(t)
    models.append(model_pair)

In [None]:
scores=[]

from sklearn.metrics import classification_report, f1_score
import numpy as np


# Train the polarity head of every task's model on the polarity corpus.
# The trunk layers are shared with the irony model built alongside it.
for i,t in enumerate(tasks):
    model_pol=models[i][0]
    model_pol.summary()

    model_pol.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    X_polarity_train_=sequence.pad_sequences(X_polarity_train[i], maxlen=maxlen)
    # Keras interprets a plain Python list passed as targets as "one array per
    # model output"; hand it a single (n_samples, 3) array instead.
    y_polarity_train_=np.asarray(y_polarity_train[i])

    model_pol.fit(X_polarity_train_,y_polarity_train_,
                  batch_size=batch_size,
                  epochs=epochs_polarity,
                  validation_split=0.1)

In [None]:


# Train the irony head of every task one epoch at a time, reporting test-set
# metrics after each epoch.
for i,t in enumerate(tasks):
    print("Evaluating task",t)
    model_ir=models[i][1]

    model_ir.summary()

    if t=="taskA":
        model_ir.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        model_ir.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    X_train_=sequence.pad_sequences(X_train[i], maxlen=maxlen)
    X_test_=sequence.pad_sequences(X_test[i], maxlen=maxlen)
    # Keras interprets a plain Python list passed as targets as "one array per
    # model output"; convert the labels to a single array.
    y_train_=np.asarray(y_train[i])

    # Gold test labels do not change between epochs, so compute them once:
    # ints for taskA, class index of the one-hot vector otherwise.
    if t=="taskA":
        ori = np.asarray(y_test[i])
    else:
        ori = np.argmax(y_test[i],axis=1)

    for it in range(epochs_irony):
        model_ir.fit(X_train_,y_train_,
                  batch_size=batch_size,
                  epochs=1,
                  validation_split=0)

        y_test_ = model_ir.predict(X_test_, batch_size=batch_size, verbose=1)
        if t=="taskA":
            # Sigmoid output has shape (n, 1): threshold by rounding, then
            # flatten to a 1-D int vector so the metrics get clean class labels.
            y_test_ = np.round(y_test_).ravel().astype(int)
        else:
            y_test_ = np.argmax(y_test_, axis=1)

        print(classification_report(ori, y_test_))
        print("Macro f-score:", f1_score(ori, y_test_, average="macro"))
    
   