In [11]:
import json
import os.path
import codecs
import re

from keras.preprocessing import sequence, text
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Bidirectional, Input
from keras.layers import LSTM

# Vocabulary cap: passed as num_words to the Tokenizer and as the input
# dimension of the Embedding layer.
max_features = 20000
# Fixed sequence length: shape of the model Input and target length for
# pad_sequences.
maxlen = 30
# Output dimension of the Embedding layer.
embedding_size = 128
# Units of the LSTM wrapped by Bidirectional.
lstm_output_size = 128
# Batch size used by every fit() and predict() call.
batch_size = 256
# Number of training epochs for the polarity model and the irony model.
epochs_polarity = 5
epochs_irony= 5

# Datos ironía

Se cargan los datos de ironía, por cada tarea:

* taskA: Clasificador binario
* taskB: Clasificador multi-clase (4 clases)

In [12]:
def load_files(files):
    """Load several JSON-lines files into memory.

    Every non-blank line of each file is decoded as one JSON document.
    The name of each file and the number of records read from it are
    printed as progress information.

    Args:
        files: iterable of paths to JSON-lines files.

    Returns:
        A list with one entry per input path, in the same order; each entry
        is the list of objects decoded from that file.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a non-blank line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    json_files=[]
    print("Opening files")
    for filename in files:
        print(filename)
        # The context manager closes the handle even when decoding fails, and
        # the explicit UTF-8 keeps the result independent of the locale.
        with open(filename,encoding="utf-8") as handle:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            data=[json.loads(line) for line in handle if line.strip()]
        json_files.append(data)
        print("Size",len(data))
    return json_files


# Sub-tasks to process; every per-task list built from here on follows this order.
tasks=["taskA","taskB"]

# Directory holding the splits and the file-name template
# ({0} = task name, {1} = split name).
dirname="../SemEval2018-Task3/infotec_train_dev"
basename="SemEval2018-T3-{0}_{1}.json"

# Build the per-task path lists for both splits in a single expression.
train_files,test_files=(
    [os.path.join(dirname,basename.format(name,split)) for name in tasks]
    for split in ('train','test')
)

# Train split is read first, then test, one list of records per task.
train_json=load_files(train_files)
test_json=load_files(test_files)

Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_train.json
Size 2683
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_train.json
Size 2683
Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_test.json
Size 1151
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_test.json
Size 1151


## Datos de polaridad

Cargando datos de polaridad, una sola tarea

In [13]:
# The polarity corpus is a single file, so unpack the one-element result.
[train_polarity_json]=load_files(["../extras/En.json"])

Opening files
../extras/En.json
Size 45748


In [14]:
# Set up the irony data

# data_train[i] / data_test[i] hold the (text, label) pairs of tasks[i].
data_train=[]
data_test=[]
for i,t in enumerate(tasks):
    # Train split of task i.
    text_train=[rec['text'] for rec in train_json[i]]
    class_train=[rec['klass'] for rec in train_json[i]]
    data_train.append([(txt,lbl) for txt,lbl in zip(text_train,class_train)])
    # Test split of task i.
    text_test=[rec['text'] for rec in test_json[i]]
    class_test=[rec['klass'] for rec in test_json[i]]
    data_test.append([(txt,lbl) for txt,lbl in zip(text_test,class_test)])


# Report the train and test sizes task by task.
for i,t in enumerate(data_train):
    print("Size:",len(t))
    print("Size:",len(data_test[i]))
    
# Configurando datos de polaridad

def pol2class(k):
    """One-hot encode a polarity label.

    The positions are, in order: 'neutral', 'positive', 'negative'.

    Args:
        k: one of the three label strings above.

    Returns:
        A fresh three-element list of 0/1 with a single 1 at the label's
        position.

    Raises:
        ValueError: if ``k`` is not one of the known labels.
    """
    labels=('neutral','positive','negative')
    position=labels.index(k)
    return [1 if slot==position else 0 for slot in range(len(labels))]

# Pair every polarity tweet with the one-hot encoding of its label.
text_polarity_train=[rec['text'] for rec in train_polarity_json]
class_polarity_train=list(map(pol2class,(rec['klass'] for rec in train_polarity_json)))
data_polarity_train=[(txt,onehot) for txt,onehot in zip(text_polarity_train,class_polarity_train)]

print("Size:",len(data_polarity_train))


Size: 2683
Size: 1151
Size: 2683
Size: 1151
Size: 45748


In [15]:
from keras.preprocessing.text import Tokenizer

def normalizeText(tweet):
    """Return ``tweet`` unchanged.

    Every substitution below is commented out, so this function is currently
    an identity function.
    """
    # Disabled substitutions: irony/sarcasm hashtags, t.co links, @mentions
    # and the '#' character.
    #tweet = re.sub(r'#(S|s)arcasm|#(I|i)rony','',tweet)
    #tweet = re.sub(r'#SARCASM|#IRONY','',tweet)
    #tweet = re.sub(r'https?://t\.co/.(\w|\d)+','http://link', tweet) #tweet link
    #tweet = re.sub(r'@.\w*','@',tweet)
    #tweet = re.sub(r'#','',tweet)
    return tweet

def buildTokenizer(tweets):
    """Create a ``Tokenizer`` and fit it on ``tweets``.

    The tokenizer is configured with ``num_words=max_features`` and
    ``lower=True``. The custom ``filters`` string does not contain ``#``,
    ``@``, ``/``, ``:`` or ``'``, so those characters are not filtered out.

    Args:
        tweets: list of texts to fit the vocabulary on.

    Returns:
        The fitted ``Tokenizer``.
    """
    strip_chars='!"$%&()*+,-.;<=>?[\\]^_`{|}~\t\n'
    fitted=Tokenizer(num_words=max_features,lower=True,filters=strip_chars)
    fitted.fit_on_texts(tweets)
    return fitted

def text2seq(tok,tweet):
    """Convert a single tweet into its sequence with tokenizer ``tok``.

    ``tok.texts_to_sequences`` works on a batch, so the tweet is wrapped in a
    one-element list and the only result is unwrapped.
    """
    batch=tok.texts_to_sequences([tweet])
    return batch[0]

In [16]:
# One tokenizer per task, each fitted on that task's training tweets followed
# by the polarity tweets.
toks=[]
# The polarity texts do not depend on the task: extract and normalise them
# once instead of on every iteration.
tweets_=[text for text,_label in data_polarity_train]
polarity_normalized=[normalizeText(text) for text in tweets_]
for i,t in enumerate(tasks):
    # Comprehension variables are named differently from the loop variable
    # `t` so that the task name is not shadowed.
    tweets=[text for text,_label in data_train[i]]
    toks.append(buildTokenizer([normalizeText(text) for text in tweets]+polarity_normalized))
    print("word_counts",len(toks[-1].word_counts))
    print("word_docs",len(toks[-1].word_docs))
    print("word_index",len(toks[-1].word_index))

word_counts 84651
word_docs 84651
word_index 84651
word_counts 84644
word_docs 84644
word_index 84644


In [17]:
# Per-task integer sequences and integer labels for the irony data;
# index i corresponds to tasks[i].
X_train=[]
y_train=[]
X_test=[]
y_test=[]
for i,t in enumerate(tasks):
    # The tokenizers were fitted on normalizeText(tweet), so the same
    # normalisation is applied here; otherwise enabling any step in
    # normalizeText would make the sequences disagree with the vocabulary.
    X_train.append([text2seq(toks[i],normalizeText(text)) for text,_label in data_train[i]])
    y_train.append([int(label) for _text,label in data_train[i]])
    X_test.append([text2seq(toks[i],normalizeText(text)) for text,_label in data_test[i]])
    y_test.append([int(label) for _text,label in data_test[i]])

print("Example train")
print(X_train[0][0])
print(y_train[0][0])

Example train
[1, 102, 16636, 3121, 27, 24, 1]
0


In [18]:
# Polarity sequences are built once per task because each task has its own
# tokenizer; the one-hot labels are the same list contents for every task.
X_polarity_train=[]
y_polarity_train=[]
for i,t in enumerate(tasks):
    # Apply the same normalisation that was used when fitting the tokenizer
    # so sequences and vocabulary stay consistent.
    X_polarity_train.append([text2seq(toks[i],normalizeText(text)) for text,_onehot in data_polarity_train])
    y_polarity_train.append([onehot for _text,onehot in data_polarity_train])
    
print("Example train")
print(X_polarity_train[0][0])
print(y_polarity_train[0][0])

Example train
[621, 5157, 2000, 2735, 140, 285, 10, 5457, 1187, 2843, 578, 621, 145, 68, 19, 107, 1426, 46, 332, 15]
[1, 0, 0]


In [19]:
def build_model(task):
    """Build a polarity model and an irony model on top of one shared encoder.

    Both models are created from the same Input -> Embedding ->
    Bidirectional(LSTM) layers and differ only in their final Dense head, so
    the encoder layers are shared between them.

    Args:
        task: name of the irony task, "taskA" or "taskB".

    Returns:
        A ``(model_polarity, model_irony)`` tuple of uncompiled ``Model``
        objects. The polarity model outputs 3 values per sample, the irony
        model 1.

    Raises:
        ValueError: if ``task`` is not a known task name (previously this
            surfaced as an unbound-variable error when building the model).
    """
    if task not in ("taskA","taskB"):
        raise ValueError("Unknown task: {!r}".format(task))

    # Shared encoder.
    inputs = Input(shape=(maxlen,))
    embeddings = Embedding(max_features, embedding_size)(inputs)
    bidirectional = Bidirectional(
                        LSTM(lstm_output_size,
                        dropout=0.2,
                        recurrent_dropout=0.2
                        )
                    )(embeddings)

    # NOTE(review): taskB is described in this notebook as multi-class, yet it
    # gets the same single sigmoid unit as taskA. The head is left unchanged
    # because the training cell compiles it with binary_crossentropy; confirm
    # the intended number of classes before training taskB.
    irony=Dense(1, activation='sigmoid')(bidirectional)

    # softmax (not sigmoid): the polarity targets are one-hot over three
    # mutually exclusive classes and the model is trained with
    # categorical_crossentropy, which expects a probability distribution.
    polarity=Dense(3, activation='softmax')(bidirectional)
    model_irony = Model(inputs=inputs, outputs=irony)
    model_polarity = Model(inputs=inputs, outputs=polarity)
    return model_polarity, model_irony

In [20]:
scores=[]

from sklearn.metrics import classification_report, f1_score
import numpy as np

# Only the first task is run: the binary loss and the rounding of a single
# output used below are specific to a binary irony label.
for i,t in enumerate(tasks[:1]):
    print("Evaluating task",t)
    model_pol, model_ir = build_model(t)
    model_pol.summary()
    model_ir.summary()
    
    model_pol.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_ir.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Stage 1: train through the polarity head.
    X_polarity_train_=sequence.pad_sequences(X_polarity_train[i], maxlen=maxlen)
    
    # Targets are converted to arrays explicitly so that a nested Python list
    # is never mistaken for a list of per-output arrays.
    model_pol.fit(X_polarity_train_,np.asarray(y_polarity_train[i]),
                  batch_size=batch_size,
                  epochs=epochs_polarity,
                  validation_split=0.1)
    
    # Stage 2: train the irony head, which shares the encoder layers.
    X_train_=sequence.pad_sequences(X_train[i], maxlen=maxlen)
    X_test_=sequence.pad_sequences(X_test[i], maxlen=maxlen)
    
    model_ir.fit(X_train_,np.asarray(y_train[i]),
                  batch_size=batch_size,
                  epochs=epochs_irony,
                  validation_split=0.05)
    
    # Round the single sigmoid output to 0/1 and flatten (n, 1) -> (n,) so the
    # predictions have the same layout as the integer labels.
    y_test_ = model_ir.predict(X_test_, batch_size=batch_size, verbose=1)
    y_test_ = np.round(y_test_).astype(int).ravel()
    print(y_test_)
    
    print(classification_report(y_test[i], y_test_))
    # average='macro' makes the value match its label; the default of
    # f1_score is the F1 of the positive class only.
    print("Macro f-score:", f1_score(y_test[i], y_test_, average='macro'))
    
   

Evaluating task taskA
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 128)           2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 771       
Total params: 2,823,939
Trainable params: 2,823,939
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 30)                0         
____________________________________________