In [1]:
import json
import os.path
import codecs
import re

from keras.preprocessing import sequence, text
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Bidirectional, Input
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D

max_features = 15000
maxlen = 30
embedding_size = 128
lstm_output_size = 128
batch_size = 128
epochs_polarity = 0
epochs_irony= 10

kernel_size = 3
filters = 128
pool_size = 2

Using TensorFlow backend.


# Datos ironía

Se cargan los datos de ironía, por cada tarea:

* taskA: Clasificador binario
* taskB: Clasificador multi-clase (4?)

In [2]:
def load_files(files):
    json_files=[]
    print("Opening files")
    for filename in files:
        print(filename)
        data=[]
        for line in codecs.open(filename):
            data.append(json.loads(line))
        json_files.append(data)
        print("Size",len(json_files[-1]))
    return json_files


tasks=["taskA","taskB"]
dirname="../SemEval2018-Task3/infotec_train_dev"
basename="SemEval2018-T3-{0}_{1}.json"
train_files=[os.path.join(dirname,basename.format(task,'train')) for task in tasks]
test_files=[os.path.join(dirname,basename.format(task,'test')) for task in tasks]

train_json=load_files(train_files)
test_json=load_files(test_files)

Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_train.json
Size 2683
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_train.json
Size 2683
Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_test.json
Size 1151
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_test.json
Size 1151


## Datos de polaridad

Cargando datos de polaridad, una sola tarea

In [3]:
train_polarity_json=load_files(["../extras/En.json"])[0]

Opening files
../extras/En.json
Size 45748


In [4]:
# Configurando datos de ironía

from nltk.tokenize.casual import TweetTokenizer
nltk_tok=TweetTokenizer(reduce_len=True)

data_train=[]
data_test=[]
for i,t in enumerate(tasks):
    text_train=[" ".join(nltk_tok.tokenize(j['text'])) for j in train_json[i]]
    class_train=[j['klass'] for j in train_json[i]]
    data_train.append(list(zip(text_train,class_train)))
    text_test=[" ".join(nltk_tok.tokenize(j['text'])) for j in test_json[i]]
    class_test=[j['klass'] for j in test_json[i]]
    data_test.append(list(zip(text_test,class_test)))


for i,t in enumerate(data_train):
    print("Size:",len(data_train[i]))
    print("Size:",len(data_test[i]))
    
# Configurando datos de polaridad

def pol2class(k):
    onehot=[[1,0,0],[0,1,0],[0,0,1]]
    return onehot[['neutral','positive','negative'].index(k)]

text_polarity_train=[" ".join(nltk_tok.tokenize(j['text'])) for j in train_polarity_json]
class_polarity_train=[pol2class(j['klass']) for j in train_polarity_json]
data_polarity_train=list(zip(text_polarity_train,class_polarity_train))

print("Size:",len(data_polarity_train))


Size: 2683
Size: 1151
Size: 2683
Size: 1151
Size: 45748


In [5]:
from keras.preprocessing.text import Tokenizer
  

def normalizeText(tweet):
    #tweet = re.sub(r'#(S|s)arcasm|#(I|i)rony','',tweet)
    #tweet = re.sub(r'#SARCASM|#IRONY','',tweet)
    #tweet = re.sub(r'https?://t\.co/.(\w|\d)+','http://link', tweet) #tweet link
    #tweet = re.sub(r'@.\w*','@',tweet)
    #tweet = re.sub(r'#','',tweet)
    return tweet

def buildTokenizer(tweets):
    
    tokenizer = Tokenizer(num_words=max_features,lower=True, filters='\t\n', split=" ")
    tokenizer.fit_on_texts([" ".join(nltk_tok.tokenize(t)) for t in tweets])
    return tokenizer

def text2seq(tok,tweet):
    return tok.texts_to_sequences([tweet])[0]

In [6]:
toks=[]
maps_=[]
for i,t in enumerate(tasks):
    tweets=[t for t,c in data_train[i]]
    tweets_=[t for t,c in data_polarity_train]
    toks.append(buildTokenizer([normalizeText(t) for t in tweets+tweets_]))
    maps_.append({v: k for k, v in toks[-1].word_index.items()})
    print("word_counts",len(toks[-1].word_counts))
    print("word_docs",len(toks[-1].word_docs))
    print("word_index",len(toks[-1].word_index))
    print("Top words\n",", ".join(["{0} ({1}) ".format(maps_[-1][f+1],toks[i].word_counts[maps_[-1][f+1]]) for f in range(50)]))
    print("Last words\n",", ".join(["{0} ({1}) ".format(maps_[-1][f],toks[i].word_counts[maps_[-1][f]]) for f in range(max_features-50,max_features)]))

word_counts 81045
word_docs 81045
word_index 81045
Top words
 the (39125) , . (34484) , , (23124) , to (21713) , ! (16267) , in (14167) , a (13942) , on (13384) , i (13378) , and (13253) , of (11426) , for (10306) , is (10266) , ... (8831) , you (8511) , with (7860) , : (7676) , be (7512) , ? (7444) , at (7158) , it (7094) , may (7008) , tomorrow (6982) , " (6359) , that (5014) , my (4945) , have (4702) , this (4461) , - (4371) , but (4166) , just (4101) , day (3938) , will (3763) , was (3711) , he (3297) , not (3277) , so (3246) , & (3159) , me (3109) , out (3021) , if (2969) , going (2958) , all (2951) , see (2899) , night (2893) , i'm (2833) , from (2775) , are (2767) , ' (2719) , friday (2715) 
Last words
 unload (3) , abdul (3) , rode (3) , scares (3) , optimism (3) , adopt (3) , theopen (3) , #coybig (3) , influencing (3) , exposes (3) , theatrical (3) , ceremonial (3) , brewing's (3) , udinese (3) , foamposite (3) , softbank (3) , brah (3) , 6p (3) , mtn (3) , @ranaayyub (3) , w

In [7]:
def ir2class(k):
    onehot=[[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0],[0,0,0,1,0],[0,0,0,0,1]]
    return onehot[['0','1','2',"3","4"].index(k)]

X_train=[]
y_train=[]
X_test=[]
y_test=[]
for i,t in enumerate(tasks):
    X_train.append([text2seq(toks[i],t) for t,c in data_train[i]])
    if t=="taskA":
        y_train.append([int(c) for t,c in data_train[i]])
    else:
        y_train.append([ir2class(c) for t,c in data_train[i]])
    X_test.append([text2seq(toks[i],t) for t,c in data_test[i]])
    if t=="taskA":
        y_test.append([int(c) for t,c in data_test[i]])
    else:
        y_test.append([ir2class(c) for t,c in data_test[i]])
        

    print("Example train in",t)
    #print("\n".join([" ".join([str(w) for w in S]) for S in X_train[i][:5]]))
    print("\n".join([t for t,c in data_train[i][:5]]))
    print()
    print("\n".join([" ".join([maps_[i][w] for w in S if w]) for S in X_train[i][:5]]))
    print()
    print([c for c in y_train[i][:5]])
    print()




Example train in taskA
The mouse's first incepted memory was just the sound : BRAAAWWWP !
I LOVE not sleeping . It's the best .
Religion is unfounded , else , Allah would have saved the kids . . @tariqmushtaqkh @nicpradhan #PeshawarAttack #PakSchoolSiege
Love how I came into work at 8 because Charlie said we were busy ... 3 people in 45 minutes , yeah we got this place packed Charlie .
Thx for catching on #urock

the first memory was just the sound : !
i love not sleeping . it's the best .
religion is , else , allah would have saved the kids . . #peshawarattack
love how i came into work at 8 because charlie said we were busy ... 3 people in 45 minutes , yeah we got this place packed charlie .
thx for catching on

[0, 1, 1, 1, 1]

Example train in taskB
Produce all kinds of Creative Designs #like15 | http://t.co/OXeuznMhY8 http://t.co/w4eZ9mObFJ
there is only 1 race , HUMAN so i dont look at things by the myth of " races " @jtarleta53 @RBRNetwork1
@bigbillybmoney oh haha no they ain't :

In [8]:
X_polarity_train=[]
y_polarity_train=[]
for i,t in enumerate(tasks):
    X_polarity_train.append([text2seq(toks[i],t) for t,c in data_polarity_train])
    y_polarity_train.append([c for t,c in data_polarity_train])
    
print("Example train")
print("\n".join([" ".join([maps_[i][w] for w in S if w]) for S in X_polarity_train[0][:5]]))
print(y_polarity_train[0][:5])

Example train
x maths shine , bayern why trump is impression - busy prepared wrong x live monday that she aren't like gen ...
i'm down for that they'll , but open card want that ? the really may never know
ocean , people now do have the best sister . but if concert can mention a few alas people in january , bryant play for march 3 .
this sunday at naruto blast in plans forget at lives arian & 2015 catch walk on the banned ! top ...
full pull from felicia finishes ! kurt 2nd with the kart !
[[1, 0, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 1, 0]]


In [9]:
def build_model(task):
    
    inputs = Input(shape=(maxlen,))
    embeedings = Embedding(max_features, embedding_size)(inputs)
    
    conv1 = Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1)(embeedings)
    maxpool1 = MaxPooling1D(pool_size=pool_size)(conv1)
    bidirectional =  LSTM(lstm_output_size,
                        activation='tanh', 
                        recurrent_activation='hard_sigmoid', 
                        dropout=0.5, 
                        recurrent_dropout=0.5
                        )(maxpool1)
    if task=="taskA":
        irony=Dense(1, activation='sigmoid')(bidirectional)
    if task=="taskB":
        irony=Dense(5, activation='softmax')(bidirectional)
    
    polarity=Dense(3, activation='softmax')(bidirectional)
    model_irony = Model(inputs=inputs, outputs=irony)
    model_polarity = Model(inputs=inputs, outputs=polarity)
    return model_polarity, model_irony

In [None]:
scores=[]

from sklearn.metrics import classification_report, f1_score
import numpy as np

for i,t in enumerate(tasks[:1]):
    print("Evaluating task",t)
    model_pol, model_ir = build_model(t)
    model_pol.summary()
    model_ir.summary()
    
    model_pol.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_ir.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    X_polarity_train_=sequence.pad_sequences(X_polarity_train[i], maxlen=maxlen)
    
    
    X_train_=sequence.pad_sequences(X_train[i], maxlen=maxlen)
    X_test_=sequence.pad_sequences(X_test[i], maxlen=maxlen)
    
   
    for tmp in range(10):
        model_pol.fit(X_polarity_train_,y_polarity_train[i],
                  batch_size=batch_size,
                  epochs=1,
                  validation_split=0.1)
   

        model_ir.fit(X_train_,y_train[i],
                  batch_size=batch_size,
                  epochs=1,
                  validation_split=0.1)
    
    
    #score, acc = model.evaluate(X_test_,y_test[i],batch_size=batch_size)
    
    y_test_ = model_ir.predict(X_test_, batch_size=batch_size, verbose=1)
    if t=="taskA":
        y_test_ = np.round(y_test_)
    else:
        y_test_ = np.argmax(y_test_)
        
    
    print(classification_report(y_test[i], y_test_))
    print("Macro f-score:", f1_score(y_test[i], y_test_))
    
   

Evaluating task taskA
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 128)           1920000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 28, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 14, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 387       
Total params: 2,101,251
Trainable params: 2,101,251
Non-trainable params: 0
____________________________________________