In [1]:
import json
import os.path
import codecs
import re

from keras.preprocessing import sequence, text
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Bidirectional, Input
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import SpatialDropout1D, Concatenate, Maximum

max_features = 100
maxlen = 156
embedding_size = 32
lstm_output_size = 32
batch_size = 128
epochs_polarity = 40
epochs_irony= 40

kernel_size = 5
filters = 64
pool_size = 4

Using TensorFlow backend.


# Datos ironía

Se cargan los datos de ironía, por cada tarea:

* taskA: Clasificador binario
* taskB: Clasificador multi-clase (4?)

In [2]:
def load_files(files):
    json_files=[]
    print("Opening files")
    for filename in files:
        print(filename)
        data=[]
        for line in codecs.open(filename):
            data.append(json.loads(line))
        json_files.append(data)
        print("Size",len(json_files[-1]))
    return json_files


tasks=["taskA","taskB"]
dirname="../SemEval2018-Task3/infotec_train_dev"
basename="SemEval2018-T3-{0}_{1}.json"
train_files=[os.path.join(dirname,basename.format(task,'train')) for task in tasks]
test_files=[os.path.join(dirname,basename.format(task,'test')) for task in tasks]

train_json=load_files(train_files)
test_json=load_files(test_files)

Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_train.json
Size 2683
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_train.json
Size 2683
Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_test.json
Size 1151
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_test.json
Size 1151


## Datos de polaridad

Cargando datos de polaridad, una sola tarea

In [3]:
train_polarity_json=load_files(["../extras/En.json"])[0]

Opening files
../extras/En.json
Size 45748


In [4]:
# Configurando datos de ironía

from nltk.tokenize.casual import TweetTokenizer
nltk_tok=TweetTokenizer(reduce_len=True)

data_train=[]
data_test=[]
for i,t in enumerate(tasks):
    text_train=[" ".join(nltk_tok.tokenize(j['text'])) for j in train_json[i]]
    class_train=[j['klass'] for j in train_json[i]]
    data_train.append(list(zip(text_train,class_train)))
    text_test=[" ".join(nltk_tok.tokenize(j['text'])) for j in test_json[i]]
    class_test=[j['klass'] for j in test_json[i]]
    data_test.append(list(zip(text_test,class_test)))


for i,t in enumerate(data_train):
    print("Size:",len(data_train[i]))
    print("Size:",len(data_test[i]))
    
# Configurando datos de polaridad

def pol2class(k):
    onehot=[[1,0,0],[0,1,0],[0,0,1]]
    return onehot[['neutral','positive','negative'].index(k)]

text_polarity_train=[" ".join(nltk_tok.tokenize(j['text'])) for j in train_polarity_json]
class_polarity_train=[pol2class(j['klass']) for j in train_polarity_json]
data_polarity_train=list(zip(text_polarity_train,class_polarity_train))

print("Size:",len(data_polarity_train))


Size: 2683
Size: 1151
Size: 2683
Size: 1151
Size: 45748


In [5]:
from keras.preprocessing.text import Tokenizer
  

def normalizeText(tweet):
    #tweet = re.sub(r'#(S|s)arcasm|#(I|i)rony','',tweet)
    #tweet = re.sub(r'#SARCASM|#IRONY','',tweet)
    tweet = re.sub(r'https?://t\.co/.(\w|\d)+','http', tweet) #tweet link
    tweet = re.sub(r'fb\.me/.(\w|\d)+','fb', tweet) #tweet link
    tweet = re.sub(r'https?://.+','http', tweet) #tweet link
    #tweet = re.sub(r'@.\w*','@',tweet)
    #tweet = re.sub(r'#','',tweet)
    return tweet

def buildTokenizer(tweets):
    
    tokenizer = Tokenizer(num_words=max_features,lower=False, filters='\t\n', split=" ",char_level=True )
    tokenizer.fit_on_texts([" ".join(nltk_tok.tokenize(t)) for t in tweets])
    return tokenizer

def text2seq(tok,tweet):
    return tok.texts_to_sequences([normalizeText(tweet)])[0]

In [6]:
toks=[]
maps_=[]
for i,t in enumerate(tasks):
    tweets=[t for t,c in data_train[i]]
    tweets_=[t for t,c in data_polarity_train]
    toks.append(buildTokenizer([normalizeText(t) for t in tweets+tweets_]))
    maps_.append({v: k for k, v in toks[-1].word_index.items()})
    print("word_counts",len(toks[-1].word_counts))
    print("word_docs",len(toks[-1].word_docs))
    print("word_index",len(toks[-1].word_index))
    print("Top words\n",", ".join(["{0} ({1}) ".format(maps_[-1][f+1],toks[i].word_counts[maps_[-1][f+1]]) for f in range(50)]))
    print("Last words\n",", ".join(["{0} ({1}) ".format(maps_[-1][f],toks[i].word_counts[maps_[-1][f]]) for f in range(len(maps_[i])-50,len(maps_[i]))]))

word_counts 135
word_docs 135
word_index 135
Top words
   (1009790) , e (410768) , t (350042) , a (326811) , o (313591) , n (266978) , i (256924) , r (225927) , s (221057) , h (189295) , l (156551) , d (134555) , u (112173) , y (104607) , m (104043) , g (88165) , c (88139) , w (77363) , p (70882) , . (68693) , f (64999) , b (61752) , k (45592) , v (36452) , S (33038) , I (30219) , T (26785) , ' (25133) , , (23257) , M (22501) , A (22400) , C (21282) , B (19320) , @ (19209) , ! (16269) , D (16146) , 1 (15610) , W (15076) , N (14801) , # (14525) , R (14434) , P (14226) , F (14016) , H (13373) , O (12814) , L (12536) , G (12415) , E (12021) , 2 (11765) , : (10709) 
Last words
 > (311) , < (292) , % (272) , = (258) , ] (207) , [ (206) , ~ (200) , ^ (142) , ` (35) , \ (33) , ️ (25) , } (9) , – (8) , { (7) , £ (5) , 
 (5) , ’ (3) , ☆ (3) , ・ (3) , — (2) , Ｏ (2) , Ｌ (2) , İ (2) , ℃ (2) , 你 (2) , ï (1) , 󾍁 (1) , ° (1) , ง (1) , ว (1) , ย (1) , Ｆ (1) , Ｗ (1) , « (1) , ó (1) , í (1) , ⁰ (1) , Ã 

In [7]:
def ir2class(k):
    onehot=[[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0],[0,0,0,1,0],[0,0,0,0,1]]
    return onehot[['0','1','2',"3","4"].index(k)]

X_train=[]
y_train=[]
X_test=[]
y_test=[]
for i,t in enumerate(tasks):
    X_train.append([text2seq(toks[i],t) for t,c in data_train[i]])
    if t=="taskA":
        y_train.append([int(c) for t,c in data_train[i]])
    else:
        y_train.append([ir2class(c) for t,c in data_train[i]])
    X_test.append([text2seq(toks[i],t) for t,c in data_test[i]])
    if t=="taskA":
        y_test.append([int(c) for t,c in data_test[i]])
    else:
        y_test.append([ir2class(c) for t,c in data_test[i]])
        

    print("Example train in",t)
    #print("\n".join([" ".join([str(w) for w in S]) for S in X_train[i][:5]]))
    print("\n".join([t for t,c in data_train[i][:5]]))
    print()
    print("\n".join([" ".join([maps_[i][w] for w in S if w]) for S in X_train[i][:5]]))
    print()
    print([c for c in y_train[i][:5]])
    print()




Example train in taskA
The mouse's first incepted memory was just the sound : BRAAAWWWP !
I LOVE not sleeping . It's the best .
Religion is unfounded , else , Allah would have saved the kids . . @tariqmushtaqkh @nicpradhan #PeshawarAttack #PakSchoolSiege
Love how I came into work at 8 because Charlie said we were busy ... 3 people in 45 minutes , yeah we got this place packed Charlie .
Thx for catching on #urock

T h e   m o u s e ' s   f i r s t   i n c e p t e d   m e m o r y   w a s   j u s t   t h e   s o u n d   :   B R A A A W W W P   !
I   L O V E   n o t   s l e e p i n g   .   I t ' s   t h e   b e s t   .
R e l i g i o n   i s   u n f o u n d e d   ,   e l s e   ,   A l l a h   w o u l d   h a v e   s a v e d   t h e   k i d s   .   .   @ t a r i q m u s h t a q k h   @ n i c p r a d h a n   # P e s h a w a r A t t a c k   # P a k S c h o o l S i e g e
L o v e   h o w   I   c a m e   i n t o   w o r k   a t   8   b e c a u s e   C h a r l i e   s a i d   w e   w e r e   b u s

In [8]:
X_polarity_train=[]
y_polarity_train=[]
for i,t in enumerate(tasks):
    X_polarity_train.append([text2seq(toks[i],t) for t,c in data_polarity_train])
    y_polarity_train.append([c for t,c in data_polarity_train])
    
print("Example train")
print("\n".join([" ".join([maps_[i][w] for w in S if w]) for S in X_polarity_train[0][:5]]))
print(y_polarity_train[0][:5])

Example train
F i o r i n a   b l a s t s   C l i n t o n   ,   a s k s   w h y   T r u m p   i s   M I A   -   R e p u b l i g a n   h o p e f u l   C a r l y   F i o r i n a   s a i d   M o n d a y   t h a t   s h e   f e e l s   l i k e   t   . . .   h t t p
@ k s i z z l e N J   I ' m   d o w n   f o r   t h a t   o b v i o u s l y   ,   b u t   d o e s   N i n t e n d o   w a n t   t h a t   ?   T h e   w o r l d   m a y   n e v e r   k n o w
@ l u g a s m c e m   T r u e   ,   r i c h t   n o w   t h e y   h a v e   t h e   b e s t   s q u a d   .   B u t   i f   M i l a n   g a n   p u l l   a   f e w   t r a n s f e r s   r i c h t   i n   J a n u a r y   ,   t h e r e ' s   h o p e   f o r   t o p   3   .
# E M A   # M A S S I V E A C T I O N   t h i s   S u n d a y   a t   1 2   N o o n   i n   V e n i g e   B e a g h   a t   R o s e   A v e   &   O g e a n   F r o n t   W a l k   o n   t h e   s a n d   !   F r e e   . . .   h t t p
f u l l   r e s u l t s   f r o m   E l d 

In [9]:
def build_model(task):
    
    inputs = Input(shape=(maxlen,))
    embeedings = Embedding(max_features, embedding_size)(inputs)
    dropout=SpatialDropout1D(0.25)(embeedings)
    
    conv1 = Conv1D(16,
                 3,
                 padding='same',
                 activation='relu',
                 strides=1)(dropout)
    maxpool1 = MaxPooling1D(pool_size=pool_size)(conv1)
    conv2 = Conv1D(16,
                 5,
                 padding='same',
                 activation='relu',
                 strides=1)(dropout)
    maxpool2 = MaxPooling1D(pool_size=pool_size)(conv2)
    conv3= Conv1D(16,
                 7,
                 padding='same',
                 activation='relu',
                 strides=1)(dropout)
    maxpool3= MaxPooling1D(pool_size=pool_size)(conv3)
    conv4= Conv1D(16,
                 9,
                 padding='same',
                 activation='relu',
                 strides=1)(dropout)
    maxpool4=MaxPooling1D(pool_size=pool_size)(conv4)
    
    concatenate = Maximum()([maxpool1,maxpool2,maxpool3,maxpool4])
    bidirectional =  Bidirectional(LSTM(lstm_output_size))(concatenate)
    if task=="taskA":
        irony=Dense(1, activation='sigmoid')(bidirectional)
    if task=="taskB":
        irony=Dense(5, activation='softmax')(bidirectional)
    
    polarity=Dense(3, activation='softmax')(bidirectional)
    model_irony = Model(inputs=inputs, outputs=irony)
    model_polarity = Model(inputs=inputs, outputs=polarity)
    return model_polarity, model_irony

In [None]:
models=[]
for i,t in enumerate(tasks):
    print("Creating model task",t)
    models.append(build_model(t))

Creating model task taskA
Creating model task taskB


In [None]:
scores=[]

from sklearn.metrics import classification_report, f1_score
import numpy as np

    
for i,t in enumerate(tasks):
    model_pol=models[i][0]
    model_pol.summary()
    
    model_pol.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    X_polarity_train_=sequence.pad_sequences(X_polarity_train[i], maxlen=maxlen)
   
    model_pol.fit(X_polarity_train_,y_polarity_train[i],
                  batch_size=batch_size,
                  epochs=epochs_polarity,
                  validation_split=0.1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 156, 32)      3200        input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 156, 32)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 156, 16)      1552        spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv1d_2 (

In [None]:


for i,t in enumerate(tasks):
    print("Evaluating task",t)
    model_ir=models[i][1]

    model_ir.summary()
    
    if t=="taskA":
        model_ir.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        model_ir.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    X_train_=sequence.pad_sequences(X_train[i], maxlen=maxlen)
    X_test_=sequence.pad_sequences(X_test[i], maxlen=maxlen)
    

    for it in range(epochs_irony):
        model_ir.fit(X_train_,y_train[i],
                  batch_size=batch_size,
                  epochs=1,
                  validation_split=0)
    
    
        #score, acc = model.evaluate(X_test_,y_test[i],batch_size=batch_size)
    
        y_test_ = model_ir.predict(X_test_, batch_size=batch_size, verbose=1)
        if t=="taskA":
            y_test_ = np.round(y_test_)
            ori = y_test[i]
        else:
            y_test_ = np.argmax(y_test_, axis=1)
            ori = np.argmax(y_test[i],axis=1)
        
        print(classification_report(ori, y_test_))
        print("Macro f-score:", f1_score(ori, y_test_, average="macro"))
    
   