In [1]:
import json
import os.path
import codecs
import re

In [2]:
def load_files(files):
    """Load a list of JSON-lines files.

    Each file is expected to hold one JSON document per line. Progress is
    printed as the files are read ("Opening files", each filename, and the
    number of records loaded from it).

    Args:
        files: iterable of file paths to read.

    Returns:
        A list with one entry per input file, in the same order; each entry
        is the list of decoded JSON objects from that file, in line order.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a non-blank line is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    json_files = []
    print("Opening files")
    for filename in files:
        print(filename)
        # Explicit UTF-8 so decoding does not depend on the platform default,
        # and a context manager so the handle is closed even if a line fails
        # to parse.
        with open(filename, encoding="utf-8") as handle:
            # Blank lines (e.g. a trailing empty line) carry no record.
            data = [json.loads(line) for line in handle if line.strip()]
        json_files.append(data)
        print("Size", len(data))
    return json_files


# Sub-tasks handled by this notebook; each one has its own train/test file.
tasks = ["taskA", "taskB"]
dirname = "../SemEval2018-Task3/infotec_train_dev"
# {0} is the task name, {1} is the split ('train' or 'test').
basename = "SemEval2018-T3-{0}_{1}.json"

# Build the per-task file paths, keeping the same order as `tasks`.
train_files = []
test_files = []
for task in tasks:
    train_files.append(os.path.join(dirname, basename.format(task, 'train')))
    test_files.append(os.path.join(dirname, basename.format(task, 'test')))

# One list of decoded records per task, for each split.
train_json = load_files(train_files)
test_json = load_files(test_files)


Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_train.json
Size 2683
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_train.json
Size 2683
Opening files
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskA_test.json
Size 1151
../SemEval2018-Task3/infotec_train_dev/SemEval2018-T3-taskB_test.json
Size 1151


In [3]:
# For every task, pair each tweet's text with its label ('klass' field),
# separately for the train and test splits.
data_train = []
data_test = []
for i in range(len(tasks)):
    text_train = [record['text'] for record in train_json[i]]
    class_train = [record['klass'] for record in train_json[i]]
    data_train.append(list(zip(text_train, class_train)))

    text_test = [record['text'] for record in test_json[i]]
    class_test = [record['klass'] for record in test_json[i]]
    data_test.append(list(zip(text_test, class_test)))


# Report, per task, the number of (text, label) pairs in train and then test.
for idx in range(len(data_train)):
    print("Size:", len(data_train[idx]))
    print("Size:", len(data_test[idx]))

Size: 2683
Size: 1151
Size: 2683
Size: 1151


In [4]:
from keras.preprocessing.text import Tokenizer

def normalizeText(tweet):
    """Normalize a raw tweet string.

    Steps, applied in order:
      1. Remove the label hashtags ``#sarcasm`` / ``#irony`` in any casing.
      2. Replace ``t.co`` short links with the placeholder ``http://link``.
      3. Collapse user mentions (``@name``) to a bare ``@``.
      4. Strip the ``#`` from the remaining hashtags, keeping the word.

    Args:
        tweet: the tweet text.

    Returns:
        The normalized text.
    """
    # Case-insensitive so mixed-case variants (e.g. "#sArCaSm") are removed too.
    tweet = re.sub(r'#(?:sarcasm|irony)', '', tweet, flags=re.IGNORECASE)
    # Only word characters belong to the short-link id; a leading wildcard
    # would let whitespace/punctuation pull the following word into the link.
    tweet = re.sub(r'https?://t\.co/\w+', 'http://link', tweet)  # tweet link
    # Require the handle to start right after '@' so a free-standing '@'
    # (as in "meet @ noon") does not swallow the next word.
    tweet = re.sub(r'@\w+', '@', tweet)
    tweet = tweet.replace('#', '')
    return tweet

def buildTokenizer(tweets, num_words=20000):
    """Fit a Keras ``Tokenizer`` on a collection of tweets.

    The tokenizer is case-sensitive (``lower=False``). The filter string
    deliberately leaves out '#', '@', '/' and ':' so those characters are
    not stripped from the text.

    Args:
        tweets: texts to fit the vocabulary on.
        num_words: value passed as the tokenizer's ``num_words``. The default
            (20000) is the value previously hard-coded here and equals
            ``max_features`` used for the embedding layer in this notebook.

    Returns:
        The fitted ``Tokenizer``.
    """
    tokenizer = Tokenizer(num_words=num_words, lower=False, filters='!"$%&()*+,-.;<=>?[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(tweets)
    return tokenizer

def text2seq(tok,tweet):
    """Encode a single tweet with ``tok`` and return its sequence.

    ``tok.texts_to_sequences`` works on a batch, so the tweet is wrapped in a
    one-element list and the single result is unwrapped again.
    """
    sequences = tok.texts_to_sequences([tweet])
    return sequences[0]

Using Theano backend.


In [5]:
# Fit one tokenizer per task, using only that task's training tweets.
# NOTE(review): normalizeText is defined above but is not applied to the
# tweets here — confirm whether tokenizing the raw text is intended.
toks = []
for i in range(len(tasks)):
    tweets = [text for text, _label in data_train[i]]
    toks.append(buildTokenizer(tweets))

In [6]:
# Per task: encode every tweet with that task's tokenizer and keep the labels
# in a parallel list. Index i in each list corresponds to tasks[i].
X_train, y_train = [], []
X_test, y_test = [], []
for i in range(len(tasks)):
    X_train.append([text2seq(toks[i], text) for text, _label in data_train[i]])
    y_train.append([label for _text, label in data_train[i]])
    X_test.append([text2seq(toks[i], text) for text, _label in data_test[i]])
    y_test.append([label for _text, label in data_test[i]])

# Show the first encoded training tweet of the first task as a sanity check.
print("Example train")
print(X_train[0][0])

Example train
[35, 3054, 173, 3055, 959, 29, 26, 1, 3056, 3057]


In [7]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional
from keras.layers import LSTM

In [8]:
# Hyperparameters shared by build_model() and the training loop.

# Input dimension of the Embedding layer; same value as the Tokenizer's
# num_words in buildTokenizer.
max_features = 20000
# Length passed to sequence.pad_sequences for both train and test sequences.
maxlen = 20
# Output dimension of the Embedding layer.
embedding_size = 128
# Units of the LSTM that is wrapped in Bidirectional.
lstm_output_size = 128
# Batch size used for both model.fit and model.evaluate.
batch_size = 32
# Number of training epochs passed to model.fit.
epochs = 20

In [9]:
def build_model():
    """Build the (uncompiled) classifier used for every task.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dense(1, sigmoid).
    Sizes come from the notebook-level hyperparameters ``max_features``,
    ``embedding_size`` and ``lstm_output_size``.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    embedding = Embedding(max_features, embedding_size)
    recurrent = Bidirectional(
        LSTM(
            lstm_output_size,
            activation='tanh',
            recurrent_activation='hard_sigmoid',
            dropout=0.2,
            recurrent_dropout=0.2,
        )
    )
    classifier = Dense(1, activation='sigmoid')

    # Layers are stacked in the order listed.
    return Sequential([embedding, recurrent, classifier])

In [11]:
# Train and evaluate a fresh model for each task; `scores` collects the test
# accuracy of each task as a percentage, in the order of `tasks`.
# NOTE(review): the same single-sigmoid / binary_crossentropy setup is used
# for every task — confirm that the labels of each task are binary.
scores = []
for i, t in enumerate(tasks):
    print("Evaluating task", t)

    model = build_model()
    model.summary()
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # Bring every sequence to the common length `maxlen`.
    X_train_ = sequence.pad_sequences(X_train[i], maxlen=maxlen)
    X_test_ = sequence.pad_sequences(X_test[i], maxlen=maxlen)

    # 10% of the training data is held out for validation during fitting.
    model.fit(
        X_train_,
        y_train[i],
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.1,
    )

    # evaluate() yields the loss followed by the compiled metric (accuracy).
    score, acc = model.evaluate(X_test_, y_test[i], batch_size=batch_size)
    scores.append(acc * 100)

Evaluating task taskA
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 2,823,425
Trainable params: 2,823,425
Non-trainable params: 0
_________________________________________________________________
Train on 2414 samples, validate on 269 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


NameError: name 'scores' is not defined