In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Add
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def create_corpus(tweets, remove_stopwords=False):
    """Tokenize every tweet into a list of lower-cased tokens.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts; each one is split with ``nltk.casual_tokenize``.
    remove_stopwords : bool, default False
        When True, keep only purely alphabetic tokens that are not English
        stop words. The default keeps every token, which is what the
        downstream tokenizer / embedding lookup in this notebook uses.

    Returns
    -------
    list of list of str
        One token list per tweet, in input order.
    """
    # Load the stop-word list only when it is actually used, so the default
    # path does not depend on the NLTK stop-word corpus being installed.
    stop = set(stopwords.words('english')) if remove_stopwords else None

    corpus = []
    for tweet in tqdm(tweets):
        words = [word.lower() for word in nltk.casual_tokenize(tweet)]
        if remove_stopwords:
            # Filter after lower-casing so capitalised stop words are caught.
            words = [word for word in words if word.isalpha() and word not in stop]
        corpus.append(words)
    return corpus

# Tokenized training tweets: one list of lower-cased tokens per tweet.
corpus = create_corpus(olid['tweet_cleaned'])

# Load the pretrained GloVe vectors into a {token: float32 vector} lookup.
# Alternative vector file: './data/glove6B/glove.6B.100d.txt'
embedding_dict = {}
# Pass the encoding explicitly: GloVe text files are UTF-8, and relying on the
# platform default encoding can fail or mis-decode tokens (e.g. on Windows).
with open('./data/gloveTwitter27B/glove.twitter.27B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines instead of failing on values[0].
            continue
        # Fields are separated by single spaces. A bare split() would also
        # split on any other Unicode whitespace, which misaligns the row when
        # the token itself is such a character.
        values = line.rstrip('\n').split(' ')
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors
# No explicit close() needed: the with-block has already closed the file.

MAX_LEN = 50           # every tweet is padded / truncated to this many tokens
filter_sizes = (3, 8)  # kernel sizes of the Conv1D layers
num_filters = 20       # filters per Conv1D layer
embedding_dim = 100    # must match the width of the vectors in embedding_dict


# `corpus` is already tokenized (a list of token lists). The Keras Tokenizer
# accepts that form and treats every list element as one token, so it is used
# here only to build the token -> integer index mapping.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')

word_index = tokenizer_obj.word_index
print('Number of unique words:', len(word_index))

# Tokenizer indices start at 1 (0 is the padding value), hence the +1 so that
# the largest index is a valid row of the matrix.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

# Copy the pretrained vector of every known word into its row; words without
# a pretrained vector keep an all-zero row. Every index is < num_words by
# construction, so no bounds check is needed.
for word, i in tqdm(word_index.items()):
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec
        
def _conv_branches(embedded):
    """Return one Conv1D output per kernel size in `filter_sizes`, all fed by `embedded`."""
    return [
        Conv1D(filters=num_filters,
               kernel_size=fsz,
               padding='valid',
               activation='tanh',
               use_bias=True)(embedded)
        for fsz in filter_sizes
    ]


# channel 1: embeddings initialised from the pretrained matrix and fine-tuned
inputs1 = Input(shape=(MAX_LEN,), name="trainable")
embedding1 = Embedding(num_words, embedding_dim,
                       embeddings_initializer=Constant(embedding_matrix),
                       input_length=MAX_LEN, trainable=True)(inputs1)
convs1 = _conv_branches(embedding1)

# channel 2: same initial embeddings, kept frozen during training
inputs2 = Input(shape=(MAX_LEN,), name="fixed")
embedding2 = Embedding(num_words, embedding_dim,
                       embeddings_initializer=Constant(embedding_matrix),
                       input_length=MAX_LEN, trainable=False)(inputs2)
convs2 = _conv_branches(embedding2)

# For each kernel size, sum the feature maps of the two channels and
# max-pool over the sequence axis. Pairing with zip keeps this in step with
# `filter_sizes` instead of assuming exactly two kernel sizes.
pooled = [
    GlobalMaxPooling1D()(Add()([conv_trainable, conv_fixed]))
    for conv_trainable, conv_fixed in zip(convs1, convs2)
]

# Concatenate needs at least two inputs; with a single kernel size the lone
# pooled tensor is used directly.
out = Concatenate()(pooled) if len(pooled) > 1 else pooled[0]

outputs = Dense(1, activation='sigmoid', name="output")(out)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
optimizer = Adam(learning_rate=1e-4)
optimzer = optimizer  # backward-compatible alias for the previous (misspelled) name
# compile
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()



In [None]:
# Import from the public `keras.utils` namespace; the `keras.utils.vis_utils`
# submodule is not part of the public API and is not available in every
# Keras release.
from keras.utils import plot_model

# Write a diagram of the layer graph, including tensor shapes, to a PNG file.
plot_model(model, show_shapes=True, to_file='multichannel.png')

In [None]:
# Encode the subtask-A labels as integers.
le = LabelEncoder()
y = le.fit_transform(olid['subtask_a'])

# Hold out a third of the training data for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(tweet_pad, y, test_size=0.33, random_state=42)

print('Shape of train',X_train.shape)
print("Shape of Validation ",X_valid.shape)

# Stop once val_loss has not improved by at least min_delta for `patience`
# epochs, and roll back to the best epoch's weights; without the rollback the
# metrics below would be computed on weights from after the best epoch.
callback = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.005,
                         restore_best_weights=True)

# Both input channels receive the same padded sequences. Inputs and targets
# are passed by layer name everywhere (training and validation) so nothing
# depends on the positional order of the model's inputs.
history = model.fit({"fixed": X_train, "trainable": X_train},
                    {"output": y_train},
                    batch_size=512,
                    epochs=200,
                    validation_data=({"fixed": X_valid, "trainable": X_valid},
                                     {"output": y_valid}),
                    shuffle=True,
                    verbose=2,
                    callbacks=[callback])

proba_valid = model.predict({'fixed':X_valid,'trainable':X_valid})
proba_train = model.predict({'fixed':X_train,'trainable':X_train})

# Rounding the sigmoid output gives the hard 0/1 prediction for the report;
# AUC is computed on the raw probabilities.
print(classification_report(y_train, np.round(proba_train)))
print('AUC: ', roc_auc_score(y_train, proba_train))

print(classification_report(y_valid, np.round(proba_valid)))
print('AUC: ',  roc_auc_score(y_valid, proba_valid))



In [None]:
# Re-fit the label encoder on the training labels so the test labels are
# mapped with exactly the same class -> integer assignment.
le = LabelEncoder()
y = le.fit_transform(olid['subtask_a'])

y_test = le.transform(olid_test['subtask_a'])
corpus_test = create_corpus(olid_test['tweet_cleaned'])
# Reuse the tokenizer fitted on the training corpus so test tokens get the
# same indices; tokens it has never seen are dropped from the sequences.
sequences_test = tokenizer_obj.texts_to_sequences(corpus_test)

X_test = pad_sequences(sequences_test,maxlen=MAX_LEN,truncating='post',padding='post')

# Early stopping must not look at the test set: choosing the stopping epoch
# from test loss leaks test labels into model selection and biases the test
# metrics reported below. Monitor a small stratified dev slice of the
# training data instead and keep the test set for the final evaluation only.
X_fit, X_dev, y_fit, y_dev = train_test_split(
    tweet_pad, y, test_size=0.1, random_state=42, stratify=y)

callback = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.005,
                         restore_best_weights=True)

# NOTE(review): `model` is not rebuilt in this cell, so training continues
# from whatever weights earlier fits left behind (and the dev slice may
# contain tweets already seen in those fits). Re-run the model-definition
# cell first if a from-scratch final model is intended.
history = model.fit({"fixed": X_fit, "trainable": X_fit},
                    {"output": y_fit},
                    batch_size=512,
                    epochs=100,
                    validation_data=({"fixed": X_dev, "trainable": X_dev},
                                     {"output": y_dev}),
                    shuffle=True,
                    verbose=2,
                    callbacks=[callback])



print(X_test.shape, y_test.shape)
preds_test = model.predict({"fixed":X_test,"trainable":X_test})
print(classification_report(y_test, np.round(preds_test)))

print('AUC: ',  roc_auc_score(y_test, preds_test))