In [None]:
# Path of the test CSV; read once by the baseline section and once by the neural-network section.
test_data_location = 'data/test_tOlRoBf.csv'

# --------------------------------------------------
# Baseline Model
# --------------------------------------------------

## Data

In [None]:
import pandas as pd
import numpy as np

import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Test rows for the baseline section (labels are not used here; a placeholder is added later).
t = pd.read_csv(test_data_location)

In [None]:
# Training rows for the baseline section, including the `sentiment` target column.
dt = pd.read_csv('data/train_F3WbcTw.csv')

In [None]:
# Give the test rows a placeholder label and tag every row with its origin,
# so train and test can be encoded together and split apart again afterwards.
t = t.assign(sentiment=-1).assign(flag='test')
dt = dt.assign(flag='train')

dt_comb = pd.concat([dt, t])

print(dt_comb.shape)

## Vectorize

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings; fitted on the training text further down.
vectorizer = CountVectorizer()

In [None]:

# One-hot encode `drug` on the combined frame so train and test share the same dummy columns.
dt_comb = pd.get_dummies(dt_comb, columns=['drug'])

# Names of the generated dummy columns (they all carry the 'drug_' prefix).
drug_columns = [name for name in map(str, dt_comb.columns) if name.startswith('drug_')]


In [None]:

# Split the combined frame back into train / test using the origin flag.
feature_frame = dt_comb[['text'] + drug_columns]
train_rows = dt_comb['flag'] == 'train'
test_rows = dt_comb['flag'] == 'test'

X_train = feature_frame[train_rows]
y_train = dt_comb['sentiment'][train_rows]
X_test = feature_frame[test_rows]


In [None]:
# Raw text of each split as plain arrays, the input handed to the vectorizer.
sentences_train = X_train['text'].values

sentences_test  = X_test['text'].values

In [None]:
# Learn the vocabulary from the training text only; the test text is just transformed later.
vectorizer.fit(sentences_train)

In [None]:
# Bag-of-words counts for both splits, using the vocabulary fitted on the training text.
X_train_cv = vectorizer.transform(sentences_train)
X_test_cv  = vectorizer.transform(sentences_test)

In [None]:
X_train_cv

In [None]:
X_test_cv

In [None]:
# Drug dummy columns for each split, re-indexed from 0.
# Presumably done so the rows line up with the count-vector frames when the two
# are concatenated column-wise below; verify against those frames' index.
X_train_drugs = X_train[drug_columns].reset_index(drop = True)
X_test_drugs = X_test[drug_columns].reset_index(drop = True)

In [None]:
# Wrap the count matrices in sparse frames so they can be concatenated with the drug dummies.
# NOTE(review): `pd.SparseDataFrame` was deprecated in pandas 0.25 and removed in 1.0;
# on newer pandas this cell fails (`pd.DataFrame.sparse.from_spmatrix` is the documented
# replacement) - confirm the pinned pandas version before running.
# NOTE: rebinding `X_train_cv` / `X_test_cv` makes this cell non-idempotent.
X_train_cv = pd.SparseDataFrame(X_train_cv, default_fill_value=0)
X_test_cv = pd.SparseDataFrame(X_test_cv, default_fill_value=0)

print(X_train_cv.shape)
print(X_test_cv.shape)

In [None]:
# Show the shape of each part, then join text counts and drug dummies side by side.
for train_part in (X_train_cv, X_train_drugs):
    print(train_part.shape)

X_train_full = pd.concat([X_train_cv, X_train_drugs], axis=1)
print(X_train_full.shape)
X_train_full.head(2)

In [None]:
# Same assembly as for the training features, applied to the test split.
for test_part in (X_test_cv, X_test_drugs):
    print(test_part.shape)

X_test_full = pd.concat([X_test_cv, X_test_drugs], axis=1)
print(X_test_full.shape)
X_test_full.head(2)

## Train

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with scikit-learn's default settings.
model = LogisticRegression()

In [None]:

# Fit the baseline on counts + drug dummies, then predict a label for every test row.
model.fit(X_train_full, y_train)

y_hat = model.predict(X_test_full)


In [None]:
pd.DataFrame(y_hat, columns=['y_hat']).head(1)

In [None]:
# Attach the predictions to the test rows by position, keep the identifying
# columns, and expose the prediction under the name `sentiment`.
predicted = pd.DataFrame(y_hat, columns=['y_hat'])
submission = (
    pd.concat([t, predicted], axis=1)[['unique_hash', 'drug', 'y_hat']]
    .rename(columns={'y_hat': 'sentiment'})
)
submission.head()

In [None]:
submission['sentiment'].value_counts()

In [None]:
# Keep a handle on the baseline predictions; `submission` is rebound by the next section.
s1 = submission

# --------------------------------------------------
# Deep and Wide Neural Network | LSTM + GRU + Pretrained Embeddings
# --------------------------------------------------

In [None]:

# import gensim

import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import LSTM, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

import sys
from os.path import dirname
#sys.path.append(dirname(dirname(__file__)))
from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K

import spacy


In [None]:
# !python -m spacy download en_core_web_lg

In [None]:
def words(text):
    "Lower-case `text` and return its runs of word characters as a list of strings."
    # Imported locally because the notebook's import cell does not import `re`;
    # without this the function raises NameError when called.
    import re
    return re.findall(r'\w+', text.lower())
def P(word):
    "Score used to rank spelling candidates: the negated WORDS entry for `word`."
    # WORDS maps a word to its rank, so negating it makes lower ranks score higher.
    # A word that is missing from WORDS is treated as rank 0.
    rank = WORDS.get(word, 0)
    return -rank
def correction(word):
    "Return the spelling candidate for `word` with the highest P score."
    options = candidates(word)
    return max(options, key=P)
def candidates(word):
    "Return `word` itself if known, else its known one-edit variants, else `[word]`."
    exact = known([word])
    if exact:
        return exact
    nearby = known(edits1(word))
    if nearby:
        return nearby
    # Nothing recognised: fall back to the input unchanged.
    return [word]
def known(words):
    "Return the set of entries of `words` that are keys of WORDS."
    return {entry for entry in words if entry in WORDS}
def edits1(word):
    "Return the set of strings one delete, transpose, replace or insert away from `word`."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    # Walk every cut position, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # delete the character right after the cut
            results.add(head + tail[1:])
            if len(tail) > 1:
                # swap the two characters right after the cut
                results.add(head + tail[1] + tail[0] + tail[2:])
        for ch in alphabet:
            if tail:
                # replace the character right after the cut
                results.add(head + ch + tail[1:])
            # insert a character at the cut
            results.add(head + ch + tail)
    return results
def edits2(word):
    "Return a generator over every string reachable by two successive edits of `word`."
    # Duplicates are not removed; the second-level edits are produced lazily.
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
def singlify(word):
    "Collapse every run of identical consecutive characters in `word` to a single character."
    kept = []
    for position, letter in enumerate(word):
        # Keep the first character, and any character that differs from its predecessor.
        if position == 0 or letter != word[position - 1]:
            kept.append(letter)
    return "".join(kept)

In [None]:
# The notebook's import cell has `import gensim` commented out, so import it here;
# otherwise the next line raises NameError.
import gensim

spell_model = gensim.models.KeyedVectors.load_word2vec_format('embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec')

# Map every vocabulary entry to its position in the model's word list; the spelling
# helpers use this position as a rank (see `P`).
# The word list is deliberately not stored in a global called `words`, because that
# would overwrite the `words()` helper function defined above.
w_rank = {word: rank for rank, word in enumerate(spell_model.index2word)}
WORDS = w_rank

In [None]:
def load_glove(word_dict, lemma_dict):
    """Build an embedding matrix for `word_dict` from the GloVe 840B 300-d text file.

    For every token the lookup tries, in order: the token as-is, lower-cased,
    upper-cased, capitalised, its Porter / Lancaster / Snowball stems, its lemma
    and (for tokens longer than one character) its spelling correction. The first
    form found in the embedding file supplies the vector; if none is found the
    row is filled with -1.

    Parameters
    ----------
    word_dict : dict
        Maps each token string to the integer row it occupies in the matrix.
    lemma_dict : dict
        Maps each token string to its lemma; needs an entry for every key of
        `word_dict` that is not resolved by an earlier lookup step.

    Returns
    -------
    tuple
        `(embedding_matrix, nb_words)` where `embedding_matrix` is a float32
        array of shape `(len(word_dict) + 1, 300)` and `nb_words` is its row
        count. Rows not referenced by `word_dict` stay all-zero.
    """
    EMBEDDING_FILE = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Read inside a `with` block so the handle is closed once the index is built,
    # and decode explicitly as UTF-8 instead of relying on the platform default.
    with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])

    def lookup_forms(key):
        # Lazily yield the forms of `key` to try, cheapest first, so stemming and
        # spelling correction only run when the earlier forms were not found.
        yield key
        yield key.lower()
        yield key.upper()
        yield key.capitalize()
        yield ps.stem(key)
        yield lc.stem(key)
        yield sb.stem(key)
        yield lemma_dict[key]
        if len(key) > 1:
            yield correction(key)

    for key in tqdm(word_dict):
        found = (embeddings_index.get(form) for form in lookup_forms(key))
        # First vector that exists wins; otherwise fall back to the unknown vector.
        embedding_matrix[word_dict[key]] = next(
            (vector for vector in found if vector is not None),
            unknown_vector,
        )
    return embedding_matrix, nb_words

In [None]:
def load_para(word_dict, lemma_dict):
    """Build an embedding matrix for `word_dict` from the Paragram 300-d text file.

    For every token the lookup tries, in order: the token as-is, lower-cased,
    upper-cased, capitalised, its Porter / Lancaster / Snowball stems, its lemma
    and (for tokens longer than one character) its spelling correction. The first
    form found in the embedding file supplies the vector; if none is found the
    row is filled with -1.

    Parameters
    ----------
    word_dict : dict
        Maps each token string to the integer row it occupies in the matrix.
    lemma_dict : dict
        Maps each token string to its lemma; needs an entry for every key of
        `word_dict` that is not resolved by an earlier lookup step.

    Returns
    -------
    tuple
        `(embedding_matrix, nb_words)` where `embedding_matrix` is a float32
        array of shape `(len(word_dict) + 1, 300)` and `nb_words` is its row
        count. Rows not referenced by `word_dict` stay all-zero.
    """
    EMBEDDING_FILE = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    # Read inside a `with` block so the handle is closed once the index is built.
    # Undecodable bytes are dropped and lines of 100 characters or fewer are skipped.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])

    def lookup_forms(key):
        # Lazily yield the forms of `key` to try, cheapest first, so stemming and
        # spelling correction only run when the earlier forms were not found.
        yield key
        yield key.lower()
        yield key.upper()
        yield key.capitalize()
        yield ps.stem(key)
        yield lc.stem(key)
        yield sb.stem(key)
        yield lemma_dict[key]
        if len(key) > 1:
            yield correction(key)

    for key in tqdm(word_dict):
        found = (embeddings_index.get(form) for form in lookup_forms(key))
        # First vector that exists wins; otherwise fall back to the unknown vector.
        embedding_matrix[word_dict[key]] = next(
            (vector for vector in found if vector is not None),
            unknown_vector,
        )
    return embedding_matrix, nb_words

In [None]:
# Fresh copies of the train and test data for the neural-network section.
train = pd.read_csv('data/train_F3WbcTw.csv')
test  = pd.read_csv(test_data_location)

In [None]:
# All texts, training rows first, so one tokenisation pass covers both splits.
train_text, test_text = train['text'], test['text']
text_list = pd.concat([train_text, test_text])

# Integer labels, and the number of training rows (used to split the token sequences again).
y = train['sentiment'].values
num_train_data = len(y)

In [None]:
from keras.utils import to_categorical
# One-hot encode the labels for the softmax / categorical_crossentropy model.
# NOTE: `y` is rebound in place, so re-running this cell would encode the
# already-encoded array a second time.
y = to_categorical(y)

In [None]:
start_time = time.time()
print("Spacy NLP ...")
nlp = spacy.load('en_core_web_lg', disable=['parser','ner','tagger'])
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

# word_dict:  token text -> integer id (ids start at 1; 0 is left free for padding)
# lemma_dict: token text -> lemma, used as a fallback when looking up embeddings
word_dict = {}
word_index = 1
lemma_dict = {}
docs = nlp.pipe(text_list, n_threads = 2)
word_sequences = []
for doc in tqdm(docs):
    word_seq = []
    for token in doc:
        # Compare by value: the original `is not "PUNCT"` tested object identity,
        # which does not reliably detect equal strings.
        # NOTE(review): the pipeline is loaded with 'tagger' disabled, so `pos_`
        # may be empty for every token and nothing gets skipped here - confirm,
        # and consider `token.is_punct` if punctuation should really be dropped.
        if token.pos_ == "PUNCT":
            continue
        if token.text not in word_dict:
            word_dict[token.text] = word_index
            word_index += 1
            lemma_dict[token.text] = token.lemma_
        word_seq.append(word_dict[token.text])
    word_sequences.append(word_seq)
del docs
gc.collect()
# text_list holds the training rows first, so the first num_train_data sequences are train.
train_word_sequences = word_sequences[:num_train_data]
test_word_sequences = word_sequences[num_train_data:]
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Hyperparameters for the neural-network section
# Length every token sequence is padded / truncated to; also the text input width.
max_length = 1000
# Width of the combined embedding: two 300-d matrices concatenated side by side.
embedding_size = 600
# Learning rate passed to the Adam optimizer.
learning_rate = 0.001
# Mini-batch size passed to model.fit.
batch_size = 16
# Intended number of training epochs.
num_epoch = 9

In [None]:

# Bring every sequence to exactly max_length ids; padding is appended at the end ('post').
# NOTE: the names are rebound, so the unpadded lists are no longer available afterwards.
train_word_sequences = pad_sequences(train_word_sequences, maxlen=max_length, padding='post')
test_word_sequences = pad_sequences(test_word_sequences, maxlen=max_length, padding='post')


In [None]:
# GloVe vectors for the vocabulary; nb_words is the number of rows (vocabulary size + 1).
embedding_matrix_glove, nb_words = load_glove(word_dict, lemma_dict)

In [None]:
# Paragram vectors for the same vocabulary; nb_words is recomputed from the same word_dict.
embedding_matrix_para, nb_words = load_para(word_dict, lemma_dict)

In [None]:
# Place the two 300-d embeddings side by side, giving one 600-d vector per vocabulary row.
embedding_matrix = np.concatenate((embedding_matrix_glove, embedding_matrix_para), axis=1)

In [None]:
# Give the test rows a placeholder label and tag every row with its origin,
# so the drug column can be one-hot encoded on train and test together.
test = test.assign(sentiment=-1).assign(flag='test')
train = train.assign(flag='train')

dt_comb = pd.concat([train, test])

print(dt_comb.shape)

In [None]:
dt_comb.head(2)

In [None]:

# One-hot encode `drug` on the combined frame so train and test share the same dummy columns.
dt_comb = pd.get_dummies(dt_comb, columns=['drug'])

# Names of the generated dummy columns (they all carry the 'drug_' prefix).
drug_columns = [name for name in map(str, dt_comb.columns) if name.startswith('drug_')]


In [None]:

# Drug one-hot features for each split, recovered via the origin flag.
drug_frame = dt_comb[drug_columns]

drug_train = drug_frame[dt_comb['flag'] == 'train']
drug_test = drug_frame[dt_comb['flag'] == 'test']


In [None]:
print(drug_train.shape)
print(drug_test.shape)

In [None]:
drug_train.head(2)

In [None]:
# Drop the reference to the previously defined `model` before the name is reused for the Keras network.
# NOTE: raises NameError if `model` is not defined, e.g. when this section is run without the baseline section.
del model

In [None]:
# ---- Text branch: frozen pretrained embeddings -> BiLSTM -> BiGRU ----
inp1 = Input(shape=(max_length,))

x = Embedding(nb_words, embedding_size, weights=[embedding_matrix], trainable=False)(inp1)

x = SpatialDropout1D(0.3)(x)

x1 = Bidirectional(LSTM(256, return_sequences=True))(x)

x2 = Bidirectional(GRU(128, return_sequences=True))(x1)

# Max-pool both recurrent outputs over the sequence and join the results.
max_pool1 = GlobalMaxPooling1D()(x1)

max_pool2 = GlobalMaxPooling1D()(x2)

conc = Concatenate()([max_pool1, max_pool2])


# ---- Drug branch: two small dense layers over the drug one-hot columns ----
# The input width is taken from drug_train (which is what model.fit feeds into this
# input) instead of a hard-coded 111, so it stays correct if the set of drugs changes.
inp2 = Input(shape=(drug_train.shape[1],))

z = Dense(96, activation = 'relu', kernel_regularizer=regularizers.l2(0.01))(inp2)

z = Dense(64, activation = 'relu', kernel_regularizer=regularizers.l2(0.01))(z)


# ---- Head: join both branches and predict the three sentiment classes ----
conc2 = Concatenate()([conc, z])

predictions = Dense(3, activation='softmax')(conc2)

model = Model(inputs=[inp1,inp2], outputs=predictions)

In [None]:
model.summary()

In [None]:
import tensorflow as tf
import keras.backend as K

def f1(y_true, y_pred):
    """Keras metric: mean F1 over the output columns, computed on the given batch.

    Predictions are rounded to 0/1 first. True positives, false positives and
    false negatives are summed over axis 0 (presumably the batch axis, giving one
    count per class - confirm), turned into precision / recall / F1, and the
    per-column F1 values are averaged. `K.epsilon()` guards every denominator;
    any remaining NaN is replaced by 0.
    """
    hard_pred = K.round(y_pred)

    true_pos = K.sum(K.cast(y_true * hard_pred, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - hard_pred), 'float'), axis=0)

    precision = true_pos / (true_pos + false_pos + K.epsilon())
    recall = true_pos / (true_pos + false_neg + K.epsilon())

    per_column = 2 * precision * recall / (precision + recall + K.epsilon())
    per_column = tf.where(tf.is_nan(per_column), tf.zeros_like(per_column), per_column)
    return K.mean(per_column)

In [None]:
# Adam with the configured learning rate; cross-entropy on the one-hot labels,
# with the custom batch-level f1 reported as the training metric.
adam = optimizers.Adam(lr=learning_rate)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=[f1])

In [None]:

# Train on both inputs (token ids, drug one-hots), in the order the model declares them.
# The epoch count comes from the `num_epoch` hyperparameter (currently 9) rather than a
# hard-coded literal, so changing the hyperparameter cell actually takes effect.
history = model.fit(
                    [train_word_sequences,drug_train],
                    y,
                    batch_size=batch_size,
                    epochs=num_epoch,
                    verbose = 1
                   )


In [None]:
# The network was built with two inputs (token ids and drug one-hots), so `predict`
# must receive both, in the same order used in `model.fit`; passing only the token
# sequences fails.
y_hat = model.predict([test_word_sequences, drug_test])

y_hat_df = pd.DataFrame(y_hat)

# Predicted class = column label (0, 1 or 2) holding the largest probability in each row.
y_hat_df['y_hat'] = y_hat_df.idxmax(axis=1)

y_hat_final = np.array(y_hat_df['y_hat'])

# Attach the predictions to this section's own `test` frame (read from the same file
# as the baseline's `t`), so the cell does not depend on the baseline section having run.
submission = pd.concat( [test, pd.DataFrame(y_hat_final, columns=['y_hat'])] , axis=1)[['unique_hash', 'y_hat']]
submission.columns = ['unique_hash', 'sentiment']

submission.head(2)

In [None]:
submission['sentiment'].value_counts()

In [None]:
# Keep a handle on the neural-network predictions for the ensemble step.
s2 = submission

# --------------------------------------------------
# Ensemble
# --------------------------------------------------

In [None]:
# Join the two submissions on the row id only. Without `on=`, merge joins on every
# shared column - here both `unique_hash` and `sentiment` - which silently drops each
# row where the two models disagree and leaves a single `sentiment` column.
# With an explicit key, both predictions are kept side by side as
# `sentiment_baseline` and `sentiment_nn`.
# Assumes `unique_hash` is unique per test row - TODO confirm.
ens = pd.merge(s1, s2, on='unique_hash', suffixes=('_baseline', '_nn'))

In [None]:
# Ensemble rule: for each row, take the minimum over the numeric columns of `ens`.
ens['ens'] = ens.min(numeric_only=True, axis = 1)

In [None]:
# Keep only the row id and the ensembled label, renamed to `sentiment`.
ens2 = ens[['unique_hash', 'ens']].rename(columns={"ens": "sentiment"})

In [None]:
ens2['sentiment'].value_counts()

In [None]:
ens2['sentiment'].value_counts(normalize = True)

In [None]:
# Write the final predictions to disk without the DataFrame index column.
ens2.to_csv('submission.csv', index = False)

# --------------------------------------------------
# References
# --------------------------------------------------

1) https://www.kaggle.com/wowfattie/3rd-place/code

2) https://arxiv.org/pdf/1606.07792.pdf