In [1]:
from src.helpers import *
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras import optimizers
from keras.layers import Dense, concatenate, Activation, Dropout
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D
from keras.utils import to_categorical

from sklearn.metrics import f1_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## read initial data via parser

In [2]:
data_train, data_test = get_parsed_data()
#dropout useless data and fill NaN with zeros
data_train.drop(['date', 'id', 'twitid'], axis=1, inplace = True, errors='ignore')
data_train.fillna(0)

data_train['label'] = data_train.apply(define_label, axis=1)
data_test['label'] = data_test.apply(define_label, axis=1)
data_train['clear_text'] = data_train.text.apply(clear_text)
data_test['clear_text'] = data_test.text.apply(clear_text)

## feature extraction

In [3]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(data_train.text)
X_test = vectorizer.transform(data_test.text)

## sklearn log regression model training and predicting

In [4]:
model = LogisticRegression()
model.fit(X_train, data_train.label)
model.score(X_test, data_test.label)

0.7070143884892086

## vector model training

In [5]:
data = pd.concat([data_train, data_test], ignore_index=True)
split_text = data.clear_text.apply(lambda sent: sent.lower().split())
w2v_model = Word2Vec(sentences=split_text, sg=1, min_count=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


## transform data to index sequence

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.clear_text)
X_train = tokenizer.texts_to_sequences(data_train.clear_text)
X_test = tokenizer.texts_to_sequences(data_test.clear_text)

In [7]:
X_train = pad_sequences(X_train, maxlen=30)
X_test = pad_sequences(X_test, maxlen=30)

## embeding matrix creation

In [8]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
oov = []
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv.vocab:
        embedding_vector = w2v_model.wv.get_vector(word)
        embedding_matrix[i] = embedding_vector
    else:
        oov.append(word)

## CNN initialization

In [9]:
tweet_input = Input(shape=(30,), dtype='int32')
keys_num = len(list(w2v_model.wv.vocab.keys()))
inp = Embedding(keys_num+1, 100, input_length=30,
                weights=[embedding_matrix], trainable=False)(tweet_input)

In [10]:
branches = []

for size, filters_count in [(2, 10), (3, 10), (4, 10), (5, 10)]:
    for i in range(filters_count):
        branch = Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(inp)
        branch = GlobalMaxPooling1D()(branch)
        branches.append(branch)
x = concatenate(branches, axis=1) 
drop1 = Dropout(0.2)(x)
hidden = Dense(30, activation='relu')(drop1)
drop2 = Dropout(0.2)(hidden)
out = Dense(3, activation='relu')(drop2)

In [11]:
model = Model(input=tweet_input, output=out) 

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']) 

  """Entry point for launching an IPython kernel.


In [12]:
model.fit([X_train], y=to_categorical(data_train.label.values+1), verbose=1, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1db44ba04e0>

# Results

In [13]:
model.evaluate(X_test, to_categorical(data_test.label.values+1))



[0.8027225728944051, 0.6577338130354023]

In [15]:
predicted = model.predict(X_test)
predicted[len(predicted)-1] = [0,0,1]
to_categorical([np.argmax(x) for x in predicted])

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [16]:
f1_score(to_categorical([np.argmax(x) for x in predicted]), to_categorical(data_test.label.values+1), average='macro')

0.44240855705378274

In [17]:
f1_score(to_categorical([np.argmax(x) for x in predicted]), to_categorical(data_test.label.values+1), average='micro')

0.6579136690647482