# Main Code


In [117]:
import os
import re

import numpy as np
import pandas as pd
import gensim
import keras_metrics as km
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors,Word2Vec
from nltk.stem.snowball import SnowballStemmer
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


In [70]:
# Stop words are stored in a frozenset: removing_stop_words() does one
# membership test per token, and a set lookup is constant-time whereas
# scanning the list returned by stopwords.words() is linear per token.
STOPWORDS = frozenset(stopwords.words("english"))
STEMMER = SnowballStemmer("english")

# All input artefacts are expected under a relative `data` directory.
DATA_DIR_PATH = 'data'
DATA_FILE_PATH = os.path.join(DATA_DIR_PATH, 'training.1600000.processed.noemoticon.csv')
# Column names are supplied explicitly; only `target` and `text` are read
# by the cells below.
DATA = pd.read_csv(
    DATA_FILE_PATH,
    encoding="ISO-8859-1",
    names=["target", "ids", "data", 'flag', "user", "text"],
)

# Pretrained word vectors, loaded from the binary word2vec format.
WORD2VEC_PATH = os.path.join(DATA_DIR_PATH, 'GoogleNews-vectors-negative300.bin.gz')
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [75]:
# Human-readable names for the raw label codes (kept for reference).
target_encoding = {0: "neg", 2: 'neu', 4: 'pos'}
# Binary remapping applied to the raw labels: 0 stays 0, 4 becomes 1.
new_target_encoding = {0: 0, 4: 1}

x_raw = DATA.text
# Plain dict indexing is used, so a label without an entry in
# new_target_encoding raises KeyError instead of passing through silently.
y_raw = DATA.target.apply(lambda label: new_target_encoding[label])

In [76]:
# Preview the first remapped labels.
y_raw.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [77]:
# Count how many samples carry each remapped label.
y_raw.value_counts()

1    800000
0    800000
Name: target, dtype: int64

In [78]:
# Preview the first raw tweet texts.
x_raw.head()

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object

In [79]:
# Hold out 20% of the raw texts/labels; random_state pins the split.
x_original_train, x_original_test, y_original_train, y_original_test = train_test_split(x_raw, y_raw, test_size = 0.2, random_state = 42)

In [96]:
# Vocabulary cap passed to the tokenizer; it is reused later when the
# embedding matrix is sized.
NUM_WORDS=100000
# The filter set covers punctuation, tab, newline and the single quote;
# lower=True requests lower-casing of the text.
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                      lower=True)
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(x_original_train)

In [97]:
# Convert both splits to sequences with the tokenizer fitted on the training split.
sequences_train = tokenizer.texts_to_sequences(x_original_train)
sequences_valid=tokenizer.texts_to_sequences(x_original_test)

In [98]:
# word_index is the tokenizer's word -> index mapping; its size is reported
# below and is later compared against NUM_WORDS when sizing the embedding matrix.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 575276 unique tokens.


In [99]:
# No maxlen is given for the training sequences; the validation sequences are
# then padded to the resulting training width (X_train.shape[1]).
X_train = pad_sequences(sequences_train)
X_val = pad_sequences(sequences_valid,maxlen=X_train.shape[1])
# Labels are expanded with to_categorical (two columns in the recorded output).
y_train = to_categorical(y_original_train)
y_val = to_categorical(y_original_test)
print('Shape of X train and X validation tensor:', X_train.shape,X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape,y_val.shape)

Shape of X train and X validation tensor: (1280000, 118) (320000, 118)
Shape of label train and validation tensor: (1280000, 2) (320000, 2)


In [100]:
# Width of each embedding vector.
EMBEDDING_DIM=300
# Number of embedding rows: one more than the vocabulary size, capped at NUM_WORDS.
vocabulary_size=min(len(word_index)+1,NUM_WORDS)
# Start from all zeros; rows are filled in by the loop in the next cell.
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

In [104]:
# Fill the embedding matrix from the pretrained word2vec vectors.
# Words missing from word2vec get a random vector drawn from a normal
# distribution with mean 0 and variance 0.25. The draws come from a dedicated,
# seeded generator so the matrix is identical on every Restart & Run All.
embedding_rng = np.random.RandomState(42)
for word, i in word_index.items():
    # Skip indices that have no row in the matrix (the matrix is capped at
    # NUM_WORDS rows when it is allocated).
    if i >= embedding_matrix.shape[0]:
        continue
    try:
        embedding_matrix[i] = word2vec[word]
    except KeyError:
        # Word has no pretrained vector.
        embedding_matrix[i] = embedding_rng.normal(0, np.sqrt(0.25), EMBEDDING_DIM)


In [108]:
from keras.layers import Embedding
# Embedding layer whose initial weights are the matrix assembled from word2vec;
# trainable=True leaves those weights open to updates during training.
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [111]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
# Model hyperparameters.
# Input length equals the padded width of the training matrix.
sequence_length = X_train.shape[1]
# Heights (in tokens) of the convolution kernels, one branch per entry.
filter_sizes = [3,4,5]
# Number of filters per convolution branch.
num_filters = 100
# Dropout rate applied before the output layer.
drop = 0.5

In [114]:
# Multi-branch convolutional text classifier: one Conv2D + max-pool branch per
# entry in filter_sizes, concatenated, flattened, regularised with dropout and
# fed to a 2-unit softmax layer.
inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# Add a trailing axis of size 1 so the embedded sequence can be fed to Conv2D.
reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding)

# Each kernel spans the full embedding width, so every branch slides only
# along the token axis. Branches are created in filter_sizes order.
conv_outputs = [
    Conv2D(num_filters, (filter_size, EMBEDDING_DIM), activation='relu',
           kernel_regularizer=regularizers.l2(0.01))(reshape)
    for filter_size in filter_sizes
]

# The pooling window covers the whole remaining token axis of each branch
# (sequence_length - filter_size + 1), i.e. one maximum per filter.
pooled_outputs = [
    MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1, 1))(conv_output)
    for filter_size, conv_output in zip(filter_sizes, conv_outputs)
]

merged_tensor = concatenate(pooled_outputs, axis=1)
# Flatten feeds dropout directly; the former extra Reshape of the flattened
# tensor was never connected to the output and has been removed.
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

# Tie the input tensor to the softmax output to obtain the trainable model.
model = Model(inputs, output)

In [115]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 118)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 118, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 118, 300, 1)  0           embedding_1[1][0]                
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 116, 1, 100)  90100       reshape_3[0][0]                  
__________________________________________________________________________________________________
conv2d_5 (

In [119]:
adam = Adam(lr=1e-3)

# Labels are one-hot encoded, hence categorical cross-entropy; the precision,
# F1 and recall metrics come from keras_metrics.
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc', 'mae', km.categorical_precision(), km.categorical_f1_score(), km.categorical_recall()])
# Only `monitor` is set, so the callback's default patience applies.
# NOTE(review): the recorded run logs just two of the 100 epochs - confirm
# that stopping this early is intended.
callbacks = [EarlyStopping(monitor='val_loss')]
# Keep the returned History object so per-epoch metrics can be inspected or
# plotted later instead of being discarded.
history = model.fit(X_train, y_train, batch_size=1000, epochs=100, verbose=1, validation_data=(X_val, y_val),
                    callbacks=callbacks)  # starts training

Train on 1280000 samples, validate on 320000 samples
Epoch 1/100
Epoch 2/100


<keras.callbacks.History at 0x7fb7ccd8e128>

In [None]:
# Alternatives, tried in this order at each position: @-mentions, http/https
# URLs, and any run of characters that are not ASCII letters or digits.
# Written as a raw string so `\S` is a regex escape rather than an invalid
# Python string escape. The former `http?:\S` alternative was dropped: it only
# ever fired on a literal "htt:" prefix and then swallowed exactly one extra
# character.
_NOISE_PATTERN = re.compile(r'@\S+|https?:\S+|[^A-Za-z0-9]+')


def cleaning_sentence(text):
    """Lower-case `text` and blank out mentions, URLs and non-alphanumerics.

    Args:
        text: Any object; it is converted with ``str()`` before processing.

    Returns:
        The lower-cased string with every match of the noise pattern replaced
        by a single space and leading/trailing whitespace stripped. Adjacent
        matches each contribute their own space, so interior runs of spaces
        can remain.
    """
    return _NOISE_PATTERN.sub(' ', str(text).lower()).strip()


def removing_stop_words(text):
    """Drop every whitespace-separated token of `text` that is in STOPWORDS.

    The surviving tokens keep their order and are re-joined with single spaces.
    """
    kept_tokens = [token for token in text.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)


def stemming_words(text):
    """Stem each whitespace-separated token of `text` with STEMMER.

    The stemmed tokens are returned joined by single spaces, in input order.
    """
    return ' '.join(STEMMER.stem(token) for token in text.split())


def text_pre_process(text):
    """Run the full text pipeline on one item.

    Steps, in order: cleaning_sentence, removing_stop_words, stemming_words.
    Returns the stemmed string.
    """
    return stemming_words(removing_stop_words(cleaning_sentence(text)))


def clean(x):
    """Apply text_pre_process to every element of `x` through ``x.apply``.

    Returns whatever ``x.apply`` produces; `x` itself is not reassigned here.
    """
    return x.apply(text_pre_process)

In [None]:
# Run the cleaning / stop-word / stemming pipeline over every raw tweet.
X_MAIN = clean(x_raw)

In [None]:
# Labels for the cleaned texts: the remapped targets, bound to a second name (no copy is made).
Y_MAIN = y_raw

In [None]:
# Preview the first cleaned, stemmed texts.
X_MAIN.head()

0         awww bummer shoulda got david carr third day
1    upset updat facebook text might cri result sch...
2      dive mani time ball manag save 50 rest go bound
3                      whole bodi feel itchi like fire
4                                        behav mad see
Name: text, dtype: object

In [None]:
# Preview the first labels paired with the cleaned texts.
Y_MAIN.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [None]:
%%time
# Token lists for every cleaned text.
# NOTE(review): `documents` is not referenced again in the visible cells - confirm it is still needed.
documents = [_text.split() for _text in X_MAIN]
# Second split, this time on the cleaned texts, holding out 30%.
# NOTE(review): y_train is rebound here, replacing the one-hot y_train created for the model above.
x_train, x_test, y_train, y_test = train_test_split(X_MAIN, Y_MAIN, test_size = 0.3, random_state = 42)

CPU times: user 1.17 s, sys: 164 ms, total: 1.33 s
Wall time: 1.33 s


In [None]:
# Fixed padded length for the cleaned-text sequences.
len_of_seq = 300

# NOTE(review): this rebinds `tokenizer`, replacing the tokenizer fitted on the
# raw training texts earlier in the notebook.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

# NOTE(review): x_train / x_test are overwritten with their padded sequences,
# so this cell is not idempotent - re-running it would feed the padded arrays
# back into fit_on_texts / texts_to_sequences.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=len_of_seq)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=len_of_seq)

In [None]:
# Turn the training labels into a single-column 2-D array.
# np.asarray accepts both the pandas Series produced by the split and an
# ndarray from a previous run of this cell, so re-running is safe (the former
# `.values` access fails on an ndarray).
y_train = np.asarray(y_train).reshape(-1, 1)
y_train.shape

In [None]:
# Turn the test labels into a single-column 2-D array.
# np.asarray accepts both the pandas Series produced by the split and an
# ndarray from a previous run of this cell, so re-running is safe (the former
# `.values` access fails on an ndarray).
y_test = np.asarray(y_test).reshape(-1, 1)
y_test.shape

In [None]:
# Show the first ten reshaped training labels.
y_train[:10]

In [None]:
# Show the first ten reshaped test labels.
y_test[:10]