In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math, codecs

stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

Using TensorFlow backend.


In [2]:
# load embedding
# takes about 2 minutes to load
embeddings_index = {}
f = codecs.open('wiki-news-300d-1M-subword.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

999995it [01:37, 10300.78it/s]

found 999995 word vectors





In [3]:
#load data
#df = pd.read_pickle('tweets_sentiment_cleaned.pkl')
df = pd.read_csv('../5153 Applied ML Project - General/Code/kaggleTweets_cleaned_undersamplig_dupl_Stopwords.csv')
#df = df.drop_duplicates('message',keep='first')
df.shape

(15036, 15)

In [4]:
X = df.message
y = df.sentiment.values
num_classes = len(np.unique(y))

In [5]:
max_num_words = 100000
max_seq_len = 40
embed_dim=300
y_oh = keras.utils.to_categorical(y,num_classes)
def process_tweet_input_into_CNN_inputs(X):
    #raw_docs = X.tolist()
    raw_docs = X

    processed_docs = []
    for doc in tqdm(raw_docs):
        tokens = RegexpTokenizer(r'\w+').tokenize(doc)
        #filtered = [word for word in tokens if word not in stop_words]
        processed_docs.append(" ".join(tokens))

    tokenizer = Tokenizer(num_words=max_num_words, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs)

    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    #pad sequences
    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)
    return word_seq,word_index

word_seq,word_index = process_tweet_input_into_CNN_inputs(X)

100%|████████████████████████████████████████████████████████████████████████| 15036/15036 [00:00<00:00, 132236.74it/s]


dictionary size:  23967


In [6]:
#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(max_num_words, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 9457


In [7]:
print("sample words not found: ", np.random.choice(words_not_found, 10))

sample words not found:  ['scotclimate' 'kaatherinecx' 'winenews' 'politolizer' 'nytscience' 'wedn'
 'xanria' 'alecexposed' 'mloparis' 'breadconqueror']


In [8]:
#define custom f1-score metric
#because keras doesn't have F1-score built in as a metric
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# model tuning

In [9]:
num_filters=100
kernel1=7
kernel2=7
weight_decay = 0.01
def get_model():#kernel1,kernel2,num_filters):
    model = Sequential()
    model.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
    model.add(Conv1D(num_filters, kernel_size=kernel1, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, kernel_size=kernel2, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(4, activation='sigmoid'))  #multi-label (k-hot encoding)

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy',f1_m])
    #model.summary()
    
    return model

In [10]:
from sklearn.model_selection import *
from keras.wrappers.scikit_learn import *

word_seq_train, word_seq_test, y_train_oh, y_test_oh = train_test_split(word_seq,y_oh,test_size=0.2,random_state=10,stratify=y)
length = word_seq_train.shape[0]
model = KerasClassifier(build_fn=get_model)

#param_grid = dict(kernel1=[2,3,5],kernel2=[5],num_filters=[100])
param_grid = dict(batch_size=[2])
grid = GridSearchCV(estimator=model,param_grid=param_grid,n_jobs=1,cv=3)

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=1)
callbacks_list = [early_stopping]
grid_result = grid.fit(word_seq_train, y_train_oh, #batch_size=batch_size, epochs=num_epochs, 
                     callbacks=callbacks_list, 
                     validation_data=(word_seq_test, y_test_oh),
                     verbose=0)
    

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
	 [[{{node conv1d_1/convolution}}]]
	 [[metrics/accuracy/Identity/_137]]
	 [[{{node conv1d_1/convolution}}]]
0 successful operations.
0 derived errors ignored.

tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
	 [[{{node conv1d_3/convolution}}]]
	 [[Mean_1/_275]]
	 [[{{node conv1d_3/convolution}}]]
0 successful operations.
0 derived errors ignored.

tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
	 [[{{node conv1d_5/convolution}}]]
	 [[metrics_2/f1_m/Identity/_419]]
	 [[{{node conv1d_5/convolution}}]]
0 successful operations.
0 derived errors ignored.



UnknownError: 2 root error(s) found.
  (0) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[{{node conv1d_7/convolution}}]]
	 [[metrics_3/f1_m/Identity/_559]]
  (1) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[{{node conv1d_7/convolution}}]]
0 successful operations.
0 derived errors ignored.

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
grid_result.best_params    print("%f (%f) with: %r" % (mean, stdev, param))

# rerunning with best params to get 5fold F1score

In [11]:
num_filters=200
kernel1=7
kernel2=7
weight_decay = 0.01
batch_size=256
def get_model():
    model = Sequential()
    model.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(4, activation='sigmoid'))  #multi-label (k-hot encoding)

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy',f1_m])
    #model.summary()
    
    return model

In [12]:
N_SPLITS = 5
KFOLD = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=123)

def get_train_test_idx(fold_no, y):
    '''
    Get train test indices based on fold_no.
    fold_no is 0-index (i.e. 0,1,2,3,4)
    '''
    assert fold_no < N_SPLITS and fold_no >= 0, 'invalid fold number'
    
    for fold, (train_id, val_id) in enumerate(KFOLD.split(y, y)):
        if fold == fold_no:
            break
            
    return train_id, val_id

In [13]:
# 5fold model training
def run_fold(fold_no):
    
    train_idx, test_idx = get_train_test_idx(fold_no,y)
    word_seq_train = word_seq[train_idx]
    word_seq_test = word_seq[test_idx]
    y_train_oh = y_oh[train_idx]
    y_test_oh = y_oh[test_idx] 
    
    model = get_model()
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=1)
    callbacks_list = [early_stopping]
    history = model.fit(word_seq_train, y_train_oh, 
                     batch_size=batch_size, epochs=15, 
                     callbacks=callbacks_list, 
                     validation_data=(word_seq_test, y_test_oh),
                     verbose=0)

    fig, ax = plt.subplots(1,2, figsize=(14,4))
    ax[0].plot(history.history['accuracy'], label='Train Acc')
    ax[0].plot(history.history['val_accuracy'], label='Val Acc')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('Accuracy')
    ax[0].legend()

    ax[1].plot(history.history['f1_m'], label='Train F1')
    ax[1].plot(history.history['val_f1_m'], label='Val F1')
    ax[1].set_xlabel('Epoch')
    ax[1].set_ylabel('F1 Score')
    ax[1].legend()

    fig.suptitle(f'Fold {fold_no+1} Training History')
    plt.show()

    del(model)
    
    return history.history['val_f1_m'][-1]

In [14]:
f1 = np.zeros((5,1))

for i in range(5):
    print(f'Running Fold {i + 1}')
    f1[i] = run_fold(i)
    
print(f1.mean())
print(f1.std())

Running Fold 1


UnknownError: 2 root error(s) found.
  (0) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[{{node conv1d_9/convolution}}]]
	 [[metrics_4/accuracy/Identity/_697]]
  (1) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[{{node conv1d_9/convolution}}]]
0 successful operations.
0 derived errors ignored.

In [None]:
print(f1)

# below code is unused