import pandas as pd
import tensorflow as tf
import keras as keras

import re
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
import os

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

%matplotlib inline

In [2]:
train_df = pd.read_csv("../../data/tweet/train.csv")
test_df = pd.read_csv("../../data/tweet/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
combi_df = train_df.append(test_df, ignore_index=True, sort=False)

In [5]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt

# remove twitter handles (@user)
combi_df['tidy_tweet'] = np.vectorize(remove_pattern)(combi_df['tweet'], "@[\w]*")

combi_df['tidy_tweet'] = combi_df['tidy_tweet'].str.replace("[^a-zA-Z]", " ")

combi_df['tidy_tweet'] = combi_df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))

In [6]:
combi_df.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit can use cause they don ...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,model love take with all the time in ur
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation


In [7]:
tokenized_tweet = combi_df['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [when, father, is, dysfunctional, and, is, so,...
1    [thanks, for, lyft, credit, can, use, cause, t...
2                              [bihday, your, majesty]
3    [model, love, take, with, all, the, time, in, ur]
4               [factsguide, society, now, motivation]
Name: tidy_tweet, dtype: object

In [8]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

# tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

0    [when, father, is, dysfunctional, and, is, so,...
1    [thanks, for, lyft, credit, can, use, cause, t...
2                              [bihday, your, majesty]
3    [model, love, take, with, all, the, time, in, ur]
4               [factsguide, society, now, motivation]
Name: tidy_tweet, dtype: object

In [9]:
for i in range(len(tokenized_tweet)):
  tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

combi_df['tidy_tweet'] = tokenized_tweet

In [10]:
combi_df.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit can use cause they don ...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,model love take with all the time in ur
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation


In [11]:
MAX_SEQUENCE_LENGTH = 60

MAX_NUM_WORDS = 20000

# texts_to_sequences
from keras.preprocessing.text import *
from keras.preprocessing.sequence import *
t = Tokenizer(num_words=MAX_NUM_WORDS)
t.fit_on_texts(combi_df['tidy_tweet'])
print(t.word_counts)
combi_X = t.texts_to_sequences(combi_df['tidy_tweet'])
combi_X = pad_sequences(combi_X, maxlen=MAX_SEQUENCE_LENGTH, padding="pre")



In [12]:
len(t.word_counts)
# t.word_index

48861

In [13]:
train_df = combi_df[:31962]
test_df = combi_df[31962:]

In [14]:
train_X = combi_X[:31962]
test_X = combi_X[31962:]

In [15]:
GLOVE_DIR = os.path.join('../../data', 'glove.twitter.27B')

In [16]:
print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.100d.txt')) as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs


print('Found %s word vectors.' % len(embeddings_index))



Indexing word vectors.
Found 1193514 word vectors.


In [17]:
print('Preparing embedding matrix.')

EMBEDDING_DIM = 100
word_index = t.word_index

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Preparing embedding matrix.


In [18]:
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.initializers import Constant

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [19]:
train_df.shape[0]

31962

In [20]:
train_Y = train_df["label"]
train_Y.size

31962

In [61]:
vocab_size = len(t.word_counts) + 1
# embed_size = 164

hidden_size = 128

model = Sequential()
# model.add(Embedding(vocab_size, embed_size, input_length=max_in_len))
model.add(embedding_layer)
# model.add(Dense(64, activation="relu"))
# model.add(Dense(1, activation="relu"))
# model.add(LSTM(hidden_size, return_sequences=True))
# model.add(Dropout(0.3))
model.add(LSTM(hidden_size, dropout=0))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(32, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 100)           2000000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               117248    
_________________________________________________________________
batch_normalization_5 (Batch (None, 128)               512       
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 32)                4128      
_________________________________________________________________
batch_normalization_6 (Batch (None, 32)                128       
_________________________________________________________________
dropout_11 (Dropout)         (None, 32)                0         
__________

In [62]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [63]:
# opt = SGD(lr=0.0001)
model.compile(optimizer="nadam",
              loss='binary_crossentropy', metrics=[f1])

In [64]:
model.fit(train_X, train_Y, epochs=10, validation_split=0.12)

Train on 28126 samples, validate on 3836 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff70910c278>

In [65]:
model.fit(train_X, train_Y, epochs=3, validation_split=0.12)

Train on 28126 samples, validate on 3836 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff708bd4b00>

In [66]:
model.fit(train_X, train_Y, epochs=2, validation_split=0.12)

Train on 28126 samples, validate on 3836 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff708bd4d68>

In [67]:
y_pred = model.predict(train_X)

In [68]:
y_pred.size

31962

In [69]:
y_pred

array([[4.8010486e-05],
       [1.5018397e-06],
       [3.4963417e-05],
       ...,
       [6.9462994e-06],
       [9.9700218e-01],
       [1.2906719e-05]], dtype=float32)

In [70]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

print('Confusion Matrix')
print(confusion_matrix(train_df["label"], (y_pred > 0.5).astype(np.int)))

Confusion Matrix
[[29644    76]
 [  241  2001]]


In [71]:
test_df = combi_df.loc[31962:]

# t = Tokenizer()
# t.fit_on_texts(test_df['tidy_tweet'])
# print(t.word_counts)
# test_X = t.texts_to_sequences(test_df['tidy_tweet'])
# test_X = pad_sequences(test_X, maxlen=max_in_len, padding="pre")

test_y = model.predict(test_X)

In [72]:
final_y = (test_y > 0.5).astype(np.int)

In [73]:
# final_y[100:400]

In [74]:
# test_df['label'] = test_y.round()
test_df['label'] = final_y
# test_df["label"] = test_df["label"].apply(lambda x: '1' if x > 0.3 else '0')
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,label,tweet,tidy_tweet
31962,31963,0,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
31963,31964,1,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
31964,31965,0,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
31965,31966,0,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
31966,31967,0,"3rd #bihday to my amazing, hilarious #nephew...",rd bihday to my amazing hilarious nephew eli a...


In [75]:
test_df.head()

Unnamed: 0,id,label,tweet,tidy_tweet
31962,31963,0,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
31963,31964,1,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
31964,31965,0,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
31965,31966,0,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
31966,31967,0,"3rd #bihday to my amazing, hilarious #nephew...",rd bihday to my amazing hilarious nephew eli a...


In [76]:
submission = test_df[['id','label']]
submission.to_csv('rnn2.csv', index=False) # writing data to a CSV file