In [4]:
import pandas as pd
import tensorflow as tf
import os
import tensorflow_text as tf_text
from nltk.corpus import stopwords
import gensim
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random


import tensorflow as tf

from tensorflow.keras.layers import Dense, InputLayer, GlobalMaxPool1D, Dropout, Conv1D, MaxPool1D, Flatten, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.optimizers import Adam, SGD

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.model_selection import train_test_split



In [5]:
# WORD2VEC 
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS 
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

In [None]:
def first_data_prep():
    data = pd.read_csv('Data/twitter_data.csv', 
                      encoding='latin-1', names=['sentiment', 'id', 'date', 'flag', 'user',
                                                'text'])
    data['sentiment'].replace(4, 1, inplace=True)
    data = data.sample(frac=1).reset_index(drop=True).copy()
    data['text'] = data['text'].str.lower()
    return data

In [None]:
def delete_nicknames(row):
    # Prepare list of words
    words = row.split()
    # Remove nicknames
    for word in words:
        if word[0] == '@':
            words.remove(word)
    # Return string 
    return ' '.join(word for word in words)

In [None]:
def stratified_split(df, size):
    # Split dataset into smaller one 
    col_list = list(df.columns)
    # Drop target column name
    col_list.pop(0)
    x_train, x_valid = train_test_split(
    df, random_state=1, stratify=df['sentiment'], test_size=size)
    # Prepare new indexes 
    x_valid.reset_index(drop=True, inplace=True)
    return x_valid 
    

In [None]:
def replace_with_space(text):
    # Replace new rows with space 
    text = text.replace('\n', " ").replace("\r", " ")
    # Create list of all not needed chars 
    punc_list = '!"@#$%^&*()+_-.<>?/:;[]{}|\~'
    # Make transformation with dict that contains punc_list chars
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    # Apply transformation
    text = text.translate(t)
    # Replace single quote with empty char
    t = text.maketrans(dict.fromkeys("'`"))
    text.translate(t)
    
    return text
    

In [7]:
def remove_stop_words(text):
    # Prepare set of stopwords
    stop_words = set(stopwords.words('english'))
    # Not not found on win10, returns Key Error
#     stop_words.remove('Not')

    # Remove stopwords from the text
    filtered_text = [word for word in text.split() if not word in stop_words]
    
    return filtered_text

In [8]:
def tokenize(data, num_words, num_words_pad): 
    data = data.copy()
    # Apply replace func that replace chars with spaces
    data['text'] = data['text'].apply(lambda x: replace_with_space(x)).copy()
    # Apply func that removes stop words
    data['text'] = data['text'].apply(lambda x: remove_stop_words(x))
    
    
    w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)
    
    documents = [_text.split() for _text in df_train.text] 
    w2v_model.build_vocab(documents)


    # Initialize tokenizer
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
    # Updates internal vocabulary based on a list of texts 
    tok.fit_on_texts(list(data['text']))
    # Transforms each text in texts to a sequence of integers.
    seq = tok.texts_to_sequences(list(data['text']))
    # Pad sequences to make them same lenght 
    tf_ready = tf.keras.preprocessing.sequence.pad_sequences(seq)
    
    return tf_ready, tok

In [9]:
%%time
data = first_data_prep()
# Apply replace func that replace chars with spaces
data['text'] = data['text'].apply(lambda x: replace_with_space(x)).copy()
# Apply func that removes stop words
data['text'] = data['text'].apply(lambda x: remove_stop_words(x))

Wall time: 6min 50s


In [10]:
%%time
tok = tf.keras.preprocessing.text.Tokenizer()
# Updates internal vocabulary based on a list of texts 
tok.fit_on_texts(list(data['text']))
# Transforms each text in texts to a sequence of integers.
seq = tok.texts_to_sequences(list(data['text']))
# Pad sequences to make them same lenght 
tf_ready = tf.keras.preprocessing.sequence.pad_sequences(seq)

Wall time: 27.1 s


In [11]:
%%time
w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)
    
documents = data.text
w2v_model.build_vocab(documents)

Wall time: 4.51 s


In [12]:
%%time
words = w2v_model.wv.index_to_key
vocab_size = len(tok.word_index) + 1
print("Vocab size", vocab_size)

Vocab size 739298
Wall time: 0 ns


In [13]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

Wall time: 5min 7s


(355796583, 415799456)

In [14]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tok.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
embedding_layer = Embedding(vocab_size, W2V_SIZE,
                            weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

In [15]:
tf_df = pd.DataFrame(tf_ready)
tf_df['sentiment'] = data['sentiment']
# tf_df.to_csv('tokenized_data.csv', index=False)

In [3]:
def configure_cnn(data,
                embedding,
                layers=None, 
                dropout_rate=0,
                optimizer='Adam',
                loss='binary_crossentropy',
                 kernel_initializer='lecun_normal',
                 kernel_regularizer=tf.keras.regularizers.L2(0.01)
                ):
    """ Layers argument shape:
    [[number of nodes, activate function], 
    [number of nodes, activate function],
    ...]
    
    """
    input_len = data.shape[1] - 1 
    model = Sequential()
    model.add(embedding)
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPool1D(3))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(GlobalMaxPool1D())
    model.add(Flatten())
    
    if dropout_rate > 0:
        model.add(Dropout(dropout_rate))
    
    if layers != None:
        for node in layers[1:]:
            model.add(Dense(node[0], activation=node[1], kernel_initializer=kernel_initializer, 
                            kernel_regularizer=kernel_regularizer))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer, 
                    kernel_regularizer=kernel_regularizer))
    
    model.compile(loss=loss, 
                 optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

NameError: name 'tf' is not defined

In [None]:
def configure_rnn(data,
                embedding,
                layers=None, 
                dropout_rate=0,
                optimizer='Adam',
                loss='binary_crossentropy',
                 kernel_initializer='lecun_normal',
                 kernel_regularizer=tf.keras.regularizers.L2(0.01)
                ):
    """ Layers argument shape:
    [[number of nodes, activate function], 
    [number of nodes, activate function],
    ...]
    
    """
    input_len = data.shape[1] - 1 
    model = Sequential()
    model.add(embedding)
    model.add(Dropout(0.5))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))    

    if layers != None:
        for node in layers[1:]:
            model.add(Dense(node[0], activation=node[1], kernel_initializer=kernel_initializer, 
                            kernel_regularizer=kernel_regularizer))
    model.add(Dense(1, activation='sigmoid', kernel_initializer=kernel_initializer, 
                    kernel_regularizer=kernel_regularizer))
    
    model.compile(loss=loss, 
                 optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model

In [2]:
model = configure_cnn(tf_df, embedding_layer)
# model_rnn = configure_rnn(tf_df, embedding_layer)

NameError: name 'configure_cnn' is not defined

In [59]:
features = tf_df.columns.tolist()
features.remove('sentiment')
X_train, X_test, y_train, y_test = train_test_split(tf_df[features], tf_df['sentiment'], test_size=0.33, random_state=42)

In [1]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8088951385268926044
]


In [75]:
model_rnn.summary

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          44304600  
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 300, 32)           28832     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 100, 32)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 98, 64)            6208      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 32, 64)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 30, 128)           24704     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)              

In [67]:
model.fit(X_train, y_train, epochs=EPOCHS, verbose=1,
         validation_data=(X_test, y_test), batch_size=BATCH_SIZE)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f808c10fc70>

In [52]:
# 0.77
ef strip(row):
    return row.split()

data['text'].apply(lambda x: strip(x))

0          [@elleasinswell, oh,, i'll, have, to, try, it!...
1          [@cmlundy, done!!!!!, i, really, need, one, to...
2          [lost, google, notebook, ie, add, on, with, th...
3          [@natalietran, at, least, they, have, a, moral...
4          [@kirstyhilton, ive, been, trying, to, get, mi...
                                 ...                        
1599995    [ain't, watching, the, laker, game,, i, can't,...
1599996    [bummed, about, the, softball, loss, 0-1, thes...
1599997    [back, in, god's, hands,, back, in, god's, han...
1599998                                         [bbq, party]
1599999    [@iamjemzie, what, time, is, this, and, where,...
Name: text, Length: 1600000, dtype: object

In [46]:
data['text'].str.len().max()

374

In [51]:
test.strip()

'@cmlundy done!!!!! i really need one too... aritzia hasnt gotten back yet  dammit!!'