In [1]:
import re
import zipfile
from multiprocessing import cpu_count

import dask.dataframe as dd
import nltk
import numba
import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm
import tqdm.notebook
from bs4 import BeautifulSoup
from dask.multiprocessing import get
from gensim.models import Word2Vec, Phrases
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Attention, Embedding, Conv1D, MaxPool1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Configuration: build a TF1-compat session whose GPU options allow memory growth.
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
# allow_growth asks TensorFlow to grow GPU memory usage on demand instead of
# reserving the whole device when the session starts.
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [None]:
# Leftover inspection cell: only evaluates the callbacks module, nothing is assigned.
tf.keras.callbacks

# Load data

In [2]:
# Open the zipped IMDB dataset and get a file handle to the CSV stored inside it.
# NOTE(review): absolute, machine-specific path; the archive is never closed explicitly.
dataset_1_zip = zipfile.ZipFile('/home/rubiales/PycharmProjects/pycharm/NLP/ignore_files/IMDB Dataset.csv.zip', 'r')
dataset_1_csv = dataset_1_zip.open('IMDB Dataset.csv')

In [3]:
# Read the first labelled dataset straight from the open zip member.
df_1_train = pd.read_csv(dataset_1_csv)
# Replace the textual sentiment labels with their integer category codes.
df_1_train['sentiment'] = pd.Categorical(df_1_train.sentiment).codes
print('Tamaño del dataframe: ', df_1_train.shape)
# Show the first three rows as the cell output.
df_1_train[:3]

Tamaño del dataframe:  (50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1


In [4]:
# Open the second archive and get handles to its labelled-train and test TSV members.
# NOTE(review): absolute, machine-specific path; the archive is never closed explicitly.
dataset_2_zip = zipfile.ZipFile('/home/rubiales/PycharmProjects/pycharm/NLP/ignore_files/word2vec-nlp-tutorial.zip')
dataset_2_csv_train = dataset_2_zip.open('labeledTrainData.tsv')
dataset_2_csv_test = dataset_2_zip.open('testData.tsv')

In [5]:
# Both members are tab-separated, hence sep='\t'.
df_2_train = pd.read_csv(dataset_2_csv_train, sep='\t')
df_test = pd.read_csv(dataset_2_csv_test, sep='\t')

In [6]:
# Print the first rows of each frame to compare their column layouts.
print(df_1_train.head())
print(df_2_train.head())
print(df_test.head())

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...
         id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fi

We can see that the first dataset does not have the "id" column, so we keep only the "sentiment" and "review" columns of the second dataset and concatenate both into a single training dataframe.

In [7]:
# df_test.drop(columns='id', inplace=True)
# Stack the two labelled sets row-wise, taking only 'sentiment' and 'review' from the second one.
df_train = pd.concat([df_1_train, df_2_train[['sentiment', 'review']]], axis=0, sort=False)
# Discard the per-frame indices and renumber the rows of the combined frame.
df_train.reset_index(drop=True, inplace=True)
print('columnas train:', df_train.columns)
print('columnas test:', df_test.columns)
print('Shape train:', df_train.shape)
print('Shape test:', df_test.shape)
# Cast every review to str (presumably so the string-cleaning functions can be mapped over it).
df_train.review = df_train.review.astype(str)
df_train[:3]

columnas train: Index(['review', 'sentiment'], dtype='object')
columnas test: Index(['id', 'review'], dtype='object')
Shape train: (75000, 2)
Shape test: (25000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1


## Pass everything to Dask

In [8]:
# CPU count reported by multiprocessing; reused as the Dask partition count and the Word2Vec worker count.
cores = cpu_count()

In [9]:
# Create an empty 'procesed' column in both frames; df_clean fills it in for each partition.
df_train['procesed'] = ''
df_test['procesed'] = ''
# Wrap both pandas frames as Dask dataframes with one partition per counted CPU.
dask_train = dd.from_pandas(df_train, npartitions=cores)
dask_test = dd.from_pandas(df_test, npartitions=cores)

# Text Pre - Processing

In [19]:
#Remove html
def parse_html(nlp):
    """Strip HTML markup from a review and return only its text content.

    Args:
        nlp: Raw review text, possibly containing tags such as ``<br />``.

    Returns:
        The text of the document with all markup removed.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and emits a warning), so results could
    # differ between machines and between Dask worker processes.
    return BeautifulSoup(nlp, "html.parser").get_text()

#Remove non words
def remove_nonwords(nlp):
    """Replace every character that is not an ASCII letter with one space.

    Digits, punctuation, whitespace and accented letters are all replaced,
    so the length of the string is preserved.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", nlp)

#Lower all text
def lower(nlp):
    """Return the given text converted to lowercase."""
    lowered = nlp.lower()
    return lowered

#remove stopwords
def remove_stopwords(nlp):
    """Drop English stopwords from a whitespace-separated text.

    Args:
        nlp: Text whose words are separated by whitespace.

    Returns:
        The remaining words joined by single spaces.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # was previously scanned in full for every single word of every review.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return ' '.join(word for word in nlp.split() if word not in stopwords)

#tokenize text
def own_tokenizer(nlp):
    """Split the text into tokens with ``nltk.word_tokenize`` and return its result."""
    return nltk.word_tokenize(nlp)

#lemmatize words
def own_lemmatizer(nlp):
    """Lemmatize each token of ``nlp`` with NLTK's WordNetLemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in nlp]

#a function that groups all the preprocessing functions
def df_clean(df_train):
    """Run the full text-cleaning pipeline over the ``review`` column.

    The steps are applied in order: HTML removal, non-letter removal,
    lowercasing, stopword removal, tokenisation and lemmatisation.

    Args:
        df_train: DataFrame (or Dask partition) with a ``review`` column.

    Returns:
        A new DataFrame with the same columns, where ``procesed`` holds the
        list of lemmas for each review. The input frame is left unmodified,
        because functions handed to ``map_partitions`` should not mutate the
        partitions they receive.
    """
    procesed = (
        df_train.review
        .map(parse_html)
        .map(remove_nonwords)
        .map(lower)
        .map(remove_stopwords)
        .map(own_tokenizer)
        .map(own_lemmatizer)
    )
    # assign() returns a copy; an existing 'procesed' column keeps its position.
    return df_train.assign(procesed=procesed)

In [20]:
# Schedule df_clean lazily on every partition; the pandas frames themselves are passed as `meta`
# to describe the columns of the result.
dask_train_processed = dask_train.map_partitions(df_clean, meta=df_train)
dask_test_processed = dask_test.map_partitions(df_clean, meta=df_test)
# Run the work with Dask's process-based scheduler and collect the results.
preprocessed_train = dask_train_processed.compute(scheduler='processes')
preprocessed_test = dask_test_processed.compute(scheduler='processes')
preprocessed_train

Unnamed: 0,review,sentiment,procesed
0,One of the other reviewers has mentioned that ...,1,"[one, reviewer, mentioned, watching, oz, episo..."
1,A wonderful little production. <br /><br />The...,1,"[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,1,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,0,"[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, love, time, money, visually, ..."
...,...,...,...
74995,It seems like more consideration has gone into...,0,"[seems, like, consideration, gone, imdb, revie..."
74996,I don't believe they made this film. Completel...,0,"[believe, made, film, completely, unnecessary,..."
74997,"Guy is a loser. Can't get girls, needs to buil...",0,"[guy, loser, get, girl, need, build, picked, s..."
74998,This 30 minute documentary Buñuel made in the ...,0,"[minute, documentary, bu, uel, made, early, on..."


# mini-EDA

In [None]:
# Mean of the integer sentiment labels; with 0/1 labels this is the share of rows labelled 1.
print('Balance of the classes in train:',  preprocessed_train.sentiment.mean())

In [None]:
# Number of tokens in each cleaned review, followed by its mean, median and most frequent value.
lenth = preprocessed_train.procesed.map(len)
print('Mean', lenth.mean())
print('Median', lenth.median())
print('Mode', lenth.mode()[0])

# Processing to model

## train word2vec

In [None]:
# Fit a gensim Phrases model on the token lists to detect bigrams (min_count=4).
bigrams = Phrases(sentences=preprocessed_train['procesed'], min_count=4)

In [None]:
# Fit the next Phrases level on the bigram-transformed stream (min_count=3).
trigrams = Phrases(sentences=bigrams[preprocessed_train['procesed']], min_count=3)

In [None]:
#Create the vocabulary in fourgrams
# `trigrams` was fitted on bigram-transformed sentences, so its input here must
# also go through `bigrams` first. This is the same chain that is later used to
# encode the reviews (fourgrams[trigrams[bigrams[...]]]).
fourgrams = Phrases(sentences=trigrams[bigrams[preprocessed_train['procesed']]], min_count=3)

In [None]:
# Bigram model: Word2Vec trained on the bigram-transformed reviews.
# window is the maximum distance between the current and predicted word within a sentence.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`).
embedding_size = 256
bigram_model = Word2Vec(sentences = bigrams[preprocessed_train['procesed']], size=embedding_size,
                        min_count=4, window=5, workers=cores)

In [None]:
# Trigram model: Word2Vec trained on reviews passed through the bigram and then the trigram phraser.
trigrams_model = Word2Vec(sentences = trigrams[bigrams[preprocessed_train['procesed']]], size=embedding_size,
                        min_count=3, window=5, workers=cores)

In [None]:
#Fourgram model
# Train on exactly the token stream that is later used to encode the reviews
# (fourgrams[trigrams[bigrams[...]]]). Skipping the bigram stage here would
# build a vocabulary from differently-merged tokens than the ones looked up
# when the reviews are vectorised.
fourgram_model = Word2Vec(sentences = fourgrams[trigrams[bigrams[preprocessed_train['procesed']]]], size=embedding_size,
                        min_count=3, window=5, workers=cores)

### Test the models

In [None]:
# Sanity check: nearest neighbours of 'america' in the bigram embedding space.
bigram_model.wv.most_similar('america')

In [None]:
# Sanity check: nearest neighbours of 'america' in the trigram embedding space.
trigrams_model.wv.most_similar('america')

In [49]:
# Sanity check: nearest neighbours of 'america' in the fourgram embedding space.
fourgram_model.wv.most_similar('america')

[('nation', 0.8577866554260254),
 ('india', 0.8333728313446045),
 ('japan', 0.8321987390518188),
 ('australia', 0.81800776720047),
 ('germany', 0.798524022102356),
 ('united_state', 0.7973009347915649),
 ('europe', 0.795366108417511),
 ('country', 0.7930383682250977),
 ('britain', 0.7832762002944946),
 ('russia', 0.7785017490386963)]

In [50]:
# Sanity check: nearest neighbours of 'husband' in the fourgram embedding space.
fourgram_model.wv.most_similar('husband')

[('wife', 0.9080113172531128),
 ('daughter', 0.8578042984008789),
 ('mother', 0.8107233047485352),
 ('father', 0.8020291924476624),
 ('son', 0.7928421497344971),
 ('marriage', 0.7738940715789795),
 ('sister', 0.7531670331954956),
 ('boyfriend', 0.7522717714309692),
 ('married', 0.7418345212936401),
 ('spouse', 0.741738498210907)]

In [None]:
# Apply the three phrasers in order (bigrams, then trigrams, then fourgrams) to the train reviews.
X_data = fourgrams[trigrams[bigrams[preprocessed_train['procesed']]]]

# Same transformation for the test reviews, using the phrasers fitted on train.
X_data_test = fourgrams[trigrams[bigrams[preprocessed_test['procesed']]]]

In [None]:
def word_2_vec(data, vocab):
    """Encode tokenised reviews as lists of integer vocabulary positions.

    Args:
        data: Iterable of reviews, each an iterable of string tokens.
        vocab: Mapping whose keys are the known words. A word is treated as
            unknown when ``vocab.get(word)`` is ``None``.

    Returns:
        A list with one list per review, holding for every known token its
        position in ``vocab.keys()``. Unknown tokens are dropped.
    """
    # Build the word -> position table once. The previous list.index() call
    # scanned the whole vocabulary for every token of every review.
    # NOTE(review): positions follow the iteration order of `vocab`; confirm
    # that this matches the row order of any embedding matrix indexed with them.
    position = {word: idx for idx, word in enumerate(vocab.keys())}
    return [
        [position[word] for word in review if vocab.get(word, None) is not None]
        for review in data
    ]

In [None]:
# Encode train and test reviews against the vocabulary of the fourgram Word2Vec model.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute.
word_vector = word_2_vec(X_data, fourgram_model.wv.vocab)
word_vector_test = word_2_vec(X_data_test, fourgram_model.wv.vocab)

In [None]:
# Bring every encoded review to exactly `input_length` entries, padding at the end (padding='post').
# NOTE(review): the pad value is 0, which word_2_vec also uses as the index of a real word — confirm this is intended.
input_length = 150
X_pad = pad_sequences(sequences=word_vector, maxlen=input_length, padding='post')
X_pad_test = pad_sequences(sequences=word_vector_test, maxlen=input_length, padding='post')

### Save the models and the required variables

In [None]:
#Save the models and text transformations
# fourgram_model.save('/home/alberto/Escritorio/pycharm/NLP/ignore_files/w2v_fourgram.model')
# pd.DataFrame(X_pad).to_csv('/home/alberto/Escritorio/pycharm/NLP/ignore_files/w2v_padaseq.csv')
# pd.DataFrame(X_pad_test).to_csv('/home/alberto/Escritorio/pycharm/NLP/ignore_files/w2v_padaseq_test.csv')

In [10]:
#Load model and vectors
# NOTE(review): absolute, machine-specific paths.
input_length = 150
fourgram_model = Word2Vec.load('/home/rubiales/PycharmProjects/pycharm/NLP/ignore_files/w2v_fourgram.model')
X_pad = pd.read_csv('/home/rubiales/PycharmProjects/pycharm/NLP/ignore_files/w2v_padaseq.csv')
# The CSV was written with its index, which comes back as an 'Unnamed: 0' column.
X_pad.drop(columns=['Unnamed: 0'], inplace=True)

# Load the TEST matrix from its own file. This previously re-read the train
# file (w2v_padaseq.csv), so the Kaggle predictions were computed on training
# reviews instead of the test reviews.
X_pad_test = pd.read_csv('/home/rubiales/PycharmProjects/pycharm/NLP/ignore_files/w2v_padaseq_test.csv')
X_pad_test.drop(columns=['Unnamed: 0'], inplace=True)

## Pre_trained glove2vec

In [None]:
# Read the raw GloVe text file into a list of lines.
# The encoding is given explicitly because the file contains non-ASCII tokens
# and open() would otherwise fall back to the platform's locale encoding.
# NOTE(review): absolute, machine-specific path; readlines() holds the whole file in memory.
with open('/home/alberto/Escritorio/pycharm/NLP/ignore_files/glove.840B.300d.txt', encoding='utf-8') as f:
    content = f.readlines()

In [None]:
# Map each GloVe token to its coefficient vector.
embeddings_index = {}
# Number of coefficients per line; must match the loaded file (glove.840B.300d.txt).
glove_dim = 300

for line in tqdm.tqdm_notebook(content):
    # Split from the right so that exactly `glove_dim` numeric fields are
    # peeled off. Some tokens in this GloVe release contain spaces themselves,
    # and a plain split(' ') would push part of the token into the
    # coefficients and make the float conversion fail.
    values = line.rstrip('\n').rsplit(' ', glove_dim)
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
# Sequence length and embedding width for the GloVe-based model.
# NOTE(review): MAX_SEQUENCE_LENGTH has the same value as `input_length`, which is what the visible model cells use.
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = 300

In [None]:
# Build the GloVe weight matrix: row i holds the GloVe vector of the i-th word
# in fourgram_model.wv.vocab (the same numbering word_2_vec produces).
vocab_words = list(fourgram_model.wv.vocab.keys())
embedding_matrix = np.zeros((len(vocab_words) + 1, EMBEDDING_DIM))
# Counts the vocabulary words that have no GloVe vector.
suma = 0
for i, word in tqdm.notebook.tqdm(enumerate(vocab_words), total=len(vocab_words)):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Write to row i, not i - suma: shifting rows by the number of misses
        # would hand every later word the vector of a different word.
        embedding_matrix[i] = embedding_vector
    else:
        # words not found in embedding index will be all-zeros.
        suma += 1

# Neural Network

In [28]:
# TensorBoard logging (graph and images enabled) under a relative log directory.
path_tensorboard = 'ignore_files/logs'
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=path_tensorboard, write_graph=True, write_images=True)

# Keep only the weights of the epoch with the highest monitored 'accuracy'.
# NOTE(review): the fit call in this notebook passes no validation data, so 'accuracy' is the training accuracy.
path_checkpoint = 'ignore_files/model_checkpoint/'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(path_checkpoint, monitor='accuracy', save_best_only=True,
                                                         save_weights_only=True, mode='max')

# Stop training once 'accuracy' has not improved for 50 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=50, mode='max')

In [56]:
# Build and compile the Word2Vec-based classifier inside a MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = Sequential()
    # Frozen embedding layer initialised with the fourgram Word2Vec vectors.
    # NOTE(review): the inputs are indices produced by word_2_vec (position in wv.vocab.keys());
    # verify that this numbering matches the row order of wv.vectors.
    model.add(Embedding(input_dim = fourgram_model.wv.vectors.shape[0],
                       output_dim = fourgram_model.wv.vectors.shape[1],
                       input_length = input_length,
                       weights = [fourgram_model.wv.vectors],
                       trainable = False))

    # Two Conv1D + MaxPool1D stages, then a bidirectional LSTM.
    model.add(Conv1D(128, 3,
                     activation='relu', padding='same'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 3,
                     activation='relu', padding='same'))
    model.add(MaxPool1D(pool_size=3))
    model.add(Bidirectional(LSTM(128, recurrent_dropout=0.1)))
    model.add(Dropout(0.20))
    # No activation argument is passed to this Dense layer.
    model.add(Dense(120))
#     model.add(Dropout(0.25))
    # Single sigmoid unit for the binary sentiment label.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 150, 256)          32472064  
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 150, 128)          98432     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 50, 128)           0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 50, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 50, 64)            24640     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 1

In [None]:
# Train on the full padded train matrix with the three callbacks; no validation data is passed.
model.fit(x=X_pad.values, y=df_train.sentiment.values, batch_size=6000, epochs=2000, callbacks=[tensorboard_callback, 
                                                                                                early_stop, checkpoint_callback])
# NOTE(review): `.values` is needed when X_pad is the DataFrame loaded from CSV — presumably
# what the original note "Write X_pad.values if it's trained" referred to.

Train on 75000 samples
Epoch 1/2000
INFO:tensorflow:batch_all_reduce: 14 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:batch_all_reduce: 14 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2

In [51]:
# Restore the weights written by the checkpoint callback.
model.load_weights(path_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f63cc39fad0>

In [55]:
# Loss and accuracy on the same data the model was fitted on (not a held-out set).
model.evaluate(X_pad.values, df_train.sentiment.values)



[0.0007861169725145738, 0.99998665]

In [52]:
#Submit the results to kaggle
# Sequential.predict_classes is deprecated and removed in later TensorFlow
# releases. For a single sigmoid output it thresholds the predicted
# probability at 0.5, which is reproduced explicitly here.
X_submit = (model.predict(X_pad_test.values) > 0.5).astype('int32')
# Pair each test id with its predicted label by row position.
submission = pd.merge(df_test.id, pd.DataFrame(X_submit, columns=['sentiment']), left_index=True, right_index=True)
submission.to_csv('submission_w2v', index=False)

# Neural Network Glove2vec Pre trained

In [None]:
# Classifier that uses the pre-trained GloVe matrix as a frozen embedding layer.
model_glove = Sequential()
# input_dim is vocabulary size + 1, matching the number of rows allocated for embedding_matrix.
model_glove.add(Embedding(input_dim = len(fourgram_model.wv.vectors) + 1,
                   output_dim = EMBEDDING_DIM,
                   input_length = input_length,
                   weights = [embedding_matrix],
                   trainable = False))
# model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 3, activation='relu', padding='same'))
model_glove.add(MaxPool1D(pool_size=4))
model_glove.add(Bidirectional(LSTM(128, recurrent_dropout=0.1)))
model_glove.add(Dropout(0.25))
# No activation argument is passed to this Dense layer.
model_glove.add(Dense(64))
model_glove.add(Dropout(0.3))
# Single sigmoid unit for the binary sentiment label.
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_glove.summary()

In [None]:
# Train the GloVe model on the full train matrix; no validation data or callbacks are passed.
model_glove.fit(x=X_pad.values, y=df_train.sentiment.values, batch_size=100, epochs=20)

In [None]:
# Persist the trained GloVe model to an .h5 file.
# NOTE(review): absolute, machine-specific path.
model_glove.save('/home/alberto/Escritorio/pycharm/NLP/ignore_files/sentyment_model_G2V_trained.h5')

In [None]:
# Reload the weights from the .h5 file written by the save cell.
model_glove.load_weights('/home/alberto/Escritorio/pycharm/NLP/ignore_files/sentyment_model_G2V_trained.h5')

In [None]:
#Submit the results to kaggle
# Sequential.predict_classes is deprecated and removed in later TensorFlow
# releases. For a single sigmoid output it thresholds the predicted
# probability at 0.5, which is reproduced explicitly here.
X_submit = (model_glove.predict(X_pad_test.values) > 0.5).astype('int32')
# Pair each test id with its predicted label by row position.
submission = pd.merge(df_test.id, pd.DataFrame(X_submit, columns=['sentiment']), left_index=True, right_index=True)
submission.to_csv('submission_G2V', index=False)
# Kaggle reported 98% accuracy for this submission, which is great.