In [1]:
# General-purpose library imports.
import pandas as pd
import numpy as np
import random
import re

pd.options.display.float_format = '{:20,.2f}'.format # show floats as fixed-point with thousands separators and 2 decimals (no scientific notation)

import warnings
# Silences every warning for the rest of the session, including deprecation warnings.
warnings.filterwarnings('ignore')

In [2]:
import tensorflow_hub as hub
import tensorflow as tf
from bert import bert_tokenization
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
# Short alias for bert_tokenization.FullTokenizer.
FullTokenizer = bert_tokenization.FullTokenizer
# BERT encoder loaded from TF Hub; trainable=True so its weights are updated when the model is fit.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)
# Every encoded sequence is truncated or zero-padded to this many tokens.
max_seq_length = 128

In [3]:
# Read the train and test CSVs (same order as the targets on the left-hand side).
train_data, test_data = map(
    pd.read_csv,
    (
        '~/Documents/Datos/DataSets/TP2/train_super_featured.csv',
        '~/Documents/Datos/DataSets/TP2/test_super_featured.csv',
    ),
)

In [4]:
# Replace missing keyword/location values with explicit placeholder labels.
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series and does not update the frame under pandas Copy-on-Write.
for frame in (train_data, test_data):
    frame['keyword_original'] = frame['keyword_original'].fillna('no_keyword')
    frame['location_original'] = frame['location_original'].fillna('no_location')

In [5]:
# Replace missing values in every text column with the empty string, in both
# the train and test frames.
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna('', inplace=True)`: that chained in-place form operates on
# a temporary Series and does not update the frame under pandas Copy-on-Write.
for frame in (train_data, test_data):
    for column in ('text_original', 'clean_text', 'super_clean_text',
                   'kaggle_text', 'semi_cleaned_text'):
        frame[column] = frame[column].fillna('')

In [6]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Returns a list with one 1 per token that survives truncation to
    ``max_seq_length``, followed by 0s for any remaining padding positions.
    """
    kept = tokens[:max_seq_length]
    padding = max_seq_length - len(kept)
    return [1] * len(kept) + [0] * padding

def get_segments(tokens, max_seq_length):
    """Build the segment-id list for a token sequence.

    Tokens up to and including the first "[SEP]" get id 0 and every later
    token gets id 1. The sequence is truncated to ``max_seq_length`` tokens
    first; shorter sequences are padded with 0s up to that length.
    """
    kept = tokens[:max_seq_length]
    segment_ids = []
    separator_seen = False
    for token in kept:
        segment_ids.append(1 if separator_seen else 0)
        # The separator itself still belongs to the segment it closes.
        if token == "[SEP]":
            separator_seen = True
    return segment_ids + [0] * (max_seq_length - len(kept))

def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids with a fixed output length.

    Args:
        tokens: list of token strings (callers in this file wrap the text
            as "[CLS]" ... "[SEP]").
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: target length of the returned list.

    Returns:
        A list of ids. Sequences shorter than ``max_seq_length`` are padded
        with 0; longer ones are truncated. When a truncated sequence
        originally ended with "[SEP]", the last kept position is replaced by
        "[SEP]" so the closing separator is not silently cut off.
    """
    if len(tokens) > max_seq_length:
        ends_with_separator = bool(tokens) and tokens[-1] == "[SEP]"
        # Slicing copies, so the caller's list is never mutated.
        kept = list(tokens[:max_seq_length])
        if ends_with_separator and kept:
            kept[-1] = "[SEP]"
        return tokenizer.convert_tokens_to_ids(kept)

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_seq_length - len(token_ids))

In [7]:

def prep(s, get = 'id'):
    """Tokenize one string and return a single encoded feature for it.

    The text is tokenized with the module-level ``tokenizer`` and wrapped in
    "[CLS]" / "[SEP]". ``get`` selects what is returned: 'id' for token ids,
    'mask' for the padding mask, and any other value for the segment ids.
    """
    wrapped = ["[CLS]"] + tokenizer.tokenize(s) + ["[SEP]"]
    if get == 'id':
        return get_ids(wrapped, tokenizer, max_seq_length)
    if get == 'mask':
        return get_masks(wrapped, max_seq_length)
    # Every other selector value falls through to the segment ids.
    return get_segments(wrapped, max_seq_length)

In [8]:
# Build the tokenizer from the vocabulary file and lower-casing flag that are
# stored inside the loaded hub module, so both come from the same source as the encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [10]:
# Three int32 inputs of length max_seq_length: token ids, padding mask and segment ids.
input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name='segment_ids')    

# The hub layer returns a pooled output and a per-token sequence output; only the latter is used.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])   
# Take the vector at position 0 of each sequence (where encode() places "[CLS]").
clf_output = sequence_output[:, 0, :]
# Single sigmoid unit: the model outputs one probability per tweet.
out = Dense(1, activation='sigmoid')(clf_output)

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
# Trained with SGD + momentum; the imported Adam optimizer is not used in the visible cells.
optimizer = SGD(learning_rate=0.001, momentum=0.8)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [11]:
def encode(tweets):
    """Encode an iterable of texts into the three BERT input arrays.

    Each text is tokenized with the module-level ``tokenizer``, wrapped in
    "[CLS]" / "[SEP]", and converted to ids, padding mask and segment ids of
    length ``max_seq_length``.

    Returns:
        A tuple ``(ids, masks, segments)`` of numpy arrays with one row per text.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    for text in tweets:
        wrapped = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
        id_rows.append(get_ids(wrapped, tokenizer, max_seq_length))
        mask_rows.append(get_masks(wrapped, max_seq_length))
        segment_rows.append(get_segments(wrapped, max_seq_length))

    return tuple(np.array(rows) for rows in (id_rows, mask_rows, segment_rows))

In [12]:
# Encode the lower-cased 'super_clean_text' of the training set into the
# (ids, masks, segments) arrays returned by encode().
tweets_encoded = encode(train_data['super_clean_text'].str.lower())

In [13]:
# Stop training once the training loss has not improved by at least 0.001 for 8 epochs.
# NOTE(review): the fit call in this notebook runs only 5 epochs, so a patience of 8
# can never trigger; confirm whether early stopping is meant to be active.
early_stopping = EarlyStopping(monitor='loss', min_delta=0.001, patience=8, verbose=1)
callbacks_list = [early_stopping]

In [14]:
# Fine-tune on the encoded training tweets against the 'target_relabeled' column.
# No validation data is passed, so only training loss/accuracy are reported per epoch.
model.fit(tweets_encoded, train_data['target_relabeled'], callbacks=callbacks_list, epochs=5, batch_size=64)


Epoch 1/5




Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5






<tensorflow.python.keras.callbacks.History at 0x1444f4358>

In [15]:
# Encode the test tweets exactly like the training tweets (lower-cased 'super_clean_text').
test_encoded = encode(test_data['super_clean_text'].str.lower())

In [17]:
# Sigmoid outputs of the model for the encoded test tweets (one value per tweet).
y_pred = model.predict(test_encoded)

In [18]:
# Display the raw (unrounded) predictions.
y_pred

array([[0.87771785],
       [0.87212855],
       [0.8944807 ],
       ...,
       [0.915142  ],
       [0.85924244],
       [0.94016445]], dtype=float32)

In [19]:
# Load the test-set file that carries 'id' and 'target' columns; 'target' is the
# reference label the predictions are scored against.
# NOTE(review): presumably the rows are in the same order as test_data; verify.
df_liked = pd.read_csv('~/Documents/Datos/DataSets/TP2/test_with_targets.csv', dtype={'id': np.int16, 'target': np.int8})


In [20]:
# Turn the predicted probabilities into integer labels, then show the fraction
# that match the reference targets (row-by-row, positional comparison).
# NOTE(review): assumes df_liked rows are ordered like the predictions; confirm.
y_pred = y_pred.round().astype('int')
np.mean(y_pred.ravel() == df_liked['target'])

0.7977321483297579