In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("final_dataset.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,TweetText,sentiment,text_clean,text_len,token_lens
0,0,appartment old need make holes need owner agre...,0,appartment old need make holes need owner agre...,11,16
1,1,castlenes havent you dislike handwriting im he...,1,castlenes havent you dislike handwriting im he...,7,11
2,2,guavawrite good see you ill get bigmouths foll...,1,guavawrite good see you ill get bigmouths foll...,9,15
3,3,goodnight space mountain spending next week pa...,0,goodnight space mountain spending next week pa...,12,17
4,4,frankmusik big load grey boobs hope grey thing...,1,frankmusik big load grey boobs hope grey thing...,13,18


In [4]:
#choose batch size
BATCH_SIZE = 32

#how many epochs?
EPOCHS = 8

#clean Tweets?
CLEAN_TWEETS = False

#use meta data?
USE_META = True

#add dense layer?
ADD_DENSE = False
DENSE_DIM = 64

#add dropout?
ADD_DROPOUT = True
DROPOUT = .2

#train BERT base model? 
TRAIN_BASE = True

In [5]:
#the basics
import os, re, math, string, pandas as pd, numpy as np, seaborn as sns

#graphing
import matplotlib.pyplot as plt

#deep learning
import tensorflow as tf

#nlp
from wordcloud import STOPWORDS

#scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
seed = 42

In [6]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['text_clean']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1));
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text_clean', 'sentiment']);

In [7]:
X = train_os['text_clean'].values
y = train_os['sentiment'].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=seed)

In [9]:
from transformers import BertTokenizer
import tensorflow as tf
from transformers import TFBertModel, BertModel

In [10]:
#get BERT layer
bert_base = TFBertModel.from_pretrained('bert-base-uncased')
#bert_base = BertModel.from_pretrained('bert-base-uncased')          #to use with PyTorch

#select BERT tokenizer
TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [11]:

def bert_encode(data,maximum_len) :
    input_ids = []
    attention_masks = []
  

    for i in range(len(data)):
        encoded = TOKENIZER.encode_plus(data[i],
                                        add_special_tokens=True,
                                        max_length=maximum_len,
                                        pad_to_max_length=True,
                                        return_attention_mask=True)
      
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        
    return np.array(input_ids),np.array(attention_masks)

In [16]:
def build_model(model_layer, learning_rate, use_meta = USE_META, add_dense = ADD_DENSE,
               dense_dim = DENSE_DIM, add_dropout = ADD_DROPOUT, dropout = DROPOUT):
    
    #define inputs
    input_ids = tf.keras.Input(shape=(60,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(60,),dtype='int32')
    # meta_input = tf.keras.Input(shape = (meta_train.shape[1], ))
    
    #insert BERT layer
    transformer_layer = model_layer([input_ids,attention_masks])
    
    #choose only last hidden-state
    output = transformer_layer[1]
    
    #add meta data
    # if use_meta:
    #     output = tf.keras.layers.Concatenate()([output, meta_input])
        
    
    #add dense relu layer
    if add_dense:
        print("Training with additional dense layer...")
        output = tf.keras.layers.Dense(dense_dim,activation='relu')(output)
    
    #add dropout
    if add_dropout:
        print("Training with dropout...")
        output = tf.keras.layers.Dropout(dropout)(output)
    
    #add final node for binary classification
    output = tf.keras.layers.Dense(1,activation='sigmoid')(output)
    
    #assemble and compile
#     if use_meta:
#         print("Training with meta-data...")
#         model = tf.keras.models.Model(inputs = [input_ids,attention_masks, meta_input],outputs = output)
    
#     else:
    print("Training without meta-data...")
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)

    model.compile(tf.keras.optimizers.Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [13]:
if TRAIN_BASE:
    #get our inputs
    print('Encoding Tweets...')
    train_input_ids,train_attention_masks = bert_encode(X_train,60)
    test_input_ids,test_attention_masks = bert_encode(X_test,60)
    print('Tweets encoded')
    print('')

    #debugging step
    print('Train length:', len(train_input_ids))
    print('Test length:', len(test_input_ids))


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Encoding Tweets...




Tweets encoded

Train length: 27336
Test length: 3038


In [17]:
#and build and view parameters
BERT_base = build_model(bert_base, learning_rate = 1e-5)
BERT_base.summary()

Training with dropout...
Training without meta-data...
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
dropout_38 (Dropout)            (None, 

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [18]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('base_model.h5', monitor='val_loss', save_best_only = True, save_weights_only = True)

In [19]:
#train BERT
if TRAIN_BASE:
    # if USE_META:
    #     history = BERT_base.fit([train_input_ids,train_attention_masks, meta_train], train.target, validation_split = .2, epochs = EPOCHS, callbacks = [checkpoint], batch_size = BATCH_SIZE)
    
    # else:
    history = BERT_base.fit([train_input_ids,train_attention_masks], y_train, validation_split = .2, epochs = EPOCHS, callbacks = [checkpoint], batch_size = BATCH_SIZE)   

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [23]:
#predict with BERT
if TRAIN_BASE:
    preds_base = BERT_base.predict([test_input_ids,test_attention_masks])

In [24]:
test = pd.DataFrame({'text': X_test, 'sentiment': y_test})
test.head()

Unnamed: 0,text,sentiment
0,absolutely loving new american rejects album,1
1,fearnecotton heyy could play either well fight...,1
2,thefrasermills sorry like seeing sun try talk ...,1
3,picked puppys crap annoyed poops pee,0
4,coughing lung ive since xmas,0


In [25]:
test_id = test.index.tolist()

In [26]:
if TRAIN_BASE:
    submission_base = pd.DataFrame()
    submission_base['id'] = test_id
    submission_base['prob'] = preds_base
    submission_base['target'] = np.round(submission_base['prob']).astype(int)
    submission_base.head(10)

In [27]:
if TRAIN_BASE:
    submission_base = submission_base[['id', 'target']]

    #save to disk
    submission_base.to_csv('submission_bert_base.csv', index = False)
    print('Submission saved')

Submission saved


In [29]:
df_sub = pd.read_csv("submission_bert_base.csv")

Unnamed: 0,id,target
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0
...,...,...
3033,3033,0
3034,3034,1
3035,3035,0
3036,3036,1
