## Importing the necessary libraries

In [None]:
# Plotting, numerics and data handling.
import matplotlib.pyplot as plt
import math
import pandas as pd
import numpy as np
# scikit-learn: metric, vectorizer, splitting and the linear model.
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# NLTK and the resources fetched for it at import time.
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import nltk
# 'stopwords' is read by the data-cleaning cell via nltk.corpus.stopwords.
nltk.download('stopwords')
# 'punkt' is presumably what nltk.tokenize.word_tokenize relies on in the
# word-frequency cell -- TODO confirm against the installed NLTK version.
nltk.download('punkt')
# NOTE(review): 'movie_reviews' and 'wordnet' are not referenced by any cell
# visible in this notebook section.
nltk.download('movie_reviews')
nltk.download('wordnet')
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# NOTE(review): nltk is imported a second time here (already imported above).
import nltk
import re
from sklearn.utils import shuffle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing the dataset

In [None]:
# Load the tab-separated training file. It has no header row, so the two
# columns are named explicitly: the review text and its label.
df = pd.read_csv(
    'train.txt',
    delimiter="\t",
    header=None,
    names=['raw_text', 'label'],
    encoding="utf-8",
)

# Preview the first rows.
df.head()

Unnamed: 0,raw_text,label
0,"béart and berling are both superb , while hupp...",1
1,not only a coming-of-age story and cautionary ...,1
2,"at the end of the movie , my 6-year-old nephew...",1
3,"even if you can't pronounce "" gyro "" correctly...",1
4,an immensely entertaining look at some of the ...,1


## Data cleaning

In [None]:
# Build df['clean_text'] from df['raw_text'] through a fixed sequence of
# text-normalisation steps. The order matters: stop words are removed before
# punctuation is stripped, and stemming runs last.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# 1. Lower-case every whitespace-separated token (this also collapses runs of
#    whitespace into single spaces).
df['clean_text'] = df['raw_text'].apply(lambda text: " ".join(tok.lower() for tok in text.split()))

# 2. Expand contractions: the literal substring "'t" becomes " not"
#    (e.g. "can't" -> "can not"). regex=False makes the literal match explicit.
df['clean_text'] = df['clean_text'].str.replace("'t", ' not', regex=False)

# 3. Remove NLTK's English stop words. `stop` stays a list for any later use;
#    membership is tested against a set so each lookup is O(1).
stop = stopwords.words('english')
stop_set = set(stop)
df['clean_text'] = df['clean_text'].apply(lambda text: " ".join(tok for tok in text.split() if tok not in stop_set))

# 4. Remove additional common / domain words.
# NOTE(review): the entries ' not' and ' I' start with a space, so they can
# never equal a token produced by str.split(); they are kept unchanged here.
common_words = ['a','are','movie','film',' not','see','time','about', 'am', 'an', 'and', 'any', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'but', 'by', 'could', 'did', 'do', 'does', 'doing', 'during', 'each', 'for', 'from', 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', ' I', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'me', 'my', 'myself', 'nor', 'of', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'she', 'should', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'with', 'would', 'you', 'your', 'yours', 'yourself', 'yourselves']
common_set = set(common_words)
df['clean_text'] = df['clean_text'].apply(lambda text: " ".join(tok for tok in text.split() if tok not in common_set))

# 5. Replace punctuation with a space.
#    regex=True is required: pandas >= 2.0 treats the pattern as a literal
#    string by default, which would silently leave all punctuation in place.
df['clean_text'] = df['clean_text'].str.replace(r'[^\w\s]', ' ', regex=True)

# 6. Replace every character that is not an ASCII letter or '#' with a space
#    (digits, underscores, accented letters, ...).
df['clean_text'] = df['clean_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

# 7. Normalise whitespace: single spaces, no leading/trailing blanks.
df['clean_text'] = df['clean_text'].apply(lambda text: " ".join(tok.strip() for tok in text.split()))

# 8. Drop short words: only tokens of 3 or more characters are kept.
df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join([tok for tok in text.split() if len(tok) > 2]))

# 9. Reduce every remaining token to its Porter stem.
st = PorterStemmer()
df['clean_text'] = df['clean_text'].apply(lambda text: " ".join([st.stem(tok) for tok in text.split()]))

# Let's check the first rows
df['clean_text'].head()

0                     art berl superb huppert magnific
1    come age stori cautionari parabl also perfectl...
2    end year old nephew said guess come broken fam...
3    even pronounc gyro correctli appreci much vard...
4    immens entertain look unsung hero centuri pop ...
Name: clean_text, dtype: object

### Most occurring words

In [None]:
# Frequency table of the 100 most frequent tokens in the cleaned corpus.

# Concatenate all cleaned reviews into one space-separated string and tokenize it.
a = df['clean_text'].str.lower().str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(a)

# Count the tokens and keep the 100 most common (word, count) pairs.
word_dist_1 = nltk.FreqDist(words)
top_100 = word_dist_1.most_common(100)

rslt_1 = pd.DataFrame(top_100, columns=['Word', 'Frequency'])
print(rslt_1)


        Word  Frequency
0       like        666
1        one        645
2       make        486
3      stori        440
4    charact        391
5     comedi        338
6       good        337
7       even        314
8       much        313
9       work        301
10      film        300
11      feel        285
12   perform        281
13       way        271
14      well        267
15      movi        267
16  director        263
17      love        256
18       get        254
19      look        244
20     littl        243
21     funni        236
22      come        229
23      life        211
24     never        209
25      take        203
26    enough        203
27      best        200
28       end        197
29       bad        197
..       ...        ...
70      kind        132
71     world        131
72    though        131
73    pictur        130
74      time        130
75  thriller        130
76   without        130
77       fun        128
78     enjoy        127
79     heart    

In [None]:
# Remove the 5 most frequent tokens of the cleaned corpus from every review.
# value_counts() sorts by descending count, so the first 5 rows are the top 5.
freq_most_occuring_words = pd.Series(' '.join(df['clean_text']).split()).value_counts().iloc[:5]

# The words are the index labels of the counts Series; membership is tested
# against that index (which is also what `in` on the Series itself checks).
top_tokens = freq_most_occuring_words.index
df['clean_text'] = df['clean_text'].apply(lambda text: " ".join(tok for tok in text.split() if tok not in top_tokens))

### Least occurring words

In [None]:
# Remove the 4500 least frequent tokens from every review in order to reduce noise.
# value_counts() sorts by descending count, so the last 4500 rows are the rarest.
freq_least_occuring_word = pd.Series(' '.join(df['clean_text']).split()).value_counts().iloc[-4500:]

# The words are the index labels of the counts Series; membership is tested
# against that index (which is also what `in` on the Series itself checks).
rare_tokens = freq_least_occuring_word.index
df['clean_text'] = df['clean_text'].apply(lambda text: " ".join(tok for tok in text.split() if tok not in rare_tokens))


## Model development

In [None]:
# Creating the feature matrix and the target vector
df_x = df['clean_text'].tolist()
y = df['label'].tolist()


##################### Using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams; binary=True makes the term-frequency part 0/1.
vectorizer = TfidfVectorizer(binary=True, ngram_range=(1,2))

# fit_transform learns the vocabulary/IDF weights and returns the document-term
# matrix in a single pass, so a separate vectorizer.fit(df_x) beforehand would
# only repeat the same work.
# NOTE(review): the vectorizer is fitted on the full corpus before the split
# below, so vocabulary and IDF weights are learned from the test rows as well.
X = vectorizer.fit_transform(df_x)


##################### Split the dataset
# 90% train / 10% test, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=100)

In [None]:
# Parallel accumulators filled by the model cells below:
#   f1_scores[i] is the test-set F1 score of the model named models[i].
f1_scores, models = [], []

**Logistic Regression**

In [None]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# using the gridsearchCV to have the best hyperparameters
from sklearn.model_selection import GridSearchCV

# Candidate values for the inverse regularisation strength C: 1, 1.5, ..., 10.
param_LR = [{'C': [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10]}]
grid_LR = GridSearchCV(estimator = LogisticRegression(solver = 'lbfgs', max_iter = 10000), param_grid = param_LR, cv = 10, scoring='f1')

# Tune on the training split only. Searching over the full (X, y) would let the
# held-out test rows influence the choice of C and inflate the reported score.
grid_LR.fit(X_train, y_train)

C_param = grid_LR.best_params_

# Using the best params from the gridsearchCV to compute the f1_score of the logistic regression
model_LR = LogisticRegression(solver = 'lbfgs', max_iter = 10000, C=C_param['C'])
model_LR.fit(X_train, y_train)

# Evaluate on the held-out split and record the result for the model comparison.
Y_predict = model_LR.predict(X_test)

f1_scores.append(f1_score(y_test, Y_predict))
models.append('Logistic Regression')

**Decision Tree**

In [None]:
####  Decision Tree
from sklearn.tree import DecisionTreeClassifier


## using gridsearchCv to optimize the hyperparameters
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: 1 .. 19.
parameter_DT={'max_depth': range(1,20,1)}
clf_tree=DecisionTreeClassifier()
clf=GridSearchCV(clf_tree,parameter_DT, cv = 10, scoring= 'f1')

# Tune on the training split only. Searching over the full (X, y) would let the
# held-out test rows influence the chosen depth and inflate the reported score.
clf.fit(X_train, y_train)

depth = clf.best_params_

# Using the best params from the gridsearchCV to compute the f1_score of the decision tree
model_DT = DecisionTreeClassifier(max_depth = depth['max_depth'])
model_DT.fit(X_train, y_train)

# Evaluate on the held-out split and record the result for the model comparison.
Y_predict = model_DT.predict(X_test)

f1_scores.append(f1_score(y_test, Y_predict))
models.append('Decision Tree')



**Random Forest**

In [None]:
#### Random Forest
from sklearn.ensemble import RandomForestClassifier

# 100 trees, each limited to depth 40. No random_state is passed.
model_RF = RandomForestClassifier(n_estimators=100, max_depth=40)
model_RF.fit(X_train, y_train)

# Score on the held-out split and record the result for the model comparison.
Y_predict = model_RF.predict(X_test)
rf_f1 = f1_score(y_test, Y_predict)

f1_scores.append(rf_f1)
models.append('Random Forest')

**Gradient Tree Boosting**

In [None]:
#### GradientTree Boosting
from sklearn.ensemble import GradientBoostingClassifier

# 100 boosting stages with depth-10 trees and learning rate 1.0.
model_GTB = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=10,
)
model_GTB.fit(X_train, y_train)

# Score on the held-out split and record the result for the model comparison.
Y_predict = model_GTB.predict(X_test)
gtb_f1 = f1_score(y_test, Y_predict)

f1_scores.append(gtb_f1)
models.append('Gradient Tree Boosting')

**Support Vector Machine**

In [None]:
#### SVC
from sklearn.svm import SVC

# RBF-kernel support vector classifier with a fixed gamma of 0.1.
model_svc = SVC(kernel='rbf', gamma=0.1).fit(X_train, y_train)

# Score on the held-out split and record the result for the model comparison.
Y_predict = model_svc.predict(X_test)
svc_f1 = f1_score(y_test, Y_predict)

f1_scores.append(svc_f1)
models.append('Support Vector')

**Ensemble Method**

In [None]:
#### Ensemble Method
from sklearn.ensemble import VotingClassifier

# Base learners: a logistic regression without intercept and an RBF SVC that
# exposes class probabilities (needed for soft voting).
clf_1 = LogisticRegression(fit_intercept=False, C=1, solver='lbfgs', max_iter=10000)
clf_2 = SVC(kernel='rbf', gamma=1, probability=True)

# Soft voting: average the two predicted probability vectors with equal weights.
eclf = VotingClassifier(
    estimators=[('LR', clf_1), ('svc', clf_2)],
    voting='soft',
    weights=[1, 1],
)

# Fit both base learners and then the ensemble, in that order.
# NOTE(review): VotingClassifier presumably fits its own clones of the
# estimators, so the stand-alone fits of clf_1 / clf_2 only matter if those
# objects are used directly elsewhere -- confirm before removing them.
for estimator in (clf_1, clf_2, eclf):
    estimator.fit(X_train, y_train)

# Score on the held-out split and record the result for the model comparison.
Y_predict = eclf.predict(X_test)
ensemble_f1 = f1_score(y_test, Y_predict)

f1_scores.append(ensemble_f1)
models.append('Ensemble method')

**Neural Network**

In [None]:
#### Using Neural Network
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with scikit-learn's default hyper-parameters.
model_nn = MLPClassifier().fit(X_train, y_train)

# Score on the held-out split and record the result for the model comparison.
Y_predict = model_nn.predict(X_test)
nn_f1 = f1_score(y_test, Y_predict)

f1_scores.append(nn_f1)
models.append('Neural Network')

**Deep Learning - BERT Model -----> The GPU option needs to be used to accelerate the computation**

In [None]:
# Importing additional libraries

import re
from sklearn.utils import shuffle
from tensorflow import keras
import os
import re
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

!pip install bert-tensorflow

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization



In [None]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = 'Predictions'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = False #@param {type:"boolean"}
BUCKET = 'BUCKET_NAME' #@param {type:"string"}

if USE_BUCKET:
    # Redirect the output to the bucket and authenticate the Colab user.
    OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
    from google.colab import auth
    auth.authenticate_user()

if DO_DELETE:
    try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except tf.errors.NotFoundError:
        # Doesn't matter if the directory didn't exist; any other failure
        # (e.g. permissions) is allowed to surface instead of being swallowed.
        pass

# (Re)create the output directory and report where results will be written.
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: Predictions *****


In [None]:
from tensorflow import keras
import os
import re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read every review file in `directory` into a DataFrame.

    File names are expected to start with ``<digits>_<digits>.txt``; the second
    number is stored (as a string) in the "sentiment" column and the file
    content in the "sentence" column. Directory entries whose name does not
    match that pattern are skipped instead of raising AttributeError.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        pandas.DataFrame with the columns "sentence" and "sentiment", one row
        per matching file, in os.listdir() order.
    """
    # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    for file_path in os.listdir(directory):
        name_match = name_pattern.match(file_path)
        if name_match is None:
            # Not a review file (stray or hidden file): ignore it.
            continue
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(name_match.group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Load the "pos" and "neg" sub-directories of `directory` as one frame.

    Rows from "pos" get polarity 1 and rows from "neg" get polarity 0. The
    combined frame is shuffled with sample(frac=1) (unseeded) and re-indexed
    from 0.
    """
    frames = []
    for subdir, polarity in (("pos", 1), ("neg", 0)):
        frame = load_directory_data(os.path.join(directory, subdir))
        frame["polarity"] = polarity
        frames.append(frame)
    shuffled = pd.concat(frames).sample(frac=1)
    return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the aclImdb archive and return (train_df, test_df).

    The archive is obtained (and extracted) through tf.keras.utils.get_file;
    the "aclImdb/train" and "aclImdb/test" folders next to the downloaded file
    are then loaded with load_dataset.

    NOTE(review): `force_download` is accepted but never read by this function.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)

    # The extracted corpus sits in an "aclImdb" folder beside the archive.
    corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df = load_dataset(os.path.join(corpus_root, "train"))
    test_df = load_dataset(os.path.join(corpus_root, "test"))

    return train_df, test_df

In [None]:
# Train/test split for the BERT model: 70% train / 30% test on the full frame
# (BERT consumes the raw text column, not the TF-IDF matrix).
# Seeded like the earlier TF-IDF split so that re-running the notebook yields
# the same partition and comparable scores.
train1, test1 = train_test_split(df, test_size=0.3, random_state=100)

In [None]:
# Column of the data frame whose text is handed to BERT as `text_a`
# (the raw review text, not the cleaned/stemmed column).
DATA_COLUMN = 'raw_text'
# Column of the data frame holding the class label of each review.
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

In [None]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
def _to_input_example(row):
    """Wrap one data-frame row as a single-sentence BERT InputExample."""
    return bert.run_classifier.InputExample(
        guid=None,  # Globally unique ID for bookkeeping, unused in this example
        text_a=row[DATA_COLUMN],
        text_b=None,  # single-sentence task: there is no second segment
        label=row[LABEL_COLUMN],
    )


# Row-wise conversion of both splits.
train_InputExamples = train1.apply(_to_input_example, axis=1)
test_InputExamples = test1.apply(_to_input_example, axis=1)

In [None]:
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"


def create_tokenizer_from_hub_module():
    """Build a FullTokenizer from the vocab file and casing flag of the Hub module."""
    # Evaluate the module's "tokenization_info" signature in a throw-away graph.
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        fetches = [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(fetches)

    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file,
        do_lower_case=do_lower_case,
    )


tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0407 18:40:36.066641 140301468497792 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [None]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128


def _to_features(examples):
    """Convert InputExamples to the InputFeatures that BERT understands."""
    return bert.run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)


# Convert the train split first, then the test split.
train_features = _to_features(train_InputExamples)
test_features = _to_features(test_InputExamples)

INFO:tensorflow:Writing example 0 of 5971


I0407 18:40:36.733700 140301468497792 run_classifier.py:774] Writing example 0 of 5971


INFO:tensorflow:*** Example ***


I0407 18:40:36.746382 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:36.751572 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] nothing sticks , really , except a lingering creep ##iness one feels from being dragged through a sad , so ##rdi ##d universe of guns , drugs , ava ##rice and damaged dreams . [SEP]


I0407 18:40:36.756379 140301468497792 run_classifier.py:464] tokens: [CLS] nothing sticks , really , except a lingering creep ##iness one feels from being dragged through a sad , so ##rdi ##d universe of guns , drugs , ava ##rice and damaged dreams . [SEP]


INFO:tensorflow:input_ids: 101 2498 12668 1010 2428 1010 3272 1037 15304 19815 9961 2028 5683 2013 2108 7944 2083 1037 6517 1010 2061 17080 2094 5304 1997 4409 1010 5850 1010 10927 17599 1998 5591 5544 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.763004 140301468497792 run_classifier.py:465] input_ids: 101 2498 12668 1010 2428 1010 3272 1037 15304 19815 9961 2028 5683 2013 2108 7944 2083 1037 6517 1010 2061 17080 2094 5304 1997 4409 1010 5850 1010 10927 17599 1998 5591 5544 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.767893 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.772761 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0407 18:40:36.777930 140301468497792 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0407 18:40:36.784984 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:36.789668 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] where tom green stages his gag ##s as assaults on america ' s knee - jerk moral san ##ct ##imo ##ny , jack ##ass lacks aspirations of social up ##hea ##val . [SEP]


I0407 18:40:36.795941 140301468497792 run_classifier.py:464] tokens: [CLS] where tom green stages his gag ##s as assaults on america ' s knee - jerk moral san ##ct ##imo ##ny , jack ##ass lacks aspirations of social up ##hea ##val . [SEP]


INFO:tensorflow:input_ids: 101 2073 3419 2665 5711 2010 18201 2015 2004 22664 2006 2637 1005 1055 6181 1011 12181 7191 2624 6593 16339 4890 1010 2990 12054 14087 22877 1997 2591 2039 20192 10175 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.801237 140301468497792 run_classifier.py:465] input_ids: 101 2073 3419 2665 5711 2010 18201 2015 2004 22664 2006 2637 1005 1055 6181 1011 12181 7191 2624 6593 16339 4890 1010 2990 12054 14087 22877 1997 2591 2039 20192 10175 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.814836 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.819594 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0407 18:40:36.824675 140301468497792 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0407 18:40:36.830924 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:36.834290 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] me ##y ##je ##s ' provocative film might be called an example of the ha ##pha ##zard ##ness of evil . [SEP]


I0407 18:40:36.837817 140301468497792 run_classifier.py:464] tokens: [CLS] me ##y ##je ##s ' provocative film might be called an example of the ha ##pha ##zard ##ness of evil . [SEP]


INFO:tensorflow:input_ids: 101 2033 2100 6460 2015 1005 26422 2143 2453 2022 2170 2019 2742 1997 1996 5292 21890 26154 2791 1997 4763 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.840950 140301468497792 run_classifier.py:465] input_ids: 101 2033 2100 6460 2015 1005 26422 2143 2453 2022 2170 2019 2742 1997 1996 5292 21890 26154 2791 1997 4763 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.843803 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.847199 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0407 18:40:36.851093 140301468497792 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:*** Example ***


I0407 18:40:36.855587 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:36.859011 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] rarely , indeed almost never , is such high - watt ##age brain ##power coupled with pitch - perfect acting and an exquisite , un ##fa ##ka ##ble sense of cinema . [SEP]


I0407 18:40:36.862414 140301468497792 run_classifier.py:464] tokens: [CLS] rarely , indeed almost never , is such high - watt ##age brain ##power coupled with pitch - perfect acting and an exquisite , un ##fa ##ka ##ble sense of cinema . [SEP]


INFO:tensorflow:input_ids: 101 6524 1010 5262 2471 2196 1010 2003 2107 2152 1011 15231 4270 4167 11452 11211 2007 6510 1011 3819 3772 1998 2019 19401 1010 4895 7011 2912 3468 3168 1997 5988 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.865622 140301468497792 run_classifier.py:465] input_ids: 101 6524 1010 5262 2471 2196 1010 2003 2107 2152 1011 15231 4270 4167 11452 11211 2007 6510 1011 3819 3772 1998 2019 19401 1010 4895 7011 2912 3468 3168 1997 5988 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.869169 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.872622 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0407 18:40:36.876027 140301468497792 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:*** Example ***


I0407 18:40:36.880655 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:36.886834 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] a movie that ' s held captive by med ##io ##cr ##ity . not bad , but not all that good . bacon keeps things interesting , but don ' t go out of your way to pay full price . [SEP]


I0407 18:40:36.890172 140301468497792 run_classifier.py:464] tokens: [CLS] a movie that ' s held captive by med ##io ##cr ##ity . not bad , but not all that good . bacon keeps things interesting , but don ' t go out of your way to pay full price . [SEP]


INFO:tensorflow:input_ids: 101 1037 3185 2008 1005 1055 2218 12481 2011 19960 3695 26775 3012 1012 2025 2919 1010 2021 2025 2035 2008 2204 1012 11611 7906 2477 5875 1010 2021 2123 1005 1056 2175 2041 1997 2115 2126 2000 3477 2440 3976 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.893687 140301468497792 run_classifier.py:465] input_ids: 101 1037 3185 2008 1005 1055 2218 12481 2011 19960 3695 26775 3012 1012 2025 2919 1010 2021 2025 2035 2008 2204 1012 11611 7906 2477 5875 1010 2021 2123 1005 1056 2175 2041 1997 2115 2126 2000 3477 2440 3976 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.896831 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:36.900381 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0407 18:40:36.903808 140301468497792 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:Writing example 0 of 2559


I0407 18:40:40.180126 140301468497792 run_classifier.py:774] Writing example 0 of 2559


INFO:tensorflow:*** Example ***


I0407 18:40:40.183442 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:40.189767 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] even if the ring has a familiar ring , it ' s still unusually craft ##y and intelligent for hollywood horror . [SEP]


I0407 18:40:40.194171 140301468497792 run_classifier.py:464] tokens: [CLS] even if the ring has a familiar ring , it ' s still unusually craft ##y and intelligent for hollywood horror . [SEP]


INFO:tensorflow:input_ids: 101 2130 2065 1996 3614 2038 1037 5220 3614 1010 2009 1005 1055 2145 12890 7477 2100 1998 9414 2005 5365 5469 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.199172 140301468497792 run_classifier.py:465] input_ids: 101 2130 2065 1996 3614 2038 1037 5220 3614 1010 2009 1005 1055 2145 12890 7477 2100 1998 9414 2005 5365 5469 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.203392 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.207637 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0407 18:40:40.211919 140301468497792 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:*** Example ***


I0407 18:40:40.217623 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:40.222930 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] a mix of velocity and id ##io ##cy , this ruin ##ous remake lacks the bra ##wn - - and the brains - - of the 1970s original . [SEP]


I0407 18:40:40.229100 140301468497792 run_classifier.py:464] tokens: [CLS] a mix of velocity and id ##io ##cy , this ruin ##ous remake lacks the bra ##wn - - and the brains - - of the 1970s original . [SEP]


INFO:tensorflow:input_ids: 101 1037 4666 1997 10146 1998 8909 3695 5666 1010 2023 10083 3560 12661 14087 1996 11655 7962 1011 1011 1998 1996 14332 1011 1011 1997 1996 3955 2434 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.235943 140301468497792 run_classifier.py:465] input_ids: 101 1037 4666 1997 10146 1998 8909 3695 5666 1010 2023 10083 3560 12661 14087 1996 11655 7962 1011 1011 1998 1996 14332 1011 1011 1997 1996 3955 2434 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.243972 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.251402 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0407 18:40:40.257932 140301468497792 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0407 18:40:40.263833 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:40.268130 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] creepy but ultimately un ##sat ##is ##fying thriller . [SEP]


I0407 18:40:40.272327 140301468497792 run_classifier.py:464] tokens: [CLS] creepy but ultimately un ##sat ##is ##fying thriller . [SEP]


INFO:tensorflow:input_ids: 101 17109 2021 4821 4895 16846 2483 14116 10874 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.276226 140301468497792 run_classifier.py:465] input_ids: 101 17109 2021 4821 4895 16846 2483 14116 10874 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.280487 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.284496 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0407 18:40:40.288113 140301468497792 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0407 18:40:40.295233 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:40.298863 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] it strikes hardest when it reminds you how per ##tine ##nt its dynamics remain . fifty years after the fact , the world ' s political situation seems little different , and [ director phillip ] no ##yce brings out the all ##ego ##ry with remarkable skill . [SEP]


I0407 18:40:40.302541 140301468497792 run_classifier.py:464] tokens: [CLS] it strikes hardest when it reminds you how per ##tine ##nt its dynamics remain . fifty years after the fact , the world ' s political situation seems little different , and [ director phillip ] no ##yce brings out the all ##ego ##ry with remarkable skill . [SEP]


INFO:tensorflow:input_ids: 101 2009 9326 18263 2043 2009 15537 2017 2129 2566 10196 3372 2049 10949 3961 1012 5595 2086 2044 1996 2755 1010 1996 2088 1005 1055 2576 3663 3849 2210 2367 1010 1998 1031 2472 10852 1033 2053 29297 7545 2041 1996 2035 20265 2854 2007 9487 8066 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.306310 140301468497792 run_classifier.py:465] input_ids: 101 2009 9326 18263 2043 2009 15537 2017 2129 2566 10196 3372 2049 10949 3961 1012 5595 2086 2044 1996 2755 1010 1996 2088 1005 1055 2576 3663 3849 2210 2367 1010 1998 1031 2472 10852 1033 2053 29297 7545 2041 1996 2035 20265 2854 2007 9487 8066 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.310254 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.314343 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0407 18:40:40.318211 140301468497792 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:*** Example ***


I0407 18:40:40.322264 140301468497792 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0407 18:40:40.326906 140301468497792 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] you ' re too conscious of the effort it takes to be this spontaneous . [SEP]


I0407 18:40:40.330605 140301468497792 run_classifier.py:464] tokens: [CLS] you ' re too conscious of the effort it takes to be this spontaneous . [SEP]


INFO:tensorflow:input_ids: 101 2017 1005 2128 2205 9715 1997 1996 3947 2009 3138 2000 2022 2023 17630 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.334424 140301468497792 run_classifier.py:465] input_ids: 101 2017 1005 2128 2205 9715 1997 1996 3947 2009 3138 2000 2022 2023 17630 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.338173 140301468497792 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0407 18:40:40.341974 140301468497792 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0407 18:40:40.345842 140301468497792 run_classifier.py:468] label: 0 (id = 0)


In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Builds the classification graph: a TF-Hub BERT module plus one softmax layer.

    Args:
        is_predicting: True when the graph is built for inference. In that case
            dropout is skipped and no loss is returned.
        input_ids: forwarded to the hub module's "tokens" signature.
        input_mask: forwarded to the hub module's "tokens" signature.
        segment_ids: forwarded to the hub module's "tokens" signature.
        labels: integer class ids; one-hot encoded to compute the loss.
        num_labels: number of classes (size of the output layer).

    Returns:
        `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
        `(loss, predicted_labels, log_probs)`. `log_probs` are log-softmax
        values, not probabilities.
    """

    bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
    bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
    bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Task-specific classification layer that is fine-tuned on our labelled data.
    output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting, but at inference time it would
        # randomly zero activations and make predictions non-deterministic,
        # so it is skipped when predicting.
        # NOTE(review): EVAL still goes through dropout because this function
        # only knows `is_predicting`; consider threading an explicit
        # training flag through the callers.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # argmax over the class axis already yields one label per example.
        # No tf.squeeze here: it would collapse a batch of one to a scalar,
        # leaving the predictions without a batch dimension.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        # If we're predicting, we want predicted labels and the log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

    # If we're train/eval, compute the cross-entropy loss between predicted
    # and actual labels, averaged over the batch.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

In [None]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns a `model_fn` closure for `tf.estimator.Estimator`.

    Args:
        num_labels: number of classes, forwarded to `create_model`.
        learning_rate: forwarded to `bert.optimization.create_optimizer`.
        num_train_steps: forwarded to `bert.optimization.create_optimizer`.
        num_warmup_steps: forwarded to `bert.optimization.create_optimizer`.

    Returns:
        A function `model_fn(features, labels, mode, params)` that builds the
        `tf.estimator.EstimatorSpec` for TRAIN, EVAL or PREDICT mode.
    """
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the `EstimatorSpec` for the requested mode.

        Labels are read from `features["label_ids"]`; the `labels` and
        `params` arguments are not used.
        """

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        # PREDICT: no loss, only the model outputs.
        if is_predicting:
            (predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

            predictions = {
                # NOTE: despite the key name these are log-probabilities
                # (log-softmax output); the key is kept for existing callers.
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN and EVAL share the same loss graph.
        (loss, predicted_labels, log_probs) = create_model(
            is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # The optimizer is only needed (and only built) when training.
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # Calculate evaluation metrics (EVAL only).
        def metric_fn(label_ids, predicted_labels):
            """Returns the dict of streaming metric ops reported by `evaluate`."""
            accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
            # Named `f1` locally so the imported sklearn `f1_score` is not shadowed.
            f1 = tf.contrib.metrics.f1_score(
                label_ids,
                predicted_labels)
            # NOTE: AUC is computed on hard 0/1 predictions, not on scores.
            auc = tf.metrics.auc(
                label_ids,
                predicted_labels)
            recall = tf.metrics.recall(
                label_ids,
                predicted_labels)
            precision = tf.metrics.precision(
                label_ids,
                predicted_labels)
            true_pos = tf.metrics.true_positives(
                label_ids,
                predicted_labels)
            true_neg = tf.metrics.true_negatives(
                label_ids,
                predicted_labels)
            false_pos = tf.metrics.false_positives(
                label_ids,
                predicted_labels)
            false_neg = tf.metrics.false_negatives(
                label_ids,
                predicted_labels)
            return {
                "eval_accuracy": accuracy,
                "f1_score": f1,
                "auc": auc,
                "precision": precision,
                "recall": recall,
                "true_positives": true_pos,
                "true_negatives": true_neg,
                "false_positives": false_pos,
                "false_negatives": false_neg
            }

        eval_metrics = metric_fn(label_ids, predicted_labels)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn

In [None]:
# Fine-tuning hyperparameters.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) checkpoints and summaries are saved.
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [None]:
# Total number of training steps: batches per epoch (examples / batch size)
# times the number of epochs, truncated to an integer.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Number of those steps spent in learning-rate warmup.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [None]:
# Estimator run configuration: where checkpoints/summaries are written
# (OUTPUT_DIR) and how often, in steps, each of them is saved.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [None]:
# Build the model function for our label set and training schedule.
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

# Wrap it in an Estimator; "batch_size" is handed to the model/input
# functions through `params`.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': 'Predictions', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9a486907f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0407 18:40:41.935703 140301468497792 estimator.py:201] Using config: {'_model_dir': 'Predictions', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9a486907f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Create an input function for training.
# drop_remainder only needs to be True when running on TPUs; it is False here.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [None]:
# Fine-tune the model and report the wall-clock training time.
print('Beginning Training!')  # plain string: there is nothing to interpolate
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
INFO:tensorflow:Calling model_fn.


I0407 18:40:45.757392 140301468497792 estimator.py:1111] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0407 18:40:50.057102 140301468497792 saver.py:1483] Saver not created because there are no variables in the graph to restore
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.


I0407 18:41:02.431860 140301468497792 estimator.py:1113] Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


I0407 18:41:02.443089 140301468497792 basic_session_run_hooks.py:527] Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


I0407 18:41:04.927006 140301468497792 monitored_session.py:222] Graph was finalized.


INFO:tensorflow:Running local_init_op.


I0407 18:41:10.423558 140301468497792 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0407 18:41:10.720764 140301468497792 session_manager.py:493] Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into Predictions/model.ckpt.


I0407 18:41:23.386939 140301468497792 basic_session_run_hooks.py:594] Saving checkpoints for 0 into Predictions/model.ckpt.


INFO:tensorflow:loss = 0.7249242, step = 0


I0407 18:41:40.746360 140301468497792 basic_session_run_hooks.py:249] loss = 0.7249242, step = 0


INFO:tensorflow:global_step/sec: 0.592396


I0407 18:44:29.551442 140301468497792 basic_session_run_hooks.py:680] global_step/sec: 0.592396


INFO:tensorflow:loss = 0.56756353, step = 100 (168.815 sec)


I0407 18:44:29.561033 140301468497792 basic_session_run_hooks.py:247] loss = 0.56756353, step = 100 (168.815 sec)


INFO:tensorflow:global_step/sec: 0.642459


I0407 18:47:05.203411 140301468497792 basic_session_run_hooks.py:680] global_step/sec: 0.642459


INFO:tensorflow:loss = 0.23725152, step = 200 (155.650 sec)


I0407 18:47:05.210602 140301468497792 basic_session_run_hooks.py:247] loss = 0.23725152, step = 200 (155.650 sec)


INFO:tensorflow:global_step/sec: 0.642865


I0407 18:49:40.757028 140301468497792 basic_session_run_hooks.py:680] global_step/sec: 0.642865


INFO:tensorflow:loss = 0.0071808393, step = 300 (155.551 sec)


I0407 18:49:40.761895 140301468497792 basic_session_run_hooks.py:247] loss = 0.0071808393, step = 300 (155.551 sec)


INFO:tensorflow:global_step/sec: 0.642551


I0407 18:52:16.386736 140301468497792 basic_session_run_hooks.py:680] global_step/sec: 0.642551


INFO:tensorflow:loss = 0.058364574, step = 400 (155.628 sec)


I0407 18:52:16.389985 140301468497792 basic_session_run_hooks.py:247] loss = 0.058364574, step = 400 (155.628 sec)


INFO:tensorflow:Saving checkpoints for 500 into Predictions/model.ckpt.


I0407 18:54:50.342715 140301468497792 basic_session_run_hooks.py:594] Saving checkpoints for 500 into Predictions/model.ckpt.


INFO:tensorflow:global_step/sec: 0.605936


I0407 18:55:01.420762 140301468497792 basic_session_run_hooks.py:680] global_step/sec: 0.605936


INFO:tensorflow:loss = 0.0040011248, step = 500 (165.039 sec)


I0407 18:55:01.428872 140301468497792 basic_session_run_hooks.py:247] loss = 0.0040011248, step = 500 (165.039 sec)


INFO:tensorflow:Saving checkpoints for 559 into Predictions/model.ckpt.


I0407 18:56:31.677477 140301468497792 basic_session_run_hooks.py:594] Saving checkpoints for 559 into Predictions/model.ckpt.


INFO:tensorflow:Loss for final step: 0.0015926969.


I0407 18:56:41.641688 140301468497792 estimator.py:359] Loss for final step: 0.0015926969.


Training took time  0:15:59.639616


In [None]:
# Input function for evaluation: same feature format as training, but built
# with is_training=False.
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
# Evaluate on the test features. The returned dict contains all the metrics
# defined in metric_fn (accuracy, F1 score, AUC, precision, recall, confusion
# counts) plus the loss and global step.
estimator.evaluate(input_fn=test_input_fn, steps=None)

INFO:tensorflow:Calling model_fn.


I0407 18:56:43.635624 140301468497792 estimator.py:1111] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0407 18:56:48.432791 140301468497792 saver.py:1483] Saver not created because there are no variables in the graph to restore
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.


I0407 18:57:00.997750 140301468497792 estimator.py:1113] Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2019-04-07T18:57:01Z


I0407 18:57:01.036407 140301468497792 evaluation.py:257] Starting evaluation at 2019-04-07T18:57:01Z


INFO:tensorflow:Graph was finalized.


I0407 18:57:03.012496 140301468497792 monitored_session.py:222] Graph was finalized.


INFO:tensorflow:Restoring parameters from Predictions/model.ckpt-559


I0407 18:57:03.027288 140301468497792 saver.py:1270] Restoring parameters from Predictions/model.ckpt-559


INFO:tensorflow:Running local_init_op.


I0407 18:57:05.655165 140301468497792 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0407 18:57:05.953655 140301468497792 session_manager.py:493] Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2019-04-07-18:57:50


I0407 18:57:50.112837 140301468497792 evaluation.py:277] Finished evaluation at 2019-04-07-18:57:50


INFO:tensorflow:Saving dict for global step 559: auc = 0.86267644, eval_accuracy = 0.86244625, f1_score = 0.8627144, false_negatives = 154.0, false_positives = 198.0, global_step = 559, loss = 0.6476597, precision = 0.8481595, recall = 0.87777776, true_negatives = 1101.0, true_positives = 1106.0


I0407 18:57:50.122061 140301468497792 estimator.py:1979] Saving dict for global step 559: auc = 0.86267644, eval_accuracy = 0.86244625, f1_score = 0.8627144, false_negatives = 154.0, false_positives = 198.0, global_step = 559, loss = 0.6476597, precision = 0.8481595, recall = 0.87777776, true_negatives = 1101.0, true_positives = 1106.0


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 559: Predictions/model.ckpt-559


I0407 18:57:50.129385 140301468497792 estimator.py:2039] Saving 'checkpoint_path' summary for global step 559: Predictions/model.ckpt-559


{'auc': 0.86267644,
 'eval_accuracy': 0.86244625,
 'f1_score': 0.8627144,
 'false_negatives': 154.0,
 'false_positives': 198.0,
 'global_step': 559,
 'loss': 0.6476597,
 'precision': 0.8481595,
 'recall': 0.87777776,
 'true_negatives': 1101.0,
 'true_positives': 1106.0}

In [None]:
def getPrediction(in_sentences):
    """Runs the fine-tuned estimator on raw sentences.

    Args:
        in_sentences: sequence of sentence strings to classify.

    Returns:
        A list with one tuple per sentence:
        `(sentence, prediction['probabilities'], label)`, where `label` is
        "0" or "1". The 'probabilities' entry is whatever the model function
        stores under that key (log-softmax values in this notebook).
    """
    class_names = ["0", "1"]

    # InputExample requires a guid and a label; "" and 0 are placeholders only.
    examples = []
    for sentence in in_sentences:
        examples.append(
            run_classifier.InputExample(guid="", text_a=sentence, text_b=None, label=0))

    features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    results = []
    for sentence, prediction in zip(in_sentences, estimator.predict(predict_input_fn)):
        predicted_class = class_names[prediction['labels']]
        results.append((sentence, prediction['probabilities'], predicted_class))
    return results

In [None]:
# Add the BERT result to the comparison lists.
# The F1 score is taken from an actual evaluation run rather than a hand-copied
# constant: the previous literal (0.86913) did not match the f1_score reported
# by estimator.evaluate on the test set (0.8627144).
bert_eval_metrics = estimator.evaluate(input_fn=test_input_fn, steps=None)
models.append('BERT')
f1_scores.append(float(bert_eval_metrics['f1_score']))

**F1 scores for all the models**

In [None]:
# Summary table: one row per model with its F1 score.
results = pd.DataFrame({'Model': models, 'F1 score': f1_scores})
print(results)

                    Model  F1 score
0     Logistic Regression  0.743182
1           Decision Tree  0.668795
2           Random Forest  0.709821
3  Gradient Tree Boosting  0.631835
4          Support Vector  0.722864
5         Ensemble method  0.742664
6          Neural Network  0.734788
7                    BERT  0.869130
