In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub

!pip install -q tf-models-official
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

!pip install tensorflow_text
import tensorflow_text as text

In [8]:
import pandas as pd

# Load the training table from the mounted Drive folder; the later cells read
# its `label` and `original_text` columns.
df = pd.read_csv('drive/MyDrive/Milestone2/WikiLarge_Train.csv')

# Input-pipeline settings shared by the dataset-building cells.
AUTOTUNE, batch_size, seed = tf.data.AUTOTUNE, 32, 42

# train_label = df['label']
# train_ds = tf.data.Dataset.from_tensor_slices((df.original_text.values, train_label.values))
# train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [9]:
# df = pd.read_csv('drive/MyDrive/Milestone2/WikiLarge_Train.csv')
# Split the training table by class label: 1 -> pos_df, 0 -> neg_df.
pos_df = df.loc[df['label'] == 1]
neg_df = df.loc[df['label'] == 0]

In [10]:
# Display both class sizes side by side to check the class balance.
(pos_df.shape, neg_df.shape)

((208384, 2), (208384, 2))

In [11]:
from tqdm import tqdm
import os

# Write every positive (label == 1) sentence to its own file under
# wiki_train/pos/ so a directory-based text dataset loader can pick it up.
# exist_ok=True keeps the cell re-runnable once the folder already exists.
os.makedirs('wiki_train/pos', exist_ok=True)

# Iterate the column directly instead of doing a positional .iloc lookup per
# row. The loop variable is called `sentence` so it does not rebind `text`,
# which the first cell uses as the alias for the tensorflow_text module.
for i, sentence in enumerate(tqdm(pos_df.original_text, total=len(pos_df))):
  # Explicit UTF-8 so non-ASCII characters are written the same way regardless
  # of the platform's default locale encoding. The `with` block closes the
  # file, so no manual close() is needed.
  with open(f'wiki_train/pos/{i}.txt', 'w', encoding='utf-8') as f:
    f.write(sentence)

100%|██████████| 208384/208384 [00:15<00:00, 13195.29it/s]


In [12]:
# Write every negative (label == 0) sentence to its own file under
# wiki_train/neg/ so a directory-based text dataset loader can pick it up.
# exist_ok=True keeps the cell re-runnable once the folder already exists.
os.makedirs('wiki_train/neg', exist_ok=True)

# Iterate the column directly instead of doing a positional .iloc lookup per
# row. The loop variable is called `sentence` so it does not rebind `text`,
# which the first cell uses as the alias for the tensorflow_text module.
for i, sentence in enumerate(tqdm(neg_df.original_text, total=len(neg_df))):
  # Explicit UTF-8 so non-ASCII characters are written the same way regardless
  # of the platform's default locale encoding. The `with` block closes the
  # file, so no manual close() is needed.
  with open(f'wiki_train/neg/{i}_.txt', 'w', encoding='utf-8') as f:
    f.write(sentence)

100%|██████████| 208384/208384 [00:15<00:00, 13187.77it/s]


In [None]:
# os.rmdir('wiki_train/.ipynb_checkpoints')

In [15]:
# Pipeline settings, repeated here so this cell can run on its own.
seed = 42
batch_size = 32
AUTOTUNE = tf.data.AUTOTUNE

# Build a batched, labelled text dataset from the files under wiki_train/.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='wiki_train/', batch_size=batch_size, seed=seed)

class_names = raw_train_ds.class_names

# Cache the loaded batches and let tf.data pick the prefetch depth.
train_ds = raw_train_ds.cache()
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

Found 416768 files belonging to 2 classes.


In [None]:
# TF Hub handles for the two small-BERT encoders (8-layer and 4-layer) and the
# preprocessing model paired with each of them.
parameters = dict(
    BERT_L8="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1",
    Preprocess_L8="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    BERT_L4="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
    Preprocess_L4="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
)

# The 8-layer variant is the one used for the rest of the notebook.
tfhub_handle_preprocess, tfhub_handle_encoder = (
    parameters['Preprocess_L8'],
    parameters['BERT_L8'],
)

In [None]:
# Wrap the selected preprocessing and encoder handles as stand-alone Keras
# layers (created in the same order as before: preprocessing, then encoder).
bert_preprocess_model, bert_model = (
    hub.KerasLayer(tfhub_handle_preprocess),
    hub.KerasLayer(tfhub_handle_encoder),
)

In [None]:
# The classifier head emits a raw logit (Dense(1, activation=None)), so the
# loss is told to apply the sigmoid itself.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# BinaryAccuracy compares predictions against `threshold` (default 0.5).
# Because the model outputs logits rather than probabilities, the decision
# boundary that matches sigmoid(logit) > 0.5 -- the rule used at inference
# time -- is logit > 0.0. Leaving the default would report accuracy for a
# stricter cut-off than the one actually used for predictions.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

epochs = 5
# Number of batches in one pass over train_ds.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
# AdamW optimizer from tf-models-official, configured with the step counts above.
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
def build_classifier_model():
  """Assemble the text classifier on top of the selected TF Hub BERT encoder.

  The model takes a batch of raw strings (input named 'text'), runs them
  through the hub preprocessing layer and the trainable encoder, and feeds
  the encoder's 'pooled_output' through Dropout(0.5) into a single-unit Dense
  layer without activation.

  Returns:
    A tf.keras.Model mapping the string input to one un-activated score per
    example.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Raw strings -> encoder input dict.
  tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)

  # trainable=True so the encoder weights are fine-tuned with the head.
  bert_outputs = hub.KerasLayer(
      tfhub_handle_encoder, trainable=True, name='BERT_encoder')(tokenized)

  pooled = bert_outputs['pooled_output']
  regularised = tf.keras.layers.Dropout(0.5)(pooled)
  logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularised)
  return tf.keras.Model(raw_text, logits)

# Build the network, then attach the optimizer, loss and metric objects
# created in the earlier cells.
classifier_model = build_classifier_model()
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Fine-tune on the whole training dataset for `epochs` passes; the object
# returned by fit() is kept in `history`.
history = classifier_model.fit(train_ds, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Export the fine-tuned model to Drive. Optimizer state is left out of the
# export; the notebook only reloads this copy to run predictions.
saved_model_path = 'drive/MyDrive/Milestone2/wiki_difficulty_classification_bert_v2'
classifier_model.save(filepath=saved_model_path, include_optimizer=False)



In [17]:
import numpy as np  # needed for np.vstack below; not imported anywhere else in the notebook
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
# NOTE(review): the exported model contains the BERT preprocessing layer, which
# presumably needs the tensorflow_text ops registered before loading. The first
# cell already imports tensorflow_text; uncomment the lines below when running
# this cell in a fresh kernel.
# !pip install tensorflow_text
# import tensorflow_text as text


saved_model_path='drive/MyDrive/Milestone2/wiki_difficulty_classification_bert_v2'
reloaded_model = tf.saved_model.load(saved_model_path)

test = pd.read_csv('drive/MyDrive/Milestone2/WikiLarge_Test.csv')
test_ls = test['original_text'].tolist()

# Score the test sentences in chunks of 1000. The chunk boundaries are derived
# from the actual list length, so every row is scored exactly once whatever
# the size of the test file (the earlier hard-coded 120 chunks only worked for
# files with 119001-120000 rows).
chunk_size = 1000
results_all = []
for start in tqdm(range(0, len(test_ls), chunk_size)):
  chunk = tf.constant(test_ls[start:start + chunk_size])
  # The model returns one logit per sentence; sigmoid maps it to a probability.
  results_all.append(tf.sigmoid(reloaded_model(chunk)).numpy())

# Stack the per-chunk (rows, 1) arrays, then threshold at 0.5 and flatten to a
# 1-D vector of 0/1 labels.
results_all2 = np.vstack(results_all)
results_all3 = (results_all2 > 0.5).astype(int).ravel()

test['label'] = results_all3
# Use the `columns=` keyword: passing the axis positionally (drop(name, 1)) is
# rejected by pandas 2.x.
submit = test.drop(columns='original_text')
submit.to_csv('BERT_submission_v2.csv', index=False)

In [20]:
# Run the reloaded model over the first 1000 training sentences and keep the
# sigmoid-activated scores as the sole entry of a fresh results list.
df_raw = df['original_text'].tolist()

temp = tf.sigmoid(reloaded_model(tf.constant(df_raw[:1000])))
results_all = [temp]

In [None]:
import pandas as pd

# Attach the predicted labels to the test table and write every column except
# the raw sentence text to the submission CSV.
test['label'] = results_all3
# Use the `columns=` keyword: passing the axis positionally (drop(name, 1)) is
# rejected by pandas 2.x.
submit = test.drop(columns='original_text')
submit.to_csv('BERT_submission_v2.csv', index=False)