# Setting up

In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` is first on the shell PATH.
# NOTE(review): consider pinning versions that match the installed TensorFlow
# release so the notebook stays reproducible.
%pip install -q tensorflow_text
%pip install -q tf-models-official

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re

tf.get_logger().setLevel('ERROR')


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# only run this cell if you are using a gpu
# Enumerate GPUs in PCI bus order and expose only device "0" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Preprocessing

In [None]:
# Read the postings, discard the job_id column, and replace every missing
# value with a single space so the text columns can be concatenated later.
df = (
    pd.read_csv('fake_job_postings.csv')
    .drop(columns='job_id')
    .fillna(' ')
)

In [None]:
# Build a single free-text field by joining the textual columns, in this
# order, with one space between consecutive columns.
text_columns = ['title', 'company_profile', 'description',
                'department', 'requirements', 'benefits']
combined_text = df[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + " " + df[column_name]
df['text'] = combined_text

In [None]:
# data cleaning
def clean_text(raw):
    """Normalize one posting's text for the classifier.

    Steps, in order:
      1. newlines, carriage returns and tabs become a space (deleting them
         outright would glue the neighbouring words together),
      2. digits are removed,
      3. the listed punctuation characters become a space,
      4. everything is lower-cased,
      5. every run of whitespace is collapsed to a single space (a single
         pass of '  ' -> ' ' would leave longer runs only partly reduced).

    Args:
        raw: the text to clean (a str).

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[\n\r\t]', ' ', raw)
    # This removes unwanted texts
    cleaned = re.sub(r'[0-9]', '', cleaned)
    cleaned = re.sub(r'[/(){}\[\]\|@,;.:-]', ' ', cleaned)
    # Converting all upper case to lower case
    cleaned = cleaned.lower()
    # Remove unnecessary white space
    return re.sub(r'\s+', ' ', cleaned)


df['text'] = df['text'].map(clean_text)

In [None]:
# Model input is the cleaned text; the target is the `fraudulent` column.
X, Y = df['text'], df['fraudulent']

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the split
# stable; the example cell further down looks up test rows by their index
# labels, so those lookups depend on this exact split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=127,
)

# BERT
The model setup below follows the TensorFlow tutorial "Classify text with BERT": https://www.tensorflow.org/text/tutorials/classify_text_with_bert#about_bert

In [None]:
# TF Hub handle of the BERT encoder used as the classifier backbone.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
# TF Hub handle of the text preprocessing model paired with that encoder.
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
# Standalone Keras layers built from the two hub handles. Note that
# build_classifier_model() constructs its own layers from the same handles
# and does not use these objects.
bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
def build_classifier_model():
  """Assemble the end-to-end classifier: raw strings in, one logit out.

  The graph is: string input -> hub preprocessing layer -> trainable hub
  encoder -> 'pooled_output' -> Dropout(0.1) -> Dense(1) with no activation.

  Returns:
    A tf.keras.Model mapping the 'text' input to the 'classifier' output.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Preprocess the raw strings into the inputs the encoder expects.
  to_encoder_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  pooled = bert_encoder(to_encoder_inputs(raw_text))['pooled_output']

  # Classification head: dropout, then a single un-activated unit (a logit).
  regularized = tf.keras.layers.Dropout(0.1)(pooled)
  logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularized)
  return tf.keras.Model(raw_text, logits)

In [None]:
# Instantiate the model defined by build_classifier_model().
classifier_model = build_classifier_model()

In [None]:
# The classifier's final Dense layer has no activation, so the model emits
# raw logits; the loss is told so via from_logits=True.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# The metrics compare predictions against a threshold. On logits, the
# probability boundary p = 0.5 is logit = 0, so the default threshold of 0.5
# would sit at a probability of about 0.62 and skew all three metrics.
logit_threshold = 0.0
metrics = [tf.metrics.BinaryAccuracy(threshold=logit_threshold),
           tf.keras.metrics.Precision(thresholds=logit_threshold),
           tf.keras.metrics.Recall(thresholds=logit_threshold)]

In [None]:
epochs = 3

# These two values must match the batch_size / validation_split arguments
# given to classifier_model.fit().
batch_size = 42
validation_split = 0.2

# fit() sets aside the validation fraction before training, so only the
# remaining samples are batched. One "step" is one batch, not one sample:
# using len(X_train) here would overstate the number of optimizer steps,
# stretch the warmup/decay schedule far past the real training length, and
# ask fit() for more steps than the data can provide.
num_train_samples = int(len(X_train) * (1 - validation_split))
steps_per_epoch = -(-num_train_samples // batch_size)  # ceiling division

num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of the optimizer steps.
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5

In [None]:
# Build the 'adamw' optimizer from the learning rate and the step counts
# computed above.
optimizer_config = dict(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)
optimizer = optimization.create_optimizer(**optimizer_config)

In [None]:
# Attach the optimizer, loss and metrics defined in the previous cells.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Train on X_train/y_train; 20% of the training data is held out for
# validation, and each epoch runs `steps_per_epoch` batches of 42 samples.
fit_options = dict(
    batch_size=42,
    epochs=epochs,
    validation_split=0.2,
    steps_per_epoch=steps_per_epoch,
)
history = classifier_model.fit(x=X_train, y=y_train, **fit_options)

Training model with https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Epoch 1/3


In [None]:
# evaluation
# evaluate() returns the loss followed by the compiled metrics in order.
# The loss value is stored as `test_loss` so it does not overwrite the `loss`
# object used by the compile cell (re-running that cell afterwards would
# otherwise pass a float as the loss).
test_loss, accuracy, precision, recall = classifier_model.evaluate(X_test, y_test)

print(f'Loss: {test_loss}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Loss: 0.07004035264253616
Accuracy: 0.9843400716781616
Precision: 0.9191176295280457
Recall: 0.7352941036224365


In [None]:
# path to save model (relative to the current working directory)
saved_model_path = 'the_model_A12'
# save model; the optimizer is excluded from the export (include_optimizer=False)
classifier_model.save(saved_model_path, include_optimizer=False)



In [None]:
# seeing examples
def print_my_examples(inputs, results):
  """Print one line per example, pairing inputs[i] with results[i][0].

  Each line shows the input left-aligned in a 30-character field followed by
  the first element of the matching result formatted with six decimals. A
  blank line is printed after the listing.

  Args:
    inputs: indexable sequence of values to show in the "input" field.
    results: indexable sequence whose i-th item is itself indexable; only its
      first element is printed.
  """
  lines = []
  for idx in range(len(inputs)):
    lines.append(f'input: {inputs[idx]:<30} : score: {results[idx][0]:.6f}')
  print('\n'.join(lines))
  print()

# examples taken from test set
# Index labels of the test rows to inspect; both lists below follow this order.
example_ids = [10312, 8573, 9971, 9257, 584]
examples = [X_test.loc[row_id] for row_id in example_ids]

#labels of examples
example_labs = [y_test.loc[row_id] for row_id in example_ids]

In [None]:
# reload model from the directory it was saved to above
reloaded_model = tf.saved_model.load(saved_model_path)

In [None]:
# using the saved model for inference
# The model returns logits; the sigmoid maps them to scores in (0, 1).
example_logits = reloaded_model(tf.constant(examples))
reloaded_results = tf.sigmoid(example_logits)

print('Results from the saved model:')
# The true labels are passed as `inputs`, so the "input:" field of each
# printed line shows the label of the example, not its text.
print_my_examples(example_labs, reloaded_results)

Results from the saved model:
input: 0                              : score: 0.000119
input: 1                              : score: 0.997122
input: 1                              : score: 0.997692
input: 1                              : score: 0.998016
input: 1                              : score: 0.985893



In [None]:
# exporting model on google colab
# Recursively zip the saved-model directory into /content/unbal_A12.zip.
!zip -r /content/unbal_A12.zip /content/the_model_A12/

  adding: content/the_model_A12/ (stored 0%)
  adding: content/the_model_A12/saved_model.pb (deflated 93%)
  adding: content/the_model_A12/fingerprint.pb (stored 0%)
  adding: content/the_model_A12/assets/ (stored 0%)
  adding: content/the_model_A12/assets/vocab.txt (deflated 53%)
  adding: content/the_model_A12/keras_metadata.pb (deflated 85%)
  adding: content/the_model_A12/variables/ (stored 0%)
  adding: content/the_model_A12/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: content/the_model_A12/variables/variables.index (deflated 80%)


In [None]:
# only need this for exporting the model on google colab
# `files` is also used by the next cell, so this cell must run first.
from google.colab import files
# Download the zip archive created in the previous cell.
files.download("/content/unbal_A12.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# save a single file: download only the variables data file of the saved model
# (relies on `files` imported from google.colab in the previous cell)
files.download('/content/the_model_A12/variables/variables.data-00000-of-00001') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# saving to my drive for faster download
# NOTE(review): this assumes Google Drive is already mounted at
# /content/gdrive; no mount call is visible in this notebook — confirm.
!cp -r '/content/the_model_A12/variables/variables.data-00000-of-00001' /content/gdrive/MyDrive/