In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tfx.components import CsvExampleGen, StatisticsGen, SchemaGen, ExampleValidator, Transform, Trainer, Tuner
from tfx.proto import example_gen_pb2
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from utils import clean_text, normalize_text

In [None]:
# Directory containing the raw `fake_job_postings.csv` that the notebook loads.
DATA_ROOT = "dataset"
# Second data directory name — presumably meant for the cleaned data; confirm.
DATA_CLEAN = "data"

# Prepare data

In [ ]:
dataset = pd.read_csv(DATA_ROOT + "/fake_job_postings.csv")

In [ ]:
dataset.head()

In [ ]:
dataset.info()

In [ ]:
dataset.dropna(inplace=True)

In [ ]:
dataset = dataset.drop(columns=['job_id', 'telecommuting', 'has_questions', 'has_company_logo'])

In [ ]:
df = dataset

In [ ]:
df.head()

# Information about data

In [ ]:
# Number of postings per label value (1 is counted as fraudulent, 0 as non fraudulent).
is_fraud_row = df['fraudulent'] == 1
is_legit_row = df['fraudulent'] == 0
fraudulent = df.loc[is_fraud_row, 'fraudulent'].count()
non_fraudulent = df.loc[is_legit_row, 'fraudulent'].count()

In [ ]:
# Bar chart of the two class counts; each bar is annotated with its value.
class_labels = ['fraudulent', 'non fraudulent']
class_counts = [fraudulent, non_fraudulent]

fig = plt.figure(figsize=(10, 5))
plt.bar(class_labels, class_counts, color=['#d81159', '#00a6fb'])

for bar_index, bar_height in enumerate(class_counts):
    plt.text(bar_index, bar_height, bar_height, ha='center', va='bottom', fontsize=12)

plt.title("Information about fraudulent and non fraudulent jobs")
plt.xlabel("Type of job")
plt.ylabel("Number of fraudulent and non fraudulent jobs")
plt.show()

.
___

In [ ]:
employment_type = df.groupby(['employment_type'])['employment_type'].count()
employment_type_key = list(employment_type.keys())
employment_type_value = list(employment_type.values)

In [ ]:
fig = plt.figure(figsize=(10, 5))

plt.bar(employment_type_key, employment_type_value, color="#00a6fb")

for i in range(len(employment_type_key)):
    plt.text(i, employment_type_value[i], employment_type_value[i], ha='center', va='bottom', fontsize=12)

plt.title("Information about employment type")
plt.xlabel("Employment type")
plt.ylabel("Number of employment type")
plt.show()

In [ ]:
.
___

In [ ]:
required_experience = df.groupby(['required_experience'])['required_experience'].count()
required_experience_key = list(required_experience.keys())
required_experience_value = list(required_experience.values)

In [ ]:
fig = plt.figure(figsize=(10, 5))

plt.bar(required_experience_key, required_experience_value, color="#00a6fb")

for i in range(len(required_experience_key)):
    plt.text(i, required_experience_value[i], required_experience_value[i], ha='center', va='bottom', fontsize=12)

plt.title("Information about required experience")
plt.xlabel("Required experience")
plt.ylabel("Number of required experience")
plt.show()

.
___

In [ ]:
required_education = df.groupby(['required_education'])['required_education'].count()
required_education_key = list(required_education.keys())
required_education_value = list(required_education.values)

In [ ]:
fig = plt.figure(figsize=(20, 5))

plt.bar(required_education_key, required_education_value, color="#00a6fb")

for i in range(len(required_education_key)):
    plt.text(i, required_education_value[i], required_education_value[i], ha='center', va='bottom', fontsize=12)

plt.title("Information about required education")
plt.xlabel("Required education")
plt.ylabel("Number of required education")
plt.show()

.
___

In [ ]:
function = df.groupby(['function'])['function'].count()
function

.
___

In [ ]:
industry = df.groupby(['industry'])['industry'].count()
industry

# Data Cleansing

In [ ]:
df.isnull().sum()

In [ ]:
df.fillna(" ", inplace=True)
df.isna().sum()

In [ ]:
df['full_description'] = df[
    ['title', 'location', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits',
     'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'department']].apply(
    lambda x: ' '.join(x), axis=1)

df.drop(columns=['title',
                 'location',
                 'salary_range',
                 'company_profile',
                 'description',
                 'requirements',
                 'benefits',
                 'employment_type',
                 'required_experience',
                 'required_education',
                 'industry',
                 'function',
                 'department'], inplace=True)

In [ ]:
df.head()

In [ ]:
df['full_description'] = df['full_description'].apply(clean_text)

In [ ]:
df['full_description'] = df['full_description'].apply(normalize_text)

In [ ]:
df.head()

In [ ]:
df.to_csv(os.path.join(DATA_ROOT, "fake_job_postings_clean.csv"), index=False)

# Prepare pipeline

In [0]:
PIPELINE_NAME = "real-or-fake-jobs-pipeline"
SCHEMA_PIPELINE_NAME = "real-or-fake-jobs-tfdv-schema"

PIPELINE_ROOT = os.path.join('celvineadiputra-pipelines', PIPELINE_NAME)

METADATA_PATH = os.path.join('metadata', PIPELINE_NAME, 'metadata.db')

SERVING_MODEL_DIR = os.path.join('serving_model', PIPELINE_NAME)

In [None]:
interactive_context = InteractiveContext(pipeline_root=PIPELINE_ROOT)

## Data Ingestion
##### ExampleGen

In [ ]:
output = example_gen_pb2.Output(
    split_config=example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=8),
        example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=2)
    ])
)
example_gen = CsvExampleGen(input_base=DATA_ROOT, output_config=output)

In [ ]:
interactive_context.run(example_gen)

## Data Validation
##### StatisticsGen

In [ ]:
statistics_gen = StatisticsGen(
    examples=example_gen.outputs["examples"]
)

interactive_context.run(statistics_gen)

In [ ]:
interactive_context.show(statistics_gen.outputs["statistics"])

## Data Schema
##### SchemaGen

In [ ]:
schema_gen = SchemaGen(statistics=statistics_gen.outputs["statistics"])
interactive_context.run(schema_gen)

In [ ]:
interactive_context.show(schema_gen.outputs["schema"])

## Example Validator

In [ ]:
example_validator = ExampleValidator(
    statistics=statistics_gen.outputs['statistics'],
    schema=schema_gen.outputs['schema']
)
interactive_context.run(example_validator)

In [ ]:
interactive_context.show(example_validator.outputs['anomalies'])

# Data Preprocessing
##### Transform

In [ ]:
# File name of the Transform module; the next cell writes it and the Transform
# component loads it by absolute path.
TRANSFORM_MODULE_FILE = "real_or_fake_job_transform.py"

In [ ]:
%%writefile {TRANSFORM_MODULE_FILE}

import tensorflow as tf

# Raw input column names read by preprocessing_fn.
LABEL_KEY = "fraudulent"
FEATURE_KEY = "full_description"


def transformed_name(key):
    """Return `key` with the "_xf" suffix that marks a transformed feature."""
    suffix = "_xf"
    return key + suffix


def preprocessing_fn(inputs):
    """Lower-case the text feature and cast the label to int64.

    Args:
        inputs: mapping from raw feature name to tensor; FEATURE_KEY and
            LABEL_KEY are read from it.

    Returns:
        Dict holding the two results under their "_xf"-suffixed names.
    """
    lowered_text = tf.strings.lower(inputs[FEATURE_KEY])
    int_label = tf.cast(inputs[LABEL_KEY], tf.int64)

    return {
        transformed_name(FEATURE_KEY): lowered_text,
        transformed_name(LABEL_KEY): int_label,
    }

In [ ]:
# Run the Transform component with the module written by the previous cell.
transform_module_path = os.path.abspath(TRANSFORM_MODULE_FILE)

transform = Transform(
    module_file=transform_module_path,
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
)
interactive_context.run(transform)

# Model Development

In [ ]:
# File name of the Trainer module; the next cell writes it and the Trainer
# component loads it by absolute path.
TRAINER_MODULE_FILE = "real_or_fake_job_trainer.py"

In [ ]:
%%writefile {TRAINER_MODULE_FILE}
import tensorflow as tf
import tensorflow_transform as tft
from tensorflow.keras import layers
import os
import tensorflow_hub as hub
from tfx.components.trainer.fn_args_utils import FnArgs

# Raw (pre-transform) column names; transformed_name() appends "_xf" to them.
LABEL_KEY = "fraudulent"
FEATURE_KEY = "full_description"


def transformed_name(key):
    """Return `key` with the "_xf" suffix that marks a transformed feature."""
    suffix = "_xf"
    return key + suffix


def gzip_reader_fn(filenames):
    """Open `filenames` as a GZIP-compressed TFRecord dataset."""
    records = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
    return records


def input_fn(file_pattern,
             tf_transform_output,
             num_epochs,
             batch_size=64) -> tf.data.Dataset:
    """Build a batched (features, label) dataset from transformed examples.

    Args:
        file_pattern: file pattern(s) of the GZIP TFRecord files to read.
        tf_transform_output: object providing `transformed_feature_spec()`.
        num_epochs: number of passes over the data the dataset yields.
        batch_size: number of examples per batch.

    Returns:
        The dataset created by `make_batched_features_dataset`, with the
        transformed label split off as the label.
    """
    # Work on a copy of the post-transform feature spec.
    feature_spec = tf_transform_output.transformed_feature_spec().copy()
    label_name = transformed_name(LABEL_KEY)

    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=feature_spec,
        reader=gzip_reader_fn,
        num_epochs=num_epochs,
        label_key=label_name)


# Cache directory used by tensorflow_hub when a hub module is loaded.
# NOTE(review): the name looks like a misspelling of "hub_cache" and points at
# the filesystem root — confirm the intended path.
os.environ['TFHUB_CACHE_DIR'] = '/hub_chace'

# The Universal Sentence Encoder layer is deliberately not loaded here: nothing
# in this module used it, and loading it at import time forced a model download
# before training could start.

# Vocabulary size shared by the vectorizer and the embedding layer.
VOCAB_SIZE = 10000
# Every text is padded or truncated to this many tokens.
SEQUENCE_LENGTH = 100

# Text-to-token-id layer; its vocabulary is fitted in run_fn via adapt().
vectorize_layer = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH)

# Size of each token embedding vector.
embedding_dim = 16


def model_builder():
    """Build and compile the binary text classifier.

    The model takes the transformed text feature as a string input, turns it
    into token ids with the module-level `vectorize_layer`, embeds and
    average-pools the tokens, and ends in a single sigmoid unit.

    Returns:
        The compiled `tf.keras.Model`.
    """
    text_input = tf.keras.Input(shape=(1,), name=transformed_name(FEATURE_KEY), dtype=tf.string)

    # Flatten the (batch, 1) strings to (batch,) before vectorizing.
    hidden = vectorize_layer(tf.reshape(text_input, [-1]))
    hidden = layers.Embedding(VOCAB_SIZE, embedding_dim, name="embedding")(hidden)
    hidden = layers.GlobalAveragePooling1D()(hidden)
    for units in (64, 32):
        hidden = layers.Dense(units, activation='relu')(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.Model(inputs=text_input, outputs=probability)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.01),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )

    model.summary()
    return model


def _get_serve_tf_examples_fn(model, tf_transform_output):
    """Create the serving function for serialized tf.Example input.

    Args:
        model: the Keras model to call; it also gets a `tft_layer` attribute.
        tf_transform_output: object providing the transform features layer and
            the raw feature spec.

    Returns:
        A `tf.function` that parses serialized examples, applies the transform
        layer and returns the model's output.
    """
    # Stored on the model and used inside the serving function below.
    model.tft_layer = tf_transform_output.transform_features_layer()

    @tf.function
    def serve_tf_examples_fn(serialized_tf_examples):
        raw_spec = tf_transform_output.raw_feature_spec()
        # The label is removed from the parsing spec, so it is not parsed
        # from incoming examples.
        raw_spec.pop(LABEL_KEY)

        parsed = tf.io.parse_example(serialized_tf_examples, raw_spec)

        # Transform the parsed raw features, then predict on the result.
        return model(model.tft_layer(parsed))

    return serve_tf_examples_fn


def run_fn(fn_args: FnArgs) -> None:
    """Train the classifier and export it with a serving signature.

    Args:
        fn_args: Trainer arguments; this function reads `serving_model_dir`,
            `transform_graph_path`, `train_files` and `eval_files`.

    Returns:
        None. The trained model is saved to `fn_args.serving_model_dir`.
    """
    # TensorBoard logs go in a "logs" directory next to the serving model dir.
    log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, update_freq='batch'
    )

    es = tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', mode='max', verbose=1, patience=10)
    # NOTE(review): the checkpoint is written to the same directory that
    # model.save() overwrites at the end of this function — confirm intended.
    mc = tf.keras.callbacks.ModelCheckpoint(fn_args.serving_model_dir, monitor='val_binary_accuracy', mode='max',
                                            verbose=1, save_best_only=True)

    # Load the transform output
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)

    # Create batches of data
    train_set = input_fn(fn_args.train_files, tf_transform_output, 10)
    val_set = input_fn(fn_args.eval_files, tf_transform_output, 10)

    # Fit the vocabulary on every training text. A single pass is enough, and
    # streaming the dataset avoids holding all repeated epochs in memory.
    # Each (batch, 1) string tensor is flattened to (batch,) for the layer.
    text_key = transformed_name(FEATURE_KEY)
    adapt_set = input_fn(fn_args.train_files, tf_transform_output, 1).map(
        lambda features, _label: tf.reshape(features[text_key], [-1]))
    vectorize_layer.adapt(adapt_set)

    # Build the model
    model = model_builder()

    # Train the model
    # NOTE(review): 10 epochs x 1000 steps may ask for more batches than the
    # 10-epoch datasets above contain — confirm against the dataset size.
    model.fit(x=train_set,
              validation_data=val_set,
              callbacks=[tensorboard_callback, es, mc],
              steps_per_epoch=1000,
              validation_steps=1000,
              epochs=10)

    # Export with a signature that accepts a batch of serialized tf.Examples.
    signatures = {
        'serving_default':
            _get_serve_tf_examples_fn(model, tf_transform_output).get_concrete_function(
                tf.TensorSpec(
                    shape=[None],
                    dtype=tf.string,
                    name='examples'))
    }
    model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)

##### Trainer

In [ ]:
from tfx.proto import trainer_pb2

# Train on the transformed "train" split and evaluate on the "eval" split,
# using the module written by the previous cell.
trainer_module_path = os.path.abspath(TRAINER_MODULE_FILE)
train_args = trainer_pb2.TrainArgs(splits=['train'])
eval_args = trainer_pb2.EvalArgs(splits=['eval'])

trainer = Trainer(
    module_file=trainer_module_path,
    examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    schema=schema_gen.outputs['schema'],
    train_args=train_args,
    eval_args=eval_args,
)
interactive_context.run(trainer)


##### Resolver

In [ ]:
from tfx.dsl.components.common.resolver import Resolver
from tfx.dsl.input_resolution.strategies.latest_blessed_model_strategy import LatestBlessedModelStrategy
from tfx.types import Channel
from tfx.types.standard_artifacts import Model, ModelBlessing

# Resolve model / blessing artifacts with the latest-blessed-model strategy;
# the result is later passed to the Evaluator as its baseline model.
resolver_channels = {
    'model': Channel(type=Model),
    'model_blessing': Channel(type=ModelBlessing),
}
model_resolver = Resolver(strategy_class=LatestBlessedModelStrategy, **resolver_channels)
model_resolver = model_resolver.with_id('Latest_blessed_model_resolver')

interactive_context.run(model_resolver)

##### Evaluator

In [ ]:
import tensorflow_model_analysis as tfma

# BinaryAccuracy carries two thresholds: an absolute lower bound and a minimum
# improvement over the baseline model.
accuracy_threshold = tfma.MetricThreshold(
    value_threshold=tfma.GenericValueThreshold(
        lower_bound={'value': 0.5}),
    change_threshold=tfma.GenericChangeThreshold(
        direction=tfma.MetricDirection.HIGHER_IS_BETTER,
        absolute={'value': 0.0001})
)

eval_config = tfma.EvalConfig(
    # The label column of this dataset is "fraudulent"; the Evaluator reads
    # the raw ExampleGen examples, so the raw column name is used.
    model_specs=[tfma.ModelSpec(label_key='fraudulent')],
    # An empty SlicingSpec: metrics over the whole dataset.
    slicing_specs=[tfma.SlicingSpec()],
    metrics_specs=[
        tfma.MetricsSpec(metrics=[
            tfma.MetricConfig(class_name='ExampleCount'),
            tfma.MetricConfig(class_name='AUC'),
            tfma.MetricConfig(class_name='FalsePositives'),
            tfma.MetricConfig(class_name='TruePositives'),
            tfma.MetricConfig(class_name='FalseNegatives'),
            tfma.MetricConfig(class_name='TrueNegatives'),
            tfma.MetricConfig(class_name='BinaryAccuracy',
                              threshold=accuracy_threshold)
        ])
    ]
)

In [ ]:
from tfx.components import Evaluator

# Evaluate the trained model on the ExampleGen output, with the resolver's
# model as the baseline.
evaluator_inputs = {
    'examples': example_gen.outputs['examples'],
    'model': trainer.outputs['model'],
    'baseline_model': model_resolver.outputs['model'],
}
evaluator = Evaluator(eval_config=eval_config, **evaluator_inputs)

interactive_context.run(evaluator)

In [ ]:
# Visualize the evaluation results
eval_result = evaluator.outputs['evaluation'].get()[0].uri
tfma_result = tfma.load_eval_result(eval_result)
tfma.view.render_slicing_metrics(tfma_result)
tfma.addons.fairness.view.widget_view.render_fairness_indicator(
    tfma_result
)

##### Pusher

In [ ]:
from tfx.components import Pusher
from tfx.proto import pusher_pb2

# Filesystem destination for the pushed model.
# NOTE(review): this literal path differs from the SERVING_MODEL_DIR constant
# defined in the pipeline configuration cell — confirm which one is intended.
push_destination = pusher_pb2.PushDestination(
    filesystem=pusher_pb2.PushDestination.Filesystem(
        base_directory='serving_model_dir/real-or-fake-jobs-detection-model'))

pusher = Pusher(
    model=trainer.outputs['model'],
    model_blessing=evaluator.outputs['blessing'],
    push_destination=push_destination,
)

interactive_context.run(pusher)