In [None]:
# Mount Google Drive at /content/gdrive (Colab-only helper); the dataset
# directory used by the loading cell lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Load the dataset

In [None]:
import pandas as pd
import os
import numpy as np


# Folder on the mounted Drive that holds the pre-split CSV files.
dataset_dir = '/content/gdrive/MyDrive/Essentials in Text and Speech Processing'

# Read the four splits in a fixed order and bind them to their usual names.
x_train, x_test, y_train, y_test = (
    pd.read_csv(os.path.join(dataset_dir, f'{split_name}.csv'))
    for split_name in ('x_train', 'x_test', 'y_train', 'y_test')
)


## Preprocess: handle missing values

In [None]:
# Count missing values per column in both feature frames (train and test).
missing_values_train = x_train.isna().sum()
missing_values_test = x_test.isna().sum()

# Print only the columns that actually contain missing entries.
for frame_name, null_counts in (("x_train", missing_values_train), ("x_test", missing_values_test)):
    print(f"Missing values in {frame_name}:\n{null_counts[null_counts > 0]}")

Missing values in x_train:
requirements    3
dtype: int64
Missing values in x_test:
description     1
requirements    3
dtype: int64


In [None]:
# Replace missing text with empty strings in the columns that the
# missing-value report flagged: 'requirements' in both frames, plus
# 'description' in the test frame.
for column in ('requirements',):
    x_train[column] = x_train[column].fillna('')
for column in ('requirements', 'description'):
    x_test[column] = x_test[column].fillna('')

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention, Input, GlobalAveragePooling1D
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report



## Concatenate relevant text columns into a single input

In [None]:

# Combine the relevant text columns into one string per job posting.
#
# Plain `col_a + " " + col_b` raises UFuncTypeError as soon as one of the
# columns is not string-typed (e.g. a float64 column), and silently yields NaN
# for any row where a single field is missing. Every column is therefore
# coerced to text first, with missing values mapped to ''.
text_columns = ['title', 'company_profile', 'description', 'requirements']


def _combine_text_columns(frame):
    """Return a Series joining `text_columns` of `frame` with single spaces.

    Missing values become empty strings and non-string values are converted
    with `str`, so the result never contains NaN.
    """
    cleaned = frame[text_columns].astype(object)
    cleaned = cleaned.where(cleaned.notna(), '').astype(str)

    combined = cleaned[text_columns[0]]
    for column in text_columns[1:]:
        combined = combined + " " + cleaned[column]
    return combined


x_train_text = _combine_text_columns(x_train)
x_test_text = _combine_text_columns(x_test)


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('float64'), dtype('<U1')) -> None

In [None]:
# Flatten the target labels to 1-D NumPy arrays.
# np.asarray accepts both the freshly loaded DataFrame and an already
# flattened ndarray, so re-running this cell is safe (the previous
# `.values.ravel()` raised AttributeError on the second run).
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()


## Tokenization and padding

In [None]:
# Every sequence is padded/truncated to this many tokens.
max_sequence_length = 300

# Build the vocabulary from the training text only; words outside the
# 5000-word limit are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(x_train_text)

# Convert each split to integer sequences ...
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train_text, x_test_text)
)

# ... and pad them at the end so all rows share the same length.
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    for sequences in (x_train_seq, x_test_seq)
)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by duplicating real minority-class rows.
# SMOTE is not appropriate here: it creates synthetic rows by interpolating
# between neighbours, and interpolating between rows of token ids produces
# arbitrary vocabulary indices, i.e. word sequences that never occurred.
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
x_train_res, y_train_res = oversampler.fit_resample(x_train_pad, y_train)

## Add an attention layer to the LSTM model

In [None]:
# Define the LSTM model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention, Input, GlobalAveragePooling1D

In [None]:
# Hyper-parameters of the embedding: vocabulary size and vector width.
vocab_size = 5000
embedding_dim = 64

# Integer token ids in, dense word vectors out.
input_seq = Input(dtype='int32', shape=(max_sequence_length,))
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
)(input_seq)

# The LSTM keeps its per-timestep outputs so the attention layer can use the
# same sequence as both query and value (self-attention).
lstm_layer = LSTM(units=128, return_sequences=True)(embedding_layer)
attention = Attention()([lstm_layer, lstm_layer])

# Average over the time axis to get one fixed-size vector per sample.
attention_output = GlobalAveragePooling1D()(attention)

In [None]:
# Single sigmoid unit: the model outputs one probability per sample.
output = Dense(units=1, activation='sigmoid')(attention_output)
model = Model(inputs=input_seq, outputs=output)

# Binary cross-entropy with Adam; accuracy is tracked during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()

## Visualization and evaluation



In [None]:

# Train on the resampled training set for 5 epochs.
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an independent estimate — consider a separate
# hold-out split.
history = model.fit(
    x=x_train_res,
    y=y_train_res,
    batch_size=32,
    epochs=5,
    validation_data=(x_test_pad, y_test),
)


In [None]:
# Predicted probabilities for the test data
y_pred_prob = model.predict(x_test_pad)

# One output column -> binary problem: round the probability to 0/1.
# Several output columns -> multi-class: take the most probable class.
single_output = y_pred_prob.shape[1] == 1
y_pred = (
    np.round(y_pred_prob).astype(int).flatten()
    if single_output
    else np.argmax(y_pred_prob, axis=1)
)

# Accept both one-hot encoded and plain label arrays as ground truth.
one_hot_labels = len(y_test.shape) > 1 and y_test.shape[1] > 1
y_true = np.argmax(y_test, axis=1) if one_hot_labels else y_test.flatten()

# Overall metrics; F1/recall/precision are support-weighted across classes.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, **weighted)
recall = recall_score(y_true, y_pred, **weighted)
precision = precision_score(y_true, y_pred, **weighted)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

# Per-class breakdown
report = classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'])
print("\nClassification Report:\n", report)

In [None]:
import matplotlib.pyplot as plt


# Plot training & validation curves: one figure for accuracy, one for loss.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'{metric_label} over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress tracking



# Define the function to plot attention weights for a given input sentence
def plot_attention(model, tokenizer, input_text, max_len=300):
    """Plot a per-position attention profile of `model` for one input text.

    Args:
        model: Keras functional model that contains an `Attention` layer and
            whose output is a single probability per sample.
        tokenizer: Fitted Keras `Tokenizer` (must provide
            `texts_to_sequences`; `index_word` is used for the axis labels).
        input_text: Raw text to run through the model.
        max_len: Length the token sequence is padded/truncated to; it has to
            match the sequence length the model was built with.

    Raises:
        ValueError: If `model` has no `Attention` layer.
    """
    # Tokenize and pad at the END, like the training data. The default
    # padding ('pre') would feed the model differently shaped input and
    # shift every word relative to the plotted positions.
    token_ids = tokenizer.texts_to_sequences([input_text])
    input_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=max_len, padding='post'
    )

    # Look the layer up by type instead of by auto-generated name: Keras
    # names it 'attention', 'attention_1', ... depending on how often the
    # model cell was executed in the current session.
    attention_layer = next(
        (layer for layer in model.layers if isinstance(layer, tf.keras.layers.Attention)),
        None,
    )
    if attention_layer is None:
        raise ValueError("model does not contain an Attention layer")

    # Run the model once, returning the attention layer output and the prediction
    intermediate_model = tf.keras.Model(
        inputs=model.input,
        outputs=[attention_layer.output, model.output],
    )
    attention_output, prediction = intermediate_model.predict(input_sequence, verbose=0)

    # Average over the feature axis to get one value per sequence position.
    # NOTE(review): this is the layer's attended output, not the softmax
    # score matrix (Keras only returns scores when the layer is called with
    # return_attention_scores=True), so treat the bars as a proxy.
    attention_weights = np.mean(attention_output, axis=2).flatten()

    # Label each position with the token the model actually received, so
    # labels and bars are aligned even after filtering/truncation. Padding
    # (id 0) gets an empty label; out-of-vocabulary words show the OOV token.
    index_word = getattr(tokenizer, 'index_word', {})
    words = [index_word.get(int(token_id), '') for token_id in input_sequence[0]]

    # Reduce the (1, 1) prediction array to a plain float before comparing.
    fraud_probability = float(np.ravel(prediction)[0])

    # Plot attention weights
    plt.figure(figsize=(15, 5))
    plt.bar(range(len(words)), attention_weights, color="skyblue")
    plt.xticks(range(len(words)), words, rotation=90)
    plt.xlabel('Words')
    plt.ylabel('Attention Weight')
    plt.title(f'Attention Weights for Prediction: {"Fraud" if fraud_probability > 0.5 else "Non-Fraud"}')
    plt.show()

# Iterate through each sample in x_test and plot the attention weights.
# Missing values are detected with pd.isna (an `is not np.nan` identity check
# misses float NaN objects) and every field is converted with str(), so a
# non-string column cannot break the concatenation.
attention_text_columns = ['title', 'company_profile', 'description', 'requirements']

for i in tqdm(range(len(x_test))):  # Progress bar for tracking
    row = x_test.iloc[i]  # fetch the row once instead of once per field
    example_text = " ".join(
        "" if pd.isna(row[column]) else str(row[column])
        for column in attention_text_columns
    )

    # Plot the attention for the current sample
    plot_attention(model, tokenizer, example_text)


In [None]:
import shap


def predict_proba(padded_sequences):
    """Return the model's fraud probability as a 1-D array.

    Args:
        padded_sequences: 2-D array-like of padded token ids, one row per sample.

    Returns:
        np.ndarray of shape (n_samples,) with the sigmoid output of `model`.
    """
    # Cast defensively in case the explainer hands back a non-integer array;
    # the embedding layer expects integer token ids.
    batch = np.asarray(padded_sequences).astype('int32')
    return model.predict(batch, verbose=0).reshape(-1)


# Define the SHAP explainer, using the first 100 training rows as background.
explainer = shap.KernelExplainer(predict_proba, x_train_pad[:100])

# SHAP visualization for one specific text
shap.initjs()
example_text = "Immediate hire work from home job with high salary and no experience needed"
example_seq = tokenizer.texts_to_sequences([example_text])
# Pad at the end, exactly like the training data.
example_pad = pad_sequences(example_seq, maxlen=max_sequence_length, padding='post')

# Explain the example itself, so the SHAP values and the labels shown in the
# plot refer to the same input.
shap_values = explainer.shap_values(example_pad)

# One label per sequence position: the token the model received, or '<PAD>'
# for padding (id 0). This keeps the label count equal to the feature count.
example_tokens = [tokenizer.index_word.get(int(token_id), '<PAD>') for token_id in example_pad[0]]

# predict_proba returns a single output, so expected_value is a scalar and
# shap_values has one row per explained sample.
shap.force_plot(explainer.expected_value, shap_values[0], feature_names=example_tokens)
