In [None]:
!pip install tensorflow

In [None]:
!pip install keras-core --upgrade
!pip install -q keras-nlp
!pip install seaborn
!pip install tensorflow-text==2.15.0
!pip install --upgrade tensorflow-hub
!pip install scikit-learn

In [None]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import keras_core as keras
import keras_nlp
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer


In [None]:
# Root folder (on the mounted Google Drive) that holds the datasets used below.
PROJECT_DIR = '/content/drive/MyDrive/data/'

# os.walk() silently yields nothing for a directory that does not exist, so an
# unmounted Drive would look exactly like an empty folder. Say so explicitly.
if not os.path.isdir(PROJECT_DIR):
    print(f"{PROJECT_DIR} not found - mount Google Drive before running this cell.")

# Print the full path of every file under PROJECT_DIR as a quick input check.
for dirname, _, filenames in os.walk(PROJECT_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Show the kernel's current working directory as the cell output.
os.getcwd()

In [None]:
# Mount Google Drive at /content/drive/ using the Colab helper module.
# NOTE(review): an earlier cell already walks /content/drive/MyDrive/data/, so on a
# fresh kernel that listing runs before this mount - consider moving this cell up.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Load the training essays. The path is built from PROJECT_DIR (same style as
# the CNN dataset load) so the data location is defined in exactly one place.
df_train_essays = pd.read_csv(PROJECT_DIR + 'daigt_full_dataset.csv')

In [None]:
# Bar chart of how many training essays carry each label value.
f, ax = plt.subplots(figsize=(12, 4))

sns.despine(ax=ax)
# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.countplot(data=df_train_essays,
                   x="label",
                   ax=ax)

# Read each count from the bar it annotates. value_counts() is ordered by
# frequency while the bars follow the category order, so passing its values
# positionally could attach the counts to the wrong bars.
abs_values = [int(bar.get_height()) for bar in ax.containers[0]]

ax.bar_label(container=ax.containers[0], labels=abs_values)

ax.set_title("Distribution of Labels");

In [None]:
# Read the CNN dataset and expose its "generated" column under the name "label".
df_test_essays = (
    pd.read_csv(PROJECT_DIR + 'CNN-and-Essays-Datasets/CNN Dataset.csv', encoding='utf-8')
    .rename(columns={"generated": "label"})
)

In [None]:
# Guarantee a "label" column exists; when it is absent, every row gets label 0.
if not ("label" in df_train_essays):
    df_train_essays["label"] = 0

In [None]:
# NOTE(review): `df_train_essays_ext` is not defined in any cell visible here, so this
# cell raises NameError on a fresh kernel - confirm where that frame should come from.
# Keep the first row in place and shuffle every remaining row with a fixed seed.
df_train_essays_ext_shuffled = pd.concat([df_train_essays_ext.iloc[:1], df_train_essays_ext.iloc[1:].sample(frac=1, random_state=42)])

# Renumber the index after the shuffle.
df_train_essays_ext_shuffled.reset_index(drop=True, inplace=True)
# 80/20 split (fixed seed) of every row except the first one.
train_data, test_data = train_test_split(df_train_essays_ext_shuffled.iloc[1:], test_size=0.2, random_state=42)
# Put the first row back on top of each split; only "text" and "label" are taken from the split rows.
train_data_with_header = pd.concat([df_train_essays_ext_shuffled.iloc[:1], train_data[["text", "label"]]])
# NOTE(review): `test_data_with_header` is not referenced by any later cell visible here.
test_data_with_header = pd.concat([df_train_essays_ext_shuffled.iloc[:1], test_data[["text", "label"]]])
train_data_with_header.reset_index(drop=True, inplace=True)
test_data_with_header.reset_index(drop=True, inplace=True)
# Stack the rebuilt training rows on top of the "text"/"label" columns of df_train_essays.
# NOTE(review): despite the "test" in its name, this frame is the one the later
# train_test_split cell splits into X_train/X_test - confirm that is intended.
df_test_essays_final = pd.concat([train_data_with_header[["text", "label"]], df_train_essays[["text", "label"]]])

In [None]:
# Bar chart of how many rows of the CNN dataset carry each label value.
f, ax = plt.subplots(figsize=(12, 4))

sns.despine(ax=ax)
# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.countplot(data=df_test_essays,
                   x="label",
                   ax=ax)

# Read each count from the bar it annotates. value_counts() is ordered by
# frequency while the bars follow the category order, so passing its values
# positionally could attach the counts to the wrong bars.
abs_values = [int(bar.get_height()) for bar in ax.containers[0]]

ax.bar_label(container=ax.containers[0], labels=abs_values)

ax.set_title("Distribution of Generated Text");

In [None]:
# Word count per essay (whitespace-separated tokens). Missing values count as
# zero words and any other non-string entry is stringified first, so a single
# malformed row cannot crash the cell the way `x.split()` on a float would.
df_train_essays["text_length"] = (
    df_train_essays["text"].fillna("").astype(str).str.split().str.len()
)

In [None]:
from tensorflow.python.keras.optimizers import adam_v2


In [None]:
# Load the pretrained 'distilbert-base-uncased' tokenizer from the transformers library.
# NOTE(review): `tokenizer` is not referenced by any other cell visible here - confirm it is needed.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Sequence length handed to the DistilBERT preprocessor.
SEQ_LENGTH = 512

preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    "distil_bert_base_en_uncased",
    sequence_length=SEQ_LENGTH,
)

# activation=None: the head emits raw logits, matching from_logits=True in the loss below.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    "distil_bert_base_en_uncased",
    num_classes=2,
    activation=None,
    preprocessor=preprocessor,
)

# Set trainability BEFORE compile(): Keras captures the trainable state when the
# model is compiled, so changing it afterwards requires another compile() call.
classifier.backbone.trainable = True

classifier.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy()
    ]
)

classifier.summary()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the essays (text and label) for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_test_essays_final["text"],
    df_test_essays_final["label"],
    random_state=42,
    test_size=0.30,
)

In [None]:
# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub; the
# returned object is called directly on a string tensor by encode_texts().
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
def encode_texts(texts):
    """Embed one or more texts with the module-level ``use_model``.

    Args:
        texts: A single string, a ``pd.Series`` of strings, or anything else
            ``tf.convert_to_tensor`` can turn into a string tensor (for
            example a list of strings).

    Returns:
        The output of ``use_model`` converted to a NumPy array
        (presumably one embedding row per input text).
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    elif isinstance(texts, str):
        # A bare string is treated as a batch of one.
        texts = [texts]

    # Request the string dtype up front: an empty list would otherwise be
    # inferred as float32, which cannot be cast to tf.string afterwards.
    texts_tensor = tf.convert_to_tensor(texts, dtype=tf.string)

    encoded_texts = use_model(texts_tensor)
    return encoded_texts.numpy()

In [None]:
# Small classification head on top of 512-dimensional sentence embeddings:
# dropout followed by a 2-way softmax layer.
input_layer = tf.keras.layers.Input(
    name="input_layer",
    shape=(512,),
    dtype=tf.float32,
)
dropout_layer = tf.keras.layers.Dropout(rate=0.5)(input_layer)
output_layer = tf.keras.layers.Dense(
    units=2,
    activation="softmax",
    name="output_layer",
)(dropout_layer)

model = tf.keras.Model(outputs=output_layer, inputs=input_layer)

In [None]:
# The output layer already applies softmax, hence from_logits=False.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
# Force every essay to str so the encoder only ever receives text.
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# Number of essays sent to the sentence encoder per call.
ENCODE_BATCH_SIZE = 64


def _encode_in_batches(texts, batch_size=ENCODE_BATCH_SIZE):
    """Encode a Series of texts batch by batch and return one 2-D array.

    Calling the encoder once per essay pays the model-invocation overhead for
    every row; batching amortises it and directly yields one row per essay
    instead of a stack of single-row arrays.
    """
    values = texts.tolist()
    if not values:
        # 512 matches the embedding width the following cells reshape to.
        return np.empty((0, 512), dtype=np.float32)
    chunks = [
        encode_texts(values[start:start + batch_size])
        for start in range(0, len(values), batch_size)
    ]
    return np.concatenate(chunks, axis=0)


X_train_encoded = _encode_in_batches(X_train)
X_test_encoded = _encode_in_batches(X_test)

In [None]:
# Make sure both embedding matrices are 2-D with 512 columns, the width the
# model's input layer expects.
X_train_encoded = np.reshape(X_train_encoded, (-1, 512))
X_test_encoded = np.reshape(X_test_encoded, (-1, 512))

# Train the head for 20 epochs, validating on the held-out split each epoch.
model.fit(
    X_train_encoded,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test_encoded, y_test),
)

In [None]:
def displayConfusionMatrix(y_true, y_pred, dataset):
    """Plot a binary confusion matrix and show the F1 score in its title.

    Args:
        y_true: Ground-truth class ids (0 is shown as "Not Generated", 1 as
            "Generated").
        y_pred: Per-class scores with one row per sample; the predicted class
            is the argmax over axis 1.
        dataset: Dataset name inserted into the plot title.
    """
    # Imported locally so the function does not depend on a later cell having
    # been run before it is called.
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    # Pin the label set so the matrix is always 2x2, even when one class is
    # missing from both y_true and the predictions.
    class_ids = [0, 1]
    predicted = np.argmax(y_pred, axis=1)

    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        predicted,
        labels=class_ids,
        display_labels=["Not Generated","Generated"],
        cmap=plt.cm.Blues
    )

    tn, fp, fn, tp = confusion_matrix(y_true, predicted, labels=class_ids).ravel()
    # F1 = tp / (tp + (fn + fp) / 2); report 0.0 when there are no positives at
    # all rather than dividing by zero.
    denominator = tp + ((fn + fp) / 2)
    f1_score = float(tp / denominator) if denominator else 0.0

    disp.ax_.set_title("Confusion Matrix on " + dataset + " Dataset -- F1 Score: " + str(round(f1_score, 2)))


In [None]:
# Run the trained head on the encoded test split; each row holds the softmax
# outputs of `model` for one sample.
y_pred_test = model.predict(X_test_encoded)

In [None]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison, so a column vector of shape
    (n, 1) is compared element-wise with a flat vector of shape (n,) instead
    of being broadcast into an (n, n) comparison.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels with the same number of elements.

    Returns:
        The accuracy as a float in [0, 1], or NaN when both inputs are empty.

    Raises:
        ValueError: If the inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )
    if y_true.size == 0:
        # Accuracy over zero samples is undefined.
        return float("nan")

    correct = np.sum(y_true == y_pred)
    total = y_true.size

    return float(correct / total)

# Turn the per-class scores into class ids, then report accuracy on the test split.
y_pred_labels = np.argmax(a=y_pred_test, axis=1)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_labels)
print(f"Test set accuracy: {test_accuracy:.4f}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix (with F1 in the title) for the held-out test split.
displayConfusionMatrix(y_true=y_test, y_pred=y_pred_test, dataset="Test")