# Load Libraries

In [1]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'



In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
import seaborn as sns
import matplotlib.pyplot as plt

train_df = pd.read_csv('/kaggle/input/artificial-text-detection-homework/dev.csv')

train_X = train_df['Text']
train_y = train_df['Class'].apply(lambda x: 1 if x == 'M' else 0)



Using TensorFlow backend


# Load The DistilBert for classification

In [3]:
# Load a DistilBERT model.
preset= "distil_bert_base_en_uncased"

# Use a shorter sequence length.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset,
                                                                   sequence_length=160,
                                                                   name="preprocessor_4_tweets"
                                                                  )

# Pretrained classifier.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset,
                                                               preprocessor = preprocessor, 
                                                               num_classes=2)

classifier.summary()

Downloading data from https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/vocab.txt
[1m231508/231508[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step       
Downloading data from https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/model.h5
[1m265570304/265570304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


# Prepare Data

In [5]:
from sklearn.model_selection import train_test_split
# Assuming train_X and train_y are your features and labels
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

# Compile The Model

In [6]:
# Compile
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), #'binary_crossentropy',
    optimizer=keras.optimizers.Adam(1e-5),
    metrics= ["accuracy"]  
)

# Train (Finetune) DistilBertForClassification

In [7]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience = 10, 
    min_delta = 1e-3, 
    restore_best_weights = True
)

In [8]:
EPOCHS = 50
BATCH_SIZE = 32

# Fit
history = classifier.fit(x=X_train,
                         y=y_train,
                         batch_size=BATCH_SIZE,
                         epochs=EPOCHS, 
                         validation_data=(X_val, y_val), 
                         callbacks= [early_stopping]
                        )

Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 355ms/step - accuracy: 0.7267 - loss: 0.6317 - val_accuracy: 0.9550 - val_loss: 0.2477
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 261ms/step - accuracy: 0.9402 - loss: 0.2198 - val_accuracy: 0.9775 - val_loss: 0.0810
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 260ms/step - accuracy: 0.9769 - loss: 0.0926 - val_accuracy: 0.9825 - val_loss: 0.0615
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 608ms/step - accuracy: 0.9856 - loss: 0.0591 - val_accuracy: 0.9900 - val_loss: 0.0470
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 263ms/step - accuracy: 0.9907 - loss: 0.0349 - val_accuracy: 0.9900 - val_loss: 0.0353
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 257ms/step - accuracy: 0.9918 - loss: 0.0288 - val_accuracy: 0.9875 - val_loss: 0.0436
Epoch 7/50
[1m50/50[

# Predict on test

In [25]:
df_test = pd.read_csv('/kaggle/input/artificial-text-detection-homework/test.csv')
df_test['Class'] = [('M' if np.argmax(i) == 1 else 'H') for i in classifier.predict(df_test['Text'].to_list())]

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 65ms/step


In [24]:
submission_df = pd.DataFrame({'ID': df_test['ID'], 'Class': df_test['Class']})
submission_df.to_csv('submission.csv', index = False)

# Upvote and Comment if you like this notebook😉