# Importing Dependencies

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import os
import re
import tensorflow as tf

# Load data

In [2]:
# Read the training CSV and discard the 'prompt', 'source' and 'fold' columns,
# which are not used anywhere below; then preview the first rows.
df_train = pd.read_csv(
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_02.csv"
).drop(columns=['prompt', 'source', 'fold'])
df_train.head()

Unnamed: 0,essay_id,text,label
0,E0B86027C5C5,"Okay, here is my essay:\n\nWorking or doing th...",1
1,8B9EEF86DD66,"Dear Principal,\r\n\r\nI just want to let you ...",0
2,116FB053BEC7,I believe they should change the voting to pop...,0
3,A72A4397F9F5,One example of an inspiring individual who has...,1
4,D90A159EDD27,"Dear Mr. Principal,\n\nI believe that students...",0


# Data Preprocessing

This section converts the essay text to numeric sequences through tokenization, one-hot encodes the labels, and pads the sequences to a uniform length. The resulting data is shaped for compatibility with a neural network model.

In [3]:
# Cast the essay column to str in place, then take an independent copy of the
# frame under the name `data`, which the following cells read from.
df_train["text"] = df_train["text"].astype("str")
data = df_train.copy(deep=True)

In [4]:
# Take the target column from `df_train` (of which `data` is a copy at this
# point). `data` is rebound to a NumPy array in a later cell, so indexing
# `data['label']` here would fail if this cell were re-run after that one.
labels = df_train['label']

In [5]:
# NOTE(review): `np_utils` is not referenced in the visible cells and is
# imported from the private `tensorflow.python` namespace; it is kept here only
# to leave the notebook's imports untouched.
from tensorflow.python.keras.utils import np_utils
from tensorflow.keras.utils import to_categorical

# One-hot encode the class labels to match the 2-unit softmax output layer.
# The encoding is built from the raw `df_train['label']` column rather than from
# `labels` itself, so re-running this cell does not one-hot encode an array
# that has already been one-hot encoded.
labels = to_categorical(np.asarray(df_train['label']))

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

max_sequence_length = 200  # used as `maxlen` when the sequences are padded
max_words = 2500           # passed to Tokenizer as `num_words`

# Fit and encode from `df_train['text']` rather than `data.text`: `data` is
# rebound to the padded NumPy array in the next cell, so `data.text` would raise
# AttributeError if this cell were re-run afterwards. The text is the same,
# because `data` was created as a copy of `df_train`.
essay_texts = df_train['text']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(essay_texts)
sequences = tokenizer.texts_to_sequences(essay_texts)

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every token sequence to exactly `max_sequence_length` entries.
# `padding` and `truncating` are written out at their Keras default ('pre').
data = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='pre',
    truncating='pre',
)
data.shape

(39785, 200)

# Model Building

This section trains a neural network model using Keras with the TensorFlow backend. The model consists of an embedding layer, a bidirectional LSTM layer, global max pooling, and a dense layer with softmax activation. It is compiled with binary crossentropy loss and the Adam optimizer, then trained on the preprocessed text data (`data`) and the corresponding labels (`labels`) for 20 epochs with a batch size of 48.

In [8]:
from tensorflow.keras.layers import Dense,  LSTM, Embedding  
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.layers import Dense, Input, Input, Flatten, Dropout, BatchNormalization 

In [9]:
embedding_dim = 32

# Layer stack, in order:
#   1. Embedding over a `max_words`-sized vocabulary, `embedding_dim` outputs.
#   2. Bidirectional LSTM (16 units per direction) returning the full sequence.
#   3. Global max pooling over the sequence axis.
#   4. Dense layer with 2 softmax outputs (one per class).
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_sequence_length),
    Bidirectional(
        LSTM(16, return_sequences=True, dropout=0.4, recurrent_dropout=0.1)
    ),
    GlobalMaxPool1D(),
    Dense(2, activation='softmax'),
])

In [10]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
# NOTE(review): the targets are one-hot (to_categorical) and the output layer is
# a 2-unit softmax; 'categorical_crossentropy' is the conventional loss for that
# setup — confirm that 'binary_crossentropy' is intentional.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# Train on the full padded training set; no validation data is passed.
model.fit(
    x=data,
    y=labels,
    epochs=20,
    batch_size=48,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7fb80dbe1240>

# Making Predictions

This section uses the trained model (`model`) to predict the probabilities of each class for the test essays (in this case, binary classification with two classes). It then extracts the predicted probabilities for the positive class (class 1) and stores them in the `pred` variable. These probabilities can be used for further analysis or evaluation.

In [12]:
# Load the test essays and push them through the same preprocessing as the
# training text: cast to str (as done for `df_train['text']`), encode with the
# already-fitted tokenizer, and pad to `max_sequence_length`.
df_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
# Without the cast, a missing or non-string cell would reach the tokenizer as a
# non-text value, which the training path never allows.
test_texts = df_test['text'].astype(str)
test_sequences = tokenizer.texts_to_sequences(test_texts)
testdata = pad_sequences(test_sequences, maxlen=max_sequence_length)

# `predicted` has one column per softmax output; column 1 is kept as the
# class-1 probability.
predicted = model.predict(testdata)
pred = predicted[:, 1]



In [13]:
# Display the class-1 probabilities (column 1 of the model output) for the test essays.
# NOTE(review): the recorded output shows the same value for every row —
# presumably the test texts map to identical padded sequences; verify against
# the contents of the test file.
pred

array([0.9492775, 0.9492775, 0.9492775], dtype=float32)