In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
class CFG:
    """Notebook-wide settings shared by the vectorizer and the model."""

    # Upper bound on the vocabulary size; used for both the TextVectorization
    # `max_tokens` and the Embedding `input_dim`.
    vocab_size = 10000
    # Number of token ids produced per text by the vectorizer.
    sequence_length = 1024
    # Run-mode flag; it is not read by any of the cells visible in this notebook.
    is_training = True

In [3]:
# Load the labelled essay dataset; the path is relative to the notebook's working directory.
train_df = pd.read_csv("train_v2_drcat_02.csv")
# Preview the first rows to check the columns that were read.
train_df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [4]:
# Approximate essay length: number of pieces obtained by splitting on single spaces
# (newlines are not treated as separators).
train_df["text_length"] = train_df["text"].map(lambda essay: len(essay.split(" ")))

In [5]:
# Summary statistics (count, mean, quartiles, max) of the per-essay length column.
train_df[["text_length"]].describe()

Unnamed: 0,text_length
count,44868.0
mean,386.14244
std,225.375414
min,4.0
25%,273.0
50%,350.0
75%,451.0
max,14818.0


In [6]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
train_data, valid_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["label"],
)
# Show the resulting (rows, columns) of each part.
train_data.shape, valid_data.shape

((35894, 6), (8974, 6))

In [7]:
def create_dataset(dataframe, shuffle=True, batch_size=256, shuffle_buffer=1024):
    """Build a batched ``tf.data`` pipeline of ``(text, label)`` pairs.

    Args:
        dataframe: Table providing a ``"text"`` and a ``"label"`` column.
        shuffle: When True, elements are shuffled before batching and are
            reshuffled on every pass over the dataset.
        batch_size: Number of examples per batch.
        shuffle_buffer: Size of the shuffle buffer (only used when
            ``shuffle`` is True).

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``(text, label)``.
    """
    # Create a TensorFlow dataset from the text and label columns of the dataframe
    dataset = tf.data.Dataset.from_tensor_slices((dataframe["text"], dataframe["label"]))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True)
    # No cache() here: placed after shuffle() it would record the first epoch's
    # order and replay it on every later epoch, cancelling the reshuffling. The
    # slices are already held in memory, so caching would not save any work.
    # Batch the examples and prefetch upcoming batches while the current one is used.
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [8]:
# Training batches are shuffled; validation batches keep the row order of valid_data.
train_ds = create_dataset(train_data, shuffle=True)
valid_ds = create_dataset(valid_data, shuffle=False)

In [9]:
# Text preprocessing layer: turns each raw string into CFG.sequence_length
# integer token ids drawn from a vocabulary of at most CFG.vocab_size entries.
vectorizer = tf.keras.layers.TextVectorization(
    output_sequence_length=CFG.sequence_length,
    max_tokens=CFG.vocab_size,
    pad_to_max_tokens=True,
)
# Learn the vocabulary from the text column, reading 1024 rows at a time.
# NOTE(review): train_df is the full frame, so the vocabulary also sees the rows
# that end up in valid_data. Presumably the weights loaded later were trained
# against this exact vocabulary - confirm before switching to train_data.
vectorizer.adapt(train_df["text"], batch_size=1024)

In [10]:
# Binary classifier over raw essay strings:
# string -> token ids -> embeddings -> two stacked bidirectional LSTMs -> dense head.
model = tf.keras.Sequential([
    # Each example is a single scalar string.
    tf.keras.layers.Input(shape=(), dtype=tf.string),
    # Converts the string into CFG.sequence_length integer token ids.
    vectorizer,
    # `input_length` is intentionally not passed: Keras 3 reports it as
    # deprecated, and the sequence length is already fixed by the vectorizer.
    # mask_zero=True makes the following layers skip positions whose id is 0.
    tf.keras.layers.Embedding(
        input_dim=CFG.vocab_size,
        output_dim=64,
        mask_zero=True,
    ),
    # First recurrent layer keeps the per-token outputs so a second one can be stacked on top.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    # Second recurrent layer reduces the sequence to a single vector.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation="relu"),
    # Single sigmoid unit: a score in [0, 1] for the positive label.
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        "accuracy",
        tf.keras.metrics.AUC(name="auc"),
    ],
)



In [13]:
# Restore previously trained parameters from an HDF5 file; no training cell is visible in this notebook.
# NOTE(review): presumably the file holds layer weights only, so the vectorizer
# vocabulary must be rebuilt by the adapt() call exactly as it was at training
# time - confirm.
model.load_weights("lstm.h5")

In [None]:
# Keep the raw weight arrays in `x`, but display only their shapes: printing
# every array floods the cell output without being readable.
x = model.get_weights()

[w.shape for w in x]

In [21]:
# Score the model on the validation batches. The result is unpacked as the loss
# followed by the metrics in the order given to compile(): accuracy, then AUC.
val_loss, val_accuracy, val_auc = model.evaluate(valid_ds)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 1s/step - accuracy: 0.9912 - auc: 0.9987 - loss: 0.0276


In [24]:
# Sigmoid outputs for the validation batches; valid_ds is built with
# shuffle=False, so the predictions follow the dataset's own order.
res=model.predict(valid_ds)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 1s/step


In [61]:
# Text to score. It must be assigned before it is wrapped below; otherwise the
# cell raises NameError on a fresh kernel.
input_text = "In a quaint town, a mysterious old bookstore appeared overnight, its shelves filled with books bound in strange leather. Curiosity drew Emma inside, where she found a book titled 'The Forgotten Tales.' As she read, the stories came to life around her, transporting her to fantastical realms. She met a mischievous fairy, a kind-hearted dragon, and a lonely ghost seeking company. Each adventure taught her valuable lessons about friendship, courage, and the power of imagination. When Emma finally closed the book, the store vanished, leaving her with memories of an extraordinary journey and a newfound belief in the magic of storytelling."

# Create a DataFrame with a single row containing the input text
data = {"text": [input_text]}  # No label is needed for prediction
input_df = pd.DataFrame(data)

# Create a TensorFlow dataset from the DataFrame
input_dataset = tf.data.Dataset.from_tensor_slices(input_df["text"])
input_dataset = input_dataset.batch(1)  # Batch size of 1 since there's only one input text
input_dataset = input_dataset.cache().prefetch(tf.data.AUTOTUNE)

# Make a prediction using the model
predictions = model.predict(input_dataset)

# One row per input text; the single column is the model's sigmoid output.
# NOTE(review): presumably a value near 1 means "label 1" in the training data - confirm.
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[[0.79949]]


In [51]:
# Display the validation split (row labels are the original train_df index values).
valid_data

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,text_length
25721,Advice about receiving advice\n\nDoing somethi...,0,Seeking multiple opinions,persuade_corpus,False,414
2844,"Cars, some people think they are the most impo...",0,Car-free cities,persuade_corpus,True,434
5726,Seagoing\n\nDo you want an adventure? Maybe yo...,0,"""A Cowboy Who Rode the Waves""",persuade_corpus,True,390
19708,I think they should go by popular vote ! beacu...,0,Does the electoral college work?,persuade_corpus,True,3097
11234,Do you believe that there is a computer that c...,0,Facial action coding system,persuade_corpus,True,179
...,...,...,...,...,...,...
34531,"[Your Name]\n[Your Address]\n[City, State, ZIP...",1,Does the electoral college work?,radek_500,True,603
31905,Taking the traditional four years to complete ...,1,Distance learning,chat_gpt_moth,False,256
18193,"The future is near, and it's bringing what Ame...",0,Driverless cars,persuade_corpus,True,358
30048,"Hey, so I'm gonna write this essay about why f...",1,Seeking multiple opinions,llama2_chat,False,359


In [58]:
# Persist the whole model to a single .keras file.
# NOTE(review): the recorded run of this cell ended with a UnicodeEncodeError
# from the 'charmap' codec on an emoji character (U+1F605). This looks like a
# text asset (presumably the vectorizer vocabulary) being written with the
# platform's default encoding; starting the kernel in Python UTF-8 mode
# (PYTHONUTF8=1) is a likely workaround - confirm.
model.save("model.keras")

UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f605' in position 25589: character maps to <undefined>