In [2]:
import time
from tqdm import tqdm
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import tensorflow as tf
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
import os
from keras.models import load_model
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Fetch the NLTK resources the preprocessing below relies on:
#   punkt     -> tokenizer models used by word_tokenize
#   stopwords -> word lists behind stopwords.words('english')
#   wordnet   -> corpus that WordNetLemmatizer.lemmatize looks words up in
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [3]:
# Read the essays CSV from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data/train_essays/train_essays.csv')

# Leave the frame as the cell's last expression so the notebook renders it
df

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [4]:
# Make sure the scratch directory for intermediate files exists
os.makedirs('Data/temporary', exist_ok=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Persist the training split (index not written)
train_df.to_csv('Data/temporary/Temp_Train_Data.csv', index=False)

# Strip the final column from the held-out split, then persist it the same way
test_df = test_df.iloc[:, :-1]
test_df.to_csv('Data/temporary/Temp_Test_Data.csv', index=False)

In [5]:
# Read both splits back from the CSV files under Data/temporary (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/temporary/Temp_Train_Data.csv',
        'Data/temporary/Temp_Test_Data.csv',
    )
)


In [6]:
# Display the training data: leaving `train_df` as the cell's last expression
# makes the notebook render the DataFrame as the cell output
train_df

Unnamed: 0,id,prompt_id,text,generated
0,e0dbb2e7,0,"Cars, they make life so much easier, or, do th...",0
1,3d75a33b,0,Now a days you see everyone with cars driving ...,0
2,6049a24f,1,Presidential election is held every after four...,0
3,cecf6e5e,1,I dont think that the electoral college should...,0
4,669e6f61,0,How could we get the reduction of Greenhouse G...,0
...,...,...,...,...
1097,c7c1cf69,0,People all over the world are saying goodbye t...,0
1098,cf8af518,1,"Dear state senator, I believe that we shouldn'...",0
1099,f01dd0a6,0,The modern automobile has been the axle to the...,0
1100,9b753df1,1,The Electoral College is a process that should...,0


In [7]:
# Display the testing data: leaving `test_df` as the cell's last expression
# makes the notebook render the DataFrame as the cell output
test_df

Unnamed: 0,id,prompt_id,text
0,70d7c567,0,Can you imagine living in a place where there ...
1,81977e6c,0,Limiting car usage could have many advantages ...
2,e43869b8,0,"In this generation, our planet is getting filt..."
3,d33eca96,0,Cars are a basic need for people today we use ...
4,71f7131e,1,Why do we keep this despised method of choosin...
...,...,...,...
271,62b480e1,0,The culture of the car has been coming to an e...
272,7405b110,0,"Cars, though useful, have negative impacts on ..."
273,47e743a7,0,The extensive use of car transportation is set...
274,95d41c3e,0,Do you know what pollution is? Have you ever t...


In [8]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the function to preprocess the text
def preprocess_text(text):
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords and lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    
    return words

# Initialize tqdm for pandas
tqdm.pandas()

# Define a function to apply to the DataFrame
def preprocess_df(df):
    # Apply the preprocess_text function to the 'text' column with a progress bar
    return df['text'].progress_apply(preprocess_text)

# Start the timer
start_time = time.time()

# Preprocess the text
train_df['text'] = preprocess_df(train_df)

# Train a Word2Vec model
model = Word2Vec(train_df['text'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Convert words into numerical representations
train_df['text'] = train_df['text'].apply(lambda text: [model.wv[word] for word in text])

# Pad the sequences
train_df['text'] = pad_sequences(train_df['text']).tolist()

# Convert the list of sequences into a numpy array
X = np.array(train_df['text'].tolist())

# y is target variable
y = np.array(train_df['generated'].tolist())

# End the timer and print the elapsed time
end_time = time.time()
elapsed_time = end_time - start_time
hours, rem = divmod(elapsed_time, 3600)
minutes, seconds = divmod(rem, 60)
print(f"Elapsed time: {int(hours):02d}:{int(minutes):02d}:{seconds:05.2f}")

100%|██████████| 1102/1102 [01:31<00:00, 12.04it/s]


Elapsed time: 00:01:44.26


In [10]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

# Define the model
model = Sequential()
model.add(SimpleRNN(100, input_shape=(None, 100)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
history = model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# Evaluate the model
loss, accuracy = model.evaluate(X, y)
print(f'Accuracy: {accuracy * 100}%')

Accuracy: 99.90925788879395%


In [13]:
# Save the model
model.save('my_model.h5')

# Load the model
from keras.models import load_model
model = load_model('my_model.h5')

  saving_api.save_model(


In [14]:
# Preprocess the test data
test_df['text'] = preprocess_df(test_df)
test_df['text'] = test_df['text'].apply(lambda text: [model.wv[word] for word in text])
test_df['text'] = pad_sequences(test_df['text']).tolist()

# Convert the list of sequences into a numpy array
X_test = np.array(test_df['text'].tolist())
y_test = np.array(test_df['generated'].tolist())

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

100%|██████████| 276/276 [00:22<00:00, 12.06it/s]


AttributeError: 'Sequential' object has no attribute 'wv'