In [None]:
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from keras.callbacks import EarlyStopping
from data_utils import train_texts
import matplotlib.pyplot as plt

In [None]:
# Hyperparameters shared by the tokenizer, the network and the training loop.
MAX_WORDS = 8_000           # vocabulary size handed to the tokenizer and the embedding layer
MAX_SEQUENCE_LENGTH = 500   # number of tokens every review is padded to
EMBEDDING_DIM = 100         # width of each word-embedding vector
BATCH_SIZE = 128            # samples per gradient update
NUM_EPOCHS = 15             # epoch budget passed to model.fit

In [None]:
# Load data
# Resolve the directory that contains the `train` and `test` folders.
# `__file__` only exists when this code runs as a script/module; inside a
# Jupyter kernel it is undefined and would raise NameError, so fall back to
# the kernel's current working directory in that case.
try:
    current_dir = os.path.abspath(os.path.dirname(__file__))
except NameError:
    current_dir = os.getcwd()

# Construct paths to train and test folders
train_dir = os.path.join(current_dir, 'train')
test_dir = os.path.join(current_dir, 'test')

In [None]:
def _load_reviews(split_dir):
    """Read every ``.txt`` file under ``split_dir/neg`` and ``split_dir/pos``.

    Args:
        split_dir: Directory that contains the ``neg`` and ``pos`` sub-folders.

    Returns:
        A ``(texts, labels)`` pair of parallel lists. ``texts`` holds the file
        contents decoded as UTF-8; ``labels`` holds 0 for files found in
        ``neg`` and 1 for files found in ``pos``.

    Raises:
        OSError: If a category folder is missing or a file cannot be read.
    """
    texts, labels = [], []
    for label, category in enumerate(('neg', 'pos')):
        category_dir = os.path.join(split_dir, category)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order identical on every run and platform.
        for fname in sorted(os.listdir(category_dir)):
            if not fname.endswith('.txt'):
                continue
            with open(os.path.join(category_dir, fname), 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels


# NOTE: `train_texts` is also imported from `data_utils` at the top of the
# notebook; the assignment below rebinds that name to the reviews read from disk.
# Samples come out grouped by class: all 'neg' reviews first, then all 'pos'.
train_texts, train_labels = _load_reviews(train_dir)
test_texts, test_labels = _load_reviews(test_dir)

In [None]:
# Build the vocabulary from the training reviews only, then reuse the fitted
# tokenizer to encode both splits.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_texts)

# Text -> list of integer word indices (train first, then test).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
train_data, test_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences)
)

In [None]:

# CNN text classifier: embedding -> two conv/pool stages -> dense head with a
# single sigmoid output unit.
model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    # First convolution / pooling stage
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    # Second convolution / pooling stage
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    # Classification head
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Compile model
model.compile(optimizer=Adam(), loss=binary_crossentropy, metrics=['accuracy'])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# The training samples are loaded class by class (every 'neg' review, then
# every 'pos' review), and Keras carves `validation_split` out of the *last*
# samples before it does any shuffling. Left in that order, the 20% hold-out
# would be taken purely from the tail of the 'pos' block, so `val_loss` (and
# with it early stopping) would be measured on one class only. Shuffle once
# with a fixed seed so the hold-out is mixed and reproducible.
train_targets = np.array(train_labels)
shuffle_order = np.random.RandomState(42).permutation(len(train_targets))

# Train model with early stopping
model.fit(
    train_data[shuffle_order],
    train_targets[shuffle_order],
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
    validation_split=0.2,
)

In [None]:
# Score the trained model on the held-out test split; with a single metric
# configured, evaluate() yields the loss followed by the accuracy.
test_targets = np.asarray(test_labels)
loss, accuracy = model.evaluate(test_data, test_targets)

print("Test Loss: " + str(loss))
print("Test Accuracy: " + str(accuracy))

In [None]:
from sklearn.metrics import classification_report

# Predict test labels.
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the documented replacement is to threshold `predict()` at 0.5,
# which yields the same int32 0/1 array and also works on older versions.
y_pred = (model.predict(test_data) > 0.5).astype("int32")

# Print classification report
print(classification_report(test_labels, y_pred))


In [None]:
# Train model with early stopping and save history
# NOTE(review): `model` has already been fitted by the earlier training cell, so
# this call continues from those weights instead of starting fresh; the curves
# below therefore describe only these additional epochs.
#
# The training samples are loaded class by class ('neg' first, then 'pos') and
# Keras takes `validation_split` from the *last* samples before shuffling, so
# the data is shuffled once with a fixed seed to give a mixed, reproducible
# validation set.
train_targets = np.array(train_labels)
shuffle_order = np.random.RandomState(42).permutation(len(train_targets))
history = model.fit(
    train_data[shuffle_order],
    train_targets[shuffle_order],
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
    validation_split=0.2,
)

# Plot training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Calculate probabilities for positive class.
# `Sequential.predict_proba` was removed in TensorFlow 2.6; for this model the
# sigmoid output of `predict()` already is the positive-class probability.
y_prob = model.predict(test_data)

# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(test_labels, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
