In [None]:
import pandas as pd
import numpy as np
from keras import losses, metrics, optimizers
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer

# Load the review texts and their sentiment labels from CSV files.
# NOTE(review): both files are named "test_*" and the label file ends in
# "_pred" — confirm these are really the intended inputs for training.
df_text = pd.read_csv('test_reviews.csv')
df_labels = pd.read_csv('test_labels_pred.csv')

# Create a mapping for sentiment labels (Negative -> 0, Positive -> 1)
label_mapping = {'Negative': 0, 'Positive': 1}
mapped_sentiment = df_labels['sentiment'].map(label_mapping)

# Series.map() turns every value that is not a key of the mapping into NaN.
# NaN targets would silently flow into the float32 label arrays and poison the
# loss, so fail loudly and name the offending labels instead.
unmapped_labels = df_labels.loc[mapped_sentiment.isna(), 'sentiment']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected sentiment label(s) in test_labels_pred.csv: "
        f"{unmapped_labels.unique().tolist()!r}; "
        f"expected one of {list(label_mapping)!r}"
    )
df_labels['sentiment'] = mapped_sentiment

# Merge text data and labels on the 'id' column (inner join: rows whose id is
# missing from either file are dropped).
merged_df = pd.merge(df_text, df_labels, on='id')

# Define the fraction of data to be used for training
train_fraction = 0.8
train_size = int(len(merged_df) * train_fraction)

# Split the data into training and testing sets by position: the first
# `train_size` rows train the model, the remainder is held out.
# NOTE(review): rows are not shuffled, so the split follows the file order —
# confirm the CSV is not sorted by label.
train_data = merged_df['text'].iloc[:train_size]
train_labels = merged_df['sentiment'].iloc[:train_size]
test_data = merged_df['text'].iloc[train_size:]
test_labels = merged_df['sentiment'].iloc[train_size:]

# Vocabulary cap handed to the tokenizer as `num_words`; it is also the width
# of the model's input layer.
max_words = 10000

# Build the vocabulary from the training split only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data)

# Encode both splits with the same fitted tokenizer in 'binary' matrix mode.
x_train, x_test = (
    tokenizer.texts_to_matrix(texts, mode='binary')
    for texts in (train_data, test_data)
)

# Labels as float32 NumPy arrays.
y_train = np.array(train_labels, dtype='float32')
y_test = np.array(test_labels, dtype='float32')

# Create a Sequential model for binary classification: two 16-unit ReLU hidden
# layers on top of the `max_words`-wide input, and one sigmoid output unit.
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(max_words,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with RMSprop optimizer, binary cross-entropy loss, and binary accuracy metric.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

# Train the model on the training data; validation_split=0.2 asks fit() to
# hold part of x_train/y_train out for per-epoch validation.
history = model.fit(x_train,
                    y_train,
                    epochs=100,
                    batch_size=512,
                    validation_split=0.2)


In [None]:
import pandas as pd
from keras.preprocessing.text import Tokenizer

# Inspection cell: show how one review looks after tokenization.
#
# Everything here uses cell-local names (`decode_df`, `decode_tokenizer`,
# `decode_vocab_size`) on purpose. Rebinding the notebook-level `merged_df` and
# `tokenizer` would replace the tokenizer the model was trained with by one
# fitted on a different corpus (different word indices), and would replace the
# label-mapped frame by one with raw string sentiments.
decode_df = pd.merge(
    pd.read_csv('test_reviews.csv'),
    pd.read_csv('test_labels_pred.csv'),
    on='id',
)

decode_vocab_size = 10000
# Initialize a separate tokenizer and fit it on the 'text' column of the merged dataframe
decode_tokenizer = Tokenizer(num_words=decode_vocab_size)
decode_tokenizer.fit_on_texts(decode_df['text'])

# Specify the row number of the text to decode (0-based position)
row_number_to_decode = 0

# Retrieve the text from the specified row; .iloc makes the lookup positional,
# matching the "0-based" meaning of row_number_to_decode.
chosen_text = decode_df['text'].iloc[row_number_to_decode]

# Tokenize the chosen text into a sequence of word indices
chosen_text_data = decode_tokenizer.texts_to_sequences([chosen_text])

# Create a reverse word index to map numeric tokens back to words
reverse_word_index = {index: word for word, index in decode_tokenizer.word_index.items()}

# Decode the numeric sequence back into text.
# Note: without an `oov_token`, texts_to_sequences() drops words outside the
# kept vocabulary rather than emitting a placeholder, so the '?' fallback is
# only a safety net and is not expected to appear in the output.
decoded_review = ' '.join(
    reverse_word_index.get(token_id, '?') for token_id in chosen_text_data[0]
)

# Print the decoded review
print(decoded_review)

In [None]:
# Score the trained model on the held-out split. The model was compiled with a
# single metric, so evaluate() yields two values: the loss and that metric.
held_out_scores = model.evaluate(x_test, y_test)
test_loss, test_accuracy = held_out_scores

# Report both numbers
print("Loss :", test_loss)
print("Accuracy :", test_accuracy)

In [None]:
# Persist the trained model to 'model.h5'. The path is relative, so the file
# is written to the kernel's current working directory; an existing file of
# that name is presumably overwritten — confirm before re-running.
model.save('model.h5')