In [None]:
import numpy as np
import pandas as pd
import bz2
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from keras.models import save_model

# Function to load and extract labels and texts from the file
def load_extract(file, max_samples=None):
    """Parse ``__label__N <text>`` lines into a label array and a text list.

    The label is the single digit found right after the ``__label__`` prefix
    (index 9). It is shifted down by one, so a file label of 1 becomes 0 and
    a file label of 2 becomes 1. Everything after the digit is the review
    text, stored with surrounding whitespace stripped.

    Args:
        file: Iterable of lines, either ``bytes`` (as yielded by
            ``bz2.BZ2File``; decoded as UTF-8) or already-decoded ``str``.
        max_samples: Optional cap on the number of samples returned.
            ``None`` reads every line; ``0`` or a negative value reads none.

    Returns:
        Tuple ``(labels, texts)``: an integer NumPy array of labels and a
        list with the matching review strings.

    Raises:
        ValueError: If the character at the label position is not a digit.
        IndexError: If a non-blank line is too short to contain a label.
    """
    label_pos = len('__label__')  # index of the label digit
    texts, labels = [], []

    # A non-positive cap means "read nothing"; checking up front avoids
    # appending one sample before the limit is ever tested.
    read_nothing = max_samples is not None and max_samples <= 0
    if not read_nothing:
        for line in file:
            record = line.decode('utf-8') if isinstance(line, (bytes, bytearray)) else line
            if not record.strip():
                continue  # blank lines carry no sample and would break the indexing below
            labels.append(int(record[label_pos]) - 1)
            texts.append(record[label_pos + 1:].strip())
            # Stop as soon as the cap is hit so no further line is consumed
            if max_samples is not None and len(labels) >= max_samples:
                break

    print('Done !')
    # dtype=int keeps the array integer-typed even when no sample was read
    return np.array(labels, dtype=int), texts

# Function to clean texts
def clean_texts(texts, stop_words=None):
    """Normalise raw review strings into lower-case, stop-word-free text.

    For every input string the function:
      1. replaces each digit with ``0``;
      2. if the text contains ``www.``, ``http:``, ``https:`` or ``.com``,
         blanks every space-delimited token that ends in a dot followed by
         three lower-case letters (a rough URL / domain filter);
      3. replaces every character that is not an ASCII letter with a space;
      4. lower-cases the text, splits on whitespace and drops stop words;
      5. joins the remaining words with single spaces.

    Progress is printed every 10000 samples and after the last one.

    Args:
        texts: Sequence of strings to clean (``len()`` is called on it).
        stop_words: Optional iterable of lower-case words to remove. When
            ``None`` the NLTK English stop-word list is used.
            NOTE(review): that default list presumably includes negations
            such as "not" - confirm, and pass a custom list if they should
            be kept for sentiment modelling.

    Returns:
        List of cleaned strings, one per input, in the same order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = set(stop_words)  # constant-time membership instead of a list scan per word

    # Raw-string patterns (a bare '\d' is an invalid escape sequence in a
    # normal string literal), compiled once outside the loop.
    digit_re = re.compile(r'\d')
    url_token_re = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
    non_letter_re = re.compile(r'[^a-zA-Z]')
    url_markers = ('www.', 'http:', 'https:', '.com')

    cleaned = []
    total_samples = len(texts)
    for i, text in enumerate(texts):
        # Kept from the original pipeline; the zeros are blanked again by
        # the letters-only filter below.
        text = digit_re.sub('0', text)
        if any(marker in text for marker in url_markers):
            text = url_token_re.sub(' ', text)

        words = non_letter_re.sub(' ', text).lower().split()
        cleaned.append(' '.join(word for word in words if word not in stop_set))

        # Print progress every 10000 samples
        if (i + 1) % 10000 == 0 or (i + 1) == total_samples:
            print(f"--{((i + 1) / total_samples) * 100:.2f}%--Done !")
    return cleaned

# Upper bounds on how many samples are read from each archive
max_train_samples = 10000
max_test_samples = 5000

# Both archives are opened before any parsing starts and are closed
# automatically when the nested blocks exit.
with bz2.BZ2File('train.ft.txt.bz2', 'r') as train_file:
    with bz2.BZ2File('test.ft.txt.bz2', 'r') as test_file:
        train_labels, train_texts = load_extract(train_file, max_samples=max_train_samples)
        test_labels, test_texts = load_extract(test_file, max_samples=max_test_samples)

# Run the same cleaning routine over the train split first, then the test split
train_texts_cleaned, test_texts_cleaned = [
    clean_texts(split) for split in (train_texts, test_texts)
]

# Vectorisation settings
max_words = 10000  # vocabulary cap handed to the tokenizer
maxlen = 100  # sequence length handed to pad_sequences

# The vocabulary is fitted on the cleaned training reviews only; the test
# reviews are just encoded with that same tokenizer.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts_cleaned)

train_sequences = tokenizer.texts_to_sequences(train_texts_cleaned)
test_sequences = tokenizer.texts_to_sequences(test_texts_cleaned)
X_train = pad_sequences(train_sequences, maxlen=maxlen)
X_test = pad_sequences(test_sequences, maxlen=maxlen)

# Targets as NumPy arrays
y_train, y_test = np.array(train_labels), np.array(test_labels)

# Hyper-parameters of the convolutional text classifier
embedding_dim = 100
filters = 128
kernel_size = 5

# Embedding -> three Conv1D stages (the first two followed by max pooling,
# the last by global max pooling) -> dense head with dropout -> one sigmoid unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Conv1D(filters, kernel_size, activation='relu'),
    MaxPooling1D(),
    Conv1D(filters, kernel_size, activation='relu'),
    MaxPooling1D(),
    Conv1D(filters, kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output paired with binary cross-entropy; accuracy is tracked as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training
batch_size = 32
epochs = 10

# Hold out a stratified 10% slice of the *training* data for the per-epoch
# validation metrics. Previously the test set itself was passed as
# validation_data, so the "held-out" test score was computed on data that had
# already been monitored during training. X_train / y_train are left intact.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)
model.fit(
    X_fit,
    y_fit,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)

# Evaluation: the test set is used only here
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Save the trained model to a file
model.save('cnn_model.h5')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done !
Done !
--100.00%--Done !
--100.00%--Done !
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8101999759674072


  saving_api.save_model(


In [None]:
from sklearn.metrics import classification_report

# Score every test review; the network's final layer is a single sigmoid unit
probabilities = model.predict(X_test)

# Scores strictly above 0.5 become class 1, everything else class 0
predictions = np.greater(probabilities, 0.5).astype(int)

# Per-class precision / recall / F1 against the true test labels
report = classification_report(y_test, predictions)

print(report)



              precision    recall  f1-score   support

           0       0.84      0.76      0.80      2435
           1       0.79      0.86      0.82      2565

    accuracy                           0.81      5000
   macro avg       0.81      0.81      0.81      5000
weighted avg       0.81      0.81      0.81      5000



In [None]:
import bz2
import pandas as pd

# Readable names for the two integer classes
label_map = {0: 'Negative', 1: 'Positive'}

# Look up the sentiment name of every label in each split
train_sentiments = [label_map[class_id] for class_id in train_labels]
test_sentiments = [label_map[class_id] for class_id in test_labels]

# Pair each sentiment with its raw (uncleaned) review text
train_df = pd.DataFrame({'Sentiment': train_sentiments, 'Review': train_texts})
test_df = pd.DataFrame({'Sentiment': test_sentiments, 'Review': test_texts})

# Preview the first ten rows of each split under its heading
for heading, frame in (("Train data:", train_df), ("\nTest data:", test_df)):
    print(heading)
    print(frame.head(10))




Train data:
  Sentiment                                             Review
0  Positive  Stuning even for the non-gamer: This sound tra...
1  Positive  The best soundtrack ever to anything.: I'm rea...
2  Positive  Amazing!: This soundtrack is my favorite music...
3  Positive  Excellent Soundtrack: I truly like this soundt...
4  Positive  Remember, Pull Your Jaw Off The Floor After He...
5  Positive  an absolute masterpiece: I am quite sure any o...
6  Negative  Buyer beware: This is a self-published book, a...
7  Positive  Glorious story: I loved Whisper of the wicked ...
8  Positive  A FIVE STAR BOOK: I just finished reading Whis...
9  Positive  Whispers of the Wicked Saints: This was a easy...

Test data:
  Sentiment                                             Review
0  Positive  Great CD: My lovely Pat has one of the GREAT v...
1  Positive  One of the best game music soundtracks - for a...
2  Negative  Batteries died within a year ...: I bought thi...
3  Positive  works fine, but Ma

In [None]:
from sklearn.metrics import f1_score

# Score the test set with the trained model and binarise at 0.5
# (class 1 when the probability is strictly greater than the threshold).
predictions = (model.predict(X_test) > 0.5).astype(int)

# Each prediction row holds a single class id; translate it to its sentiment name
predicted_sentiments = [label_map[class_id] for class_id in predictions[:, 0]]

# F1 of the 'Positive' class, computed on the string sentiment labels
f1 = f1_score(
    test_sentiments,
    predicted_sentiments,
    pos_label='Positive',
    average='binary',
)

print("F1 Score:", f1)



F1 Score: 0.8146543234193421


In [None]:
# Hand-written sample reviews used as a quick sanity check of the model
example_amazon_reviews = [
    "This product exceeded my expectations. Highly recommended!",
    "The quality of this product is very poor. I regret buying it.",
    "Great value for the price. Will buy again.",
    "I received a defective product. Disappointed with the purchase.",
    "Excellent customer service. They resolved my issue quickly.",
    "Worst product ever! Do not waste your money.",
    "Fast shipping and good packaging. Very satisfied.",
    "Not as described. Misleading product information.",
    "I love this product! It's exactly what I was looking for.",
    "Terrible experience with this seller. Avoid at all costs."
]

# Same pipeline as the training data: clean -> encode with the fitted
# tokenizer -> pad to the sequence length the model was built with.
example_amazon_reviews_cleaned = clean_texts(example_amazon_reviews)
example_amazon_sequences = tokenizer.texts_to_sequences(example_amazon_reviews_cleaned)
example_amazon_sequences_padded = pad_sequences(example_amazon_sequences, maxlen=maxlen)

# One probability per review
predictions = model.predict(example_amazon_sequences_padded)

# Decision threshold; hard-coded to 0.5 here rather than tuned
optimal_threshold = 0.5

# Scores at or above the threshold count as 'Positive'
sentiment_labels = [
    'Positive' if score >= optimal_threshold else 'Negative'
    for score in predictions[:, 0]
]

# Show each review with its predicted sentiment, followed by a blank line
for review, sentiment in zip(example_amazon_reviews, sentiment_labels):
    print(f"Review: {review}", f"Predicted Sentiment: {sentiment}", "", sep="\n")


--100.00%--Done !
Review: This product exceeded my expectations. Highly recommended!
Predicted Sentiment: Negative

Review: The quality of this product is very poor. I regret buying it.
Predicted Sentiment: Negative

Review: Great value for the price. Will buy again.
Predicted Sentiment: Positive

Review: I received a defective product. Disappointed with the purchase.
Predicted Sentiment: Negative

Review: Excellent customer service. They resolved my issue quickly.
Predicted Sentiment: Positive

Review: Worst product ever! Do not waste your money.
Predicted Sentiment: Negative

Review: Fast shipping and good packaging. Very satisfied.
Predicted Sentiment: Positive

Review: Not as described. Misleading product information.
Predicted Sentiment: Negative

Review: I love this product! It's exactly what I was looking for.
Predicted Sentiment: Positive

Review: Terrible experience with this seller. Avoid at all costs.
Predicted Sentiment: Negative

