<a href="https://colab.research.google.com/github/aganjasarthak/from_scratch/blob/main/NLP_1000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Import Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold
from nltk.corpus import movie_reviews
import nltk
import re
from nltk.corpus import stopwords

# Fetch the NLTK corpora this notebook relies on (reviews + English stopword list)
for resource in ('movie_reviews', 'stopwords'):
    nltk.download(resource)
STOPWORDS = set(stopwords.words('english'))

# Build the dataset: one space-joined string per review file, and a 0/1 label
# taken from the first path component of the file id ('pos' -> 1, anything else -> 0)
review_ids = movie_reviews.fileids()
texts = [" ".join(movie_reviews.words(review_id)) for review_id in review_ids]
labels = [int(review_id.split('/')[0] == 'pos') for review_id in review_ids]

df = pd.DataFrame({"text": texts, "label": labels})

# Preprocessing
def preprocess_text(text, stop_words=None):
    """Normalise a raw review string for tokenization.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, lowercase the result, then remove stopwords.

    Args:
        text: The raw text to clean.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the module-level ``STOPWORDS`` set is used, which keeps
            existing one-argument callers (e.g. ``Series.apply``) working.

    Returns:
        The cleaned text with the remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = STOPWORDS

    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Lowercase so the stopword comparison is case-insensitive
    # split() with no argument also collapses runs of whitespace left by the removal above
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(kept_words)

# Clean every review (punctuation removal, lowercasing, stopword removal)
df['text'] = df['text'].apply(preprocess_text)

# --- Configuration -----------------------------------------------------------
SEED = 42                    # shared by the CV splitter and the Python/NumPy/TensorFlow RNGs
max_words = 10000            # vocabulary size handed to the tokenizer and the embedding layer
max_len = 200                # every review is padded/truncated to this many token ids
k = 5                        # number of cross-validation folds
EARLY_STOPPING_PATIENCE = 3  # epochs without val_loss improvement before training stops

# Seed all RNGs up front so weight initialisation and batch shuffling are repeatable
tf.keras.utils.set_random_seed(SEED)

# --- Tokenization and Padding --------------------------------------------------
# NOTE(review): the tokenizer is fitted on the whole corpus, so its vocabulary also
# sees validation-fold text. Fit it per fold if a strictly leak-free estimate is needed.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

X = padded_sequences
y = np.array(df['label'])


def build_model(vocab_size=max_words, embedding_dim=100, lstm_units=64):
    """Return a freshly initialised, compiled BiLSTM binary sentiment classifier.

    Args:
        vocab_size: Size of the embedding vocabulary (``input_dim``).
        embedding_dim: Width of each embedding vector.
        lstm_units: Units per direction in the bidirectional LSTM.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output.
    """
    model = tf.keras.Sequential([
        # `input_length` is intentionally omitted: it is deprecated in Keras 3
        # and the sequence length is inferred from the data on first call.
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True)),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# --- K-Fold Cross Validation ---------------------------------------------------
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)
accuracy_per_fold = []

for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    print(f"\nTraining on Fold {fold}/{k}...")

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Drop Keras state left over from the previous fold before building a new model
    tf.keras.backend.clear_session()
    model = build_model()

    # Stop once val_loss has not improved for a few epochs and roll back to the
    # best weights. The previous patience of 3,000,000 exceeded the epoch budget,
    # so early stopping could never trigger and every fold ran all 100 epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=EARLY_STOPPING_PATIENCE,
                                                      restore_best_weights=True)

    # Train Model
    history = model.fit(X_train, y_train,
                        epochs=100,
                        batch_size=32,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping],
                        verbose=1)

    # Evaluate Model
    # NOTE(review): the same fold drives early stopping and is scored here, so the
    # reported accuracy is likely somewhat optimistic.
    loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
    print(f"Fold {fold} Accuracy: {accuracy:.2f}")
    accuracy_per_fold.append(accuracy)

    # Sample Predictions
    print(f"\nSample Predictions for Fold {fold}:")
    predictions = model.predict(X_val)
    predictions = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

    # Show at most 3 examples; the guard avoids an IndexError on a very small fold
    for i in range(min(3, len(val_index))):
        print(f"Review: {df['text'].iloc[val_index[i]]}")
        print(f"Actual Sentiment: {'Positive' if y_val[i] == 1 else 'Negative'}")
        print(f"Predicted Sentiment: {'Positive' if predictions[i][0] == 1 else 'Negative'}\n")

# Print Overall Results
print("\n--- k-Fold Cross-Validation Results ---")
print(f"Accuracy per fold: {accuracy_per_fold}")
print(f"Mean Accuracy: {np.mean(accuracy_per_fold):.2f}")
print(f"Standard Deviation: {np.std(accuracy_per_fold):.2f}")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Training on Fold 1/5...
Epoch 1/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 258ms/step - accuracy: 0.5302 - loss: 1.3820 - val_accuracy: 0.7425 - val_loss: 0.9491
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 232ms/step - accuracy: 0.7845 - loss: 0.7633 - val_accuracy: 0.7800 - val_loss: 0.6549
Epoch 3/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 249ms/step - accuracy: 0.9427 - loss: 0.3409 - val_accuracy: 0.7875 - val_loss: 0.6055
Epoch 4/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 263ms/step - accuracy: 0.9781 - loss: 0.1892 - val_accuracy: 0.7600 - val_loss: 0.8181
Epoch 5/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 247ms/step - accuracy: 0.9800 - loss: 0.1256 - val_accuracy: 0.7250 - val_loss: 1.1137
Epoch 6/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 227ms/step - accuracy: 0.9726 - loss: 0.1182 - val_accuracy: 0.7600 - val_loss

KeyboardInterrupt: 

In [4]:
# Show actual vs. predicted sentiment for up to 100 reviews of the last validation fold
print("\n--- First 100 Actual and Predicted Sentiments (Last Fold) ---")

# Threshold the model's sigmoid outputs at 0.5 and flatten to a 1-D array of 0/1 labels
final_predictions = (model.predict(X_val) > 0.5).astype(int).flatten()

# Cap at 100 rows, or fewer when the validation fold is smaller than that
shown = min(100, len(y_val))
for position, (truth, guess) in enumerate(zip(y_val[:shown], final_predictions[:shown]), start=1):
    actual = "Positive" if truth == 1 else "Negative"
    predicted = "Positive" if guess == 1 else "Negative"
    print(f"Review {position}: Actual: {actual}, Predicted: {predicted}")



--- First 100 Actual and Predicted Sentiments (Last Fold) ---
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step
Review 1: Actual: Negative, Predicted: Negative
Review 2: Actual: Negative, Predicted: Negative
Review 3: Actual: Negative, Predicted: Negative
Review 4: Actual: Negative, Predicted: Negative
Review 5: Actual: Negative, Predicted: Negative
Review 6: Actual: Negative, Predicted: Negative
Review 7: Actual: Negative, Predicted: Negative
Review 8: Actual: Negative, Predicted: Negative
Review 9: Actual: Negative, Predicted: Negative
Review 10: Actual: Negative, Predicted: Positive
Review 11: Actual: Negative, Predicted: Negative
Review 12: Actual: Negative, Predicted: Positive
Review 13: Actual: Negative, Predicted: Negative
Review 14: Actual: Negative, Predicted: Positive
Review 15: Actual: Negative, Predicted: Negative
Review 16: Actual: Negative, Predicted: Negative
Review 17: Actual: Negative, Predicted: Positive
Review 18: Actual: Negative, Predicted:

In [6]:
# Boolean mask: True wherever the predicted label equals the actual label
matches = (final_predictions == y_val)

# Split the correct predictions by the class that was predicted
positive_matches = (matches & (final_predictions == 1)).sum()
negative_matches = (matches & (final_predictions == 0)).sum()

# Report the total first, then the per-class counts
print("\n--- Matching Predictions Breakdown ---")
for caption, count in (
    ("Total Matching Predictions", np.sum(matches)),
    ("Positive Matches", positive_matches),
    ("Negative Matches", negative_matches),
):
    print(f"{caption}: {count}")



--- Matching Predictions Breakdown ---
Total Matching Predictions: 311
Positive Matches: 169
Negative Matches: 142


In [8]:
import pandas as pd

# google.colab is only importable inside Colab; elsewhere the file is just saved
# locally instead of the whole cell failing with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

# Prepare Data for Export: one row per review of the last validation fold
export_data = pd.DataFrame({
    # val_index holds positional row numbers, so select them in one .iloc call
    "Review": df['text'].iloc[val_index].tolist(),
    "Actual Sentiment": ["Positive" if label == 1 else "Negative" for label in y_val],
    "Predicted Sentiment": ["Positive" if pred == 1 else "Negative" for pred in final_predictions]
})

# Export to Excel
file_name = "sentiment_predictions.xlsx"
export_data.to_excel(file_name, index=False)

if files is not None:
    # Automatically download the file in Colab
    files.download(file_name)
    print(f"Predictions exported to {file_name} and downloaded!")
else:
    print(f"Predictions exported to {file_name} (not running in Colab, so no browser download).")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predictions exported to sentiment_predictions.xlsx and downloaded!
