In [None]:
# Install Kaggle API and Required Libraries
!pip install kaggle pandas numpy tensorflow scikit-learn

# Upload kaggle.json manually in Google Colab
from google.colab import files
files.upload()

# Move kaggle.json to the correct directory and set permissions
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download Sentiment140 dataset from Kaggle
!kaggle datasets download -d kazanova/sentiment140

# Unzip the dataset
!unzip sentiment140.zip



Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  sentiment140.zip
replace training.1600000.processed.noemoticon.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the raw Sentiment140 CSV; header=None because the column names are assigned below
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
)

# Give the six columns readable names
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Binarise the labels: 0 (negative) stays 0, anything else (4 = positive) becomes 1
df['target'] = df['target'].apply(lambda label: int(label != 0))

# Text cleaner: drops URLs, @mentions, #hashtags and every character that is
# not an ASCII letter or whitespace, then lower-cases the result
def preprocess_text(text):
    """Return a cleaned, lower-cased copy of ``text``.

    The patterns are removed one after another, in the order listed below.
    Whitespace left behind by a removed token is kept as-is; only the leading
    and trailing whitespace of the final string is stripped.
    """
    noise_patterns = (
        r'http\S+',      # URLs
        r'@\w+',         # mentions
        r'#\w+',         # hashtags
        r'[^A-Za-z\s]',  # special characters (anything but letters/whitespace)
    )
    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower().strip()

# Clean every tweet with preprocess_text
df['text'] = df['text'].apply(preprocess_text)

# Work on a fixed random subset for faster training (adjust size as needed)
df_sample = df.sample(n=50000, random_state=42)  # 50,000 rows

# Hold out 20% of the subset as the test split
X_train, X_test, y_train, y_test = train_test_split(
    df_sample['text'],
    df_sample['target'],
    test_size=0.2,
    random_state=42,
)

# Tokenization and padding settings
max_words = 20000    # Vocabulary size
max_length = 150     # Max length of input sequence
embedding_dim = 128  # Embedding size

# The vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences -> fixed-length arrays padded at the end
train_token_ids = tokenizer.texts_to_sequences(X_train)
test_token_ids = tokenizer.texts_to_sequences(X_test)
X_train_seq = pad_sequences(train_token_ids, maxlen=max_length, padding='post')
X_test_seq = pad_sequences(test_token_ids, maxlen=max_length, padding='post')


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

# Two stacked bidirectional LSTMs over learned embeddings, ending in a single
# sigmoid unit for binary sentiment classification.
model = Sequential([
    # Explicit input spec; replaces Embedding(input_length=...), which Keras 3
    # reports as deprecated.
    tf.keras.Input(shape=(max_length,)),
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),  # To prevent overfitting
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once validation loss stops improving and roll back to the best epoch.
# Without this the model keeps fitting the training set while validation loss
# climbs, and the final (overfitted) weights are the ones that get evaluated.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train the model.
# Validation uses a 10% slice of the training data rather than the test split,
# so the test set stays untouched until the final evaluation.
# The History object is kept in `history` instead of being echoed as the cell output.
history = model.fit(
    X_train_seq,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/10




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 34ms/step - accuracy: 0.6729 - loss: 0.5839 - val_accuracy: 0.7688 - val_loss: 0.4753
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 30ms/step - accuracy: 0.8285 - loss: 0.3931 - val_accuracy: 0.7716 - val_loss: 0.4975
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 33ms/step - accuracy: 0.8743 - loss: 0.3031 - val_accuracy: 0.7568 - val_loss: 0.5251
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 32ms/step - accuracy: 0.9037 - loss: 0.2320 - val_accuracy: 0.7531 - val_loss: 0.6870
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 36ms/step - accuracy: 0.9220 - loss: 0.1884 - val_accuracy: 0.7497 - val_loss: 0.7273
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 30ms/step - accuracy: 0.9391 - loss: 0.1497 - val_accuracy: 0.7462 - val_loss: 0.8244
Epoch 7/10
[1m625/625[0m 

<keras.src.callbacks.history.History at 0x7e133e705410>

In [None]:
# Score the trained model on the held-out test split.
# With one compiled metric, evaluate() returns the pair (loss, accuracy).
test_metrics = model.evaluate(X_test_seq, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy:.4f}")

# Test with a sample review
def predict_sentiment(text, threshold=0.5):
    """Classify ``text`` as "Positive" or "Negative" using the trained model.

    The input goes through the same pipeline as the training data:
    ``preprocess_text``, the fitted ``tokenizer``, then post-padding to
    ``max_length``.

    Args:
        text: Raw input string.
        threshold: Cut-off applied to the model's output score. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        "Positive" if the predicted score is strictly greater than
        ``threshold``, otherwise "Negative".
    """
    cleaned = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    # verbose=0 suppresses the progress bar Keras would otherwise print for
    # every single-sample call.
    prediction = model.predict(padded_sequence, verbose=0)[0][0]
    return "Positive" if prediction > threshold else "Negative"

# Example predictions: print one label per sample sentence, in order
example_reviews = [
    "I love this movie! It's amazing.",
    "This is the worst film I have ever seen.",
    "I hate a music.",
    "Excellent performance! You did a brilliant job.",
]
for review in example_reviews:
    print(predict_sentiment(review))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.7439 - loss: 1.2337
Test Accuracy: 0.7410
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Positive


In [None]:
!git clone https://github.com/bassembeso23/Task_NLP.git