In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
import os




In [2]:
csv_path = 'sentiment140/training.1600000.processed.noemoticon.csv'

if not os.path.exists(csv_path):
    raise FileNotFoundError(f"CSV file not found at path: {csv_path}")

# Read the CSV file
df = pd.read_csv(csv_path, encoding='latin-1', header=None)
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Keep only the relevant columns
df = df[['text', 'sentiment']]

# Convert sentiment: 0 = negative, 4 = positive
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

# Optional: Use a subset for faster training
df = df.sample(n=100000, random_state=42).reset_index(drop=True)

texts = df['text'].values
labels = df['sentiment'].values

In [3]:
vocab_size = 10000
max_len = 50

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    padded, labels, test_size=0.2, random_state=42
)

In [6]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    SimpleRNN(64),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 64)            640000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 650369 (2.48 MB)
Trainable params: 650369 (2.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
history = model.fit(
    X_train, y_train,
    epochs=3,
    batch_size=128,
    validation_data=(X_test, y_test)
)

Epoch 1/3


Epoch 2/3
Epoch 3/3


In [8]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\n Test Accuracy: {accuracy:.4f}")


 Test Accuracy: 0.7652
