In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv


# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

2025-05-02 08:49:27.561652: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746175767.778789      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746175767.844519      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Loading Dataset

In [3]:
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [4]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


# Data Analysis

In [5]:
df.columns

Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')

In [6]:
df.columns = ['target', 'id', 'date', 'query', 'user', 'text']

In [7]:
df = df[['target', 'text']]

In [8]:
df.head()

Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [9]:
df['target'].unique()

array([0, 4])

In [10]:
df['target'].replace({4:1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['target'].replace({4:1}, inplace=True)


In [11]:
df['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

The data is perfectly balanced

In [12]:
df.shape

(1599999, 2)

# Data Preprocessing

In [13]:
def clean_text(text):
    text = re.sub(r'@\w+', '', text) 
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    return text.lower()

df['text'] = df['text'].apply(clean_text)

# Tokenization

In [14]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])

In [15]:
sequences = tokenizer.texts_to_sequences(df['text'])

In [16]:
max_length=50
pad_seq = pad_sequences(sequences, maxlen=max_length, truncating='post')

# Splitting Data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(pad_seq, df['target'], test_size=0.2)

# Model Architecture

In [18]:
from tensorflow.keras.layers import Dropout

In [19]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    
    Bidirectional(LSTM(32, return_sequences=True)),  
    Bidirectional(LSTM(16)),  

    Dropout(0.4),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

I0000 00:00:1746175833.066720      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1746175833.067398      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [20]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [21]:
model.summary()

# Training Model

In [22]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.25,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)]
)

Epoch 1/5


I0000 00:00:1746175839.425930      92 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 13ms/step - accuracy: 0.7623 - loss: 0.4857 - val_accuracy: 0.8065 - val_loss: 0.4180
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 13ms/step - accuracy: 0.8155 - loss: 0.4082 - val_accuracy: 0.8184 - val_loss: 0.4045
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 13ms/step - accuracy: 0.8250 - loss: 0.3895 - val_accuracy: 0.8206 - val_loss: 0.3997
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 13ms/step - accuracy: 0.8317 - loss: 0.3776 - val_accuracy: 0.8237 - val_loss: 0.3922
Epoch 5/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 13ms/step - accuracy: 0.8369 - loss: 0.3694 - val_accuracy: 0.8229 - val_loss: 0.3999


# Model On Unseen Data

In [23]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc*100:.2f}%")

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 5ms/step - accuracy: 0.8243 - loss: 0.3973
Test Accuracy: 82.48%


In [3]:
# We can still train our model for more number of epochs for much large accuracy