In [2]:
import tensorflow as tf

## Getting Data

In [4]:
import zipfile

with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
    zip_ref.extractall('sentiment140')

In [7]:
import pandas as pd

df = pd.read_csv('sentiment140/training.1600000.processed.noemoticon.csv', 
                 encoding='latin-1', header=None, )

In [8]:
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

In [10]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [15]:
df['target'] = df['target'].replace(4, 1)

In [16]:
df['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

## Preprocessing Data

### Cleaning Sentences

In [18]:
import re

def clean_tweet(text):
    """
    Clean a tweet by removing URLs, mentions, hashtags, 
    punctuation, and converting to lowercase.
    """
    text = text.lower()                                # Lowercase all text
    text = re.sub(r'http\S+|www\S+', '', text)         # Remove URLs
    text = re.sub(r'@\w+', '', text)                   # Remove mentions
    text = re.sub(r'#', '', text)                      # Remove '#' from hashtags
    text = re.sub(r'[^a-z\s]', '', text)               # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()           # Remove extra whitespace
    return text

In [19]:
df['clean_text'] = df['text'].apply(clean_tweet)

In [21]:
df['text'][0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [22]:
df['clean_text'][0]

'a thats a bummer you shoulda got david carr of third day to do it d'

### Tokenization and Padding of Cleaned Tweets

##### Tokenization

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 10000  # You can adjust this based on dataset size
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'])

sequences = tokenizer.texts_to_sequences(df['clean_text'])

In [32]:
sequences[0]

[5, 102, 5, 1207, 8, 3426, 49, 863, 9709, 13, 1842, 32, 3, 41, 10, 384]

In [33]:
df['clean_text'][0]

'a thats a bummer you shoulda got david carr of third day to do it d'

In [35]:
tokenizer.word_index['thats']

102

#### Padding

In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 32  # Tweets are short; 32 is usually plenty
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [41]:
sequences[0]

[5, 102, 5, 1207, 8, 3426, 49, 863, 9709, 13, 1842, 32, 3, 41, 10, 384]

In [39]:
padded_sequences[0]

array([   5,  102,    5, 1207,    8, 3426,   49,  863, 9709,   13, 1842,
         32,    3,   41,   10,  384,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [43]:
type(df['target'].values)

numpy.ndarray

## Model (Neural Network)

### Train Test Split

In [44]:
X = padded_sequences
y = df['target'].values

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

### Building and Training an RNN Model

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout

#### Building

In [54]:
# Define parameters
embedding_dim = 100

In [60]:
# Build the model
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))  # Use max_length from padded sequences
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

In [61]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 100)           1000000   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,046,465
Trainable params: 1,046,465
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
    verbose=1
)

#### Training

In [63]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
    verbose=1
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Upgraded Model

#### Building

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional

model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))  # input shape = (max_length,)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [67]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 32, 100)           1000000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              234496    
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 64)                16448     
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                      

#### Training

In [68]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,               # Stop if val_loss doesn't improve for 2 epochs
    restore_best_weights=True
)

In [69]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,                # Start with 20 (early stopping will likely stop sooner)
    batch_size=128,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
