In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder  # For label encoding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the CSV paths read in later cells live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training set from the mounted Drive and print its (rows, columns) shape.
df_ip_train = pd.read_csv("/content/drive/MyDrive/Sentiment_data/train.csv")
print(df_ip_train.shape)

(5600, 3)


In [None]:
# Preprocessing: clean the tweets
def clean_text(text):
    """Normalise a raw tweet before tokenisation.

    Removes URLs, @mentions and #hashtags, then drops every remaining
    character that is not an ASCII letter or whitespace, and lower-cases
    the result.

    Args:
        text: Raw tweet string.

    Returns:
        The cleaned, lower-cased string. Whitespace is not collapsed, so
        removed tokens may leave consecutive spaces behind.
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    # Hashtags have to go BEFORE the non-letter filter below: that filter
    # deletes the '#' itself, after which this pattern could never match and
    # the hashtag word would stay in the text.
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetical characters
    text = text.lower()  # Convert to lowercase
    return text

In [None]:
# Clean every training tweet, then build the vocabulary from the cleaned text.
tokens = [clean_text(token) for token in df_ip_train['tweet'].values]

# The tokenizer is fitted on the training tweets only; word_index maps each
# word it saw to its integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
word_index = tokenizer.word_index

In [None]:
# function to preprocess input data
def preprocess(df, max_sequence_length=100):
  """Turn a tweet DataFrame into model-ready inputs and integer labels.

  Args:
    df: DataFrame with a 'tweet' column (raw text) and a 'sentiment'
      column (class labels).
    max_sequence_length: Length every token sequence is padded or
      truncated to. Defaults to 100, the value previously hard-coded here.

  Returns:
    A tuple (padded_sequences, desired_op): the padded integer sequences
    produced by the module-level `tokenizer`, and the integer-encoded
    sentiments.
  """
  tweets = df['tweet'].values
  sentiments = df['sentiment'].values

  # Convert the string sentiments to integers using LabelEncoder.
  # LabelEncoder numbers the classes in sorted order (e.g. if the labels are
  # "negative" and "positive" they become 0 and 1).
  # NOTE(review): a fresh encoder is fitted on every call, so the mapping is
  # only the same across calls when each DataFrame contains the same set of
  # labels.
  encoder = LabelEncoder()
  desired_op = encoder.fit_transform(sentiments)

  cleaned_tweets = [clean_text(tweet) for tweet in tweets]

  # Tokenize the text data with the already-fitted module-level tokenizer
  sequences = tokenizer.texts_to_sequences(cleaned_tweets)

  # Pad the sequences to ensure uniform length
  # (named padded_sequences rather than `input` to avoid shadowing the builtin)
  padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
  return padded_sequences, desired_op

In [None]:
# get GloVe
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-04-23 07:10:20--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-04-23 07:12:59 (5.18 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
embedding_dim = 200  # Width of the GloVe vectors to load
embeddings_index = {}  # word -> float32 vector

# Read the GloVe file and store the embeddings.
# The file name is derived from embedding_dim so the vectors loaded here always
# have the width that the rest of the notebook uses for embedding_dim.
with open(f'glove.6B.{embedding_dim}d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is split on whitespace: the first field is the word, the
        # remaining fields are the vector components.
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. One extra row is allocated because index 0 is not used by
# any entry of word_index in this loop, and it stays all zeros.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Word has no GloVe vector: leave its row as zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# RNN model definition
class RNNModel:
    """Wrapper that builds a compiled Keras SimpleRNN binary classifier.

    The network is: a frozen Embedding layer initialised from
    `embedding_matrix`, one SimpleRNN layer, and a single sigmoid output unit.
    """

    def __init__(self, word_index, embedding_matrix, embedding_dim=200, max_sequence_length=100,
                 rnn_units=25):
        """Store the configuration and build the model immediately.

        Args:
            word_index: Mapping word -> integer id; its length + 1 is used as
                the embedding layer's vocabulary size.
            embedding_matrix: Pre-trained weights passed to the embedding
                layer.
            embedding_dim: Width of each embedding vector.
            max_sequence_length: Length of the input sequences.
            rnn_units: Number of units in the SimpleRNN layer. Defaults to
                25, the value previously hard-coded in build_model.
        """
        # Initialize the parameters required for the model
        self.word_index = word_index
        self.embedding_matrix = embedding_matrix
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        # Must be set before build_model() runs, since build_model reads it.
        self.rnn_units = rnn_units
        self.model = self.build_model()

    def build_model(self):
        """Assemble and compile the Sequential model.

        Returns:
            The compiled model (binary cross-entropy loss, 'adam' optimizer,
            accuracy metric).
        """
        # Build the RNN model
        model = Sequential()
        # Add the embedding layer with pre-trained GloVe embeddings;
        # trainable=False keeps those weights fixed during training.
        model.add(Embedding(len(self.word_index) + 1, self.embedding_dim,
                            weights=[self.embedding_matrix], input_length=self.max_sequence_length,
                            trainable=False))

        # Add SimpleRNN layer (dropout explicitly disabled)
        model.add(SimpleRNN(self.rnn_units, dropout=0.0, recurrent_dropout=0.0))

        # Add the output layer with a sigmoid activation for binary classification
        model.add(Dense(1, activation='sigmoid'))

        # Compile the model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        return model

    def get_model(self):
        """Return the model built in __init__."""
        return self.model


In [None]:
# Build the classifier. embedding_dim is passed from the variable used when
# the GloVe vectors were loaded (instead of repeating the literal) so the
# embedding layer always matches embedding_matrix's width.
# max_sequence_length=100 matches the length preprocess() pads sequences to.
rnn_model = RNNModel(word_index, embedding_matrix, embedding_dim=embedding_dim, max_sequence_length=100)

# Get the model
sentiment_model = rnn_model.get_model()

# Preprocess training data
input_train, desired_op_train = preprocess(df_ip_train)

# Train model
sentiment_model.fit(input_train, desired_op_train, epochs=5, batch_size=64)



Epoch 1/5
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.8423 - loss: 0.3970
Epoch 2/5
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.9359 - loss: 0.2160
Epoch 3/5
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.9365 - loss: 0.1864
Epoch 4/5
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.9473 - loss: 0.1533
Epoch 5/5
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.9522 - loss: 0.1443


<keras.src.callbacks.history.History at 0x7d32d5887090>

In [None]:
# Load the held-out test set from the mounted Drive and print its shape.
df_test = pd.read_csv("/content/drive/MyDrive/Sentiment_data/test.csv")
print(df_test.shape)
# Apply the same cleaning, tokenizing and padding used for the training data.
# NOTE(review): preprocess() fits a new LabelEncoder on these labels, so the
# label-to-integer mapping matches training only if both files contain the
# same set of sentiment labels — confirm.
input_test, desired_op_test = preprocess(df_test)
# evaluate() returns the loss followed by the compiled metric (accuracy).
loss, accuracy = sentiment_model.evaluate(input_test, desired_op_test)
print( loss, accuracy)

(1400, 3)
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9427 - loss: 0.1895
0.20046718418598175 0.9392856955528259
