# Twitter Sentiment Analysis

In [None]:
import kagglehub
import pandas as pd

# Download the latest version of the Kaggle dataset and report where it landed.
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")
print("Path to dataset files:", path)

# NOTE(review): the column names shown by `.head()` look like a data record
# (e.g. "2401", "Borderlands", "Positive"), so these CSVs presumably have no
# header row and the first record of each file is consumed as the header.
# Reading with `header=None, names=[...]` would keep that record, but the cells
# below select and rename columns by these names, so they must change together.
train_ds, valid_ds = (
    pd.read_csv(f"{path}/{csv_name}")
    for csv_name in ('twitter_training.csv', 'twitter_validation.csv')
)

Path to dataset files: /kaggle/input/twitter-entity-sentiment-analysis


In [None]:
# Preview the first rows of the raw training frame to inspect its columns.
train_ds.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [None]:
# Keep only the sentiment-label column and the tweet-text column.
# The column names are literal values from the first CSV record because that
# record was parsed as the header (see the `train_ds.head()` output).
train_df = train_ds[['Positive', 'im getting on borderlands and i will murder you all ,']]

In [None]:
# (rows, columns) of the selected training data.
train_df.shape

(73995, 2)

In [None]:
# Drop rows with a missing label or missing text; the cleaning helpers applied
# later call string methods on every text value.
train_df = train_df.dropna()

In [None]:
# Show the current column names before renaming them.
train_df.columns

Index(['Positive', 'im getting on borderlands and i will murder you all ,'], dtype='object')

In [None]:
# Give the tweet column a usable name ('text') instead of the first tweet's content.
train_df = train_df.rename(columns={'im getting on borderlands and i will murder you all ,': 'text'})

In [None]:
# Display the training frame after column selection and renaming.
train_df

Unnamed: 0,Positive,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [None]:
import re
import string

def remove_URL(text):
  """Return `text` with every http(s):// link and www.-prefixed token deleted.

  A match runs up to the next whitespace character; the surrounding whitespace
  itself is left untouched.
  """
  return re.sub(r'https?://\S+|www\.\S+', "", text)

def remove_punctuation(text):
  """Return `text` with every character of `string.punctuation` removed.

  Only the ASCII punctuation listed in `string.punctuation` is deleted; other
  characters (e.g. typographic quotes) pass through unchanged.
  """
  # A translation table mapping a code point to None deletes that character.
  deletions = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(deletions)



In [None]:
# Strip links first, then ASCII punctuation, from every training tweet.
train_df['text'] = train_df['text'].map(remove_URL).map(remove_punctuation)

In [None]:
import nltk
# Fetch the NLTK stop-word corpus so the word list below can be loaded.
nltk.download('stopwords')

# Stored as a set because remove_stopwords does one membership test per word.
# NOTE(review): entries are presumably all lower-case — confirm against the corpus.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_stopwords(text, stopwords=None):
  """Lower-case `text`, drop stop words, and re-join the rest with single spaces.

  Each word is lower-cased *before* the stop-word lookup. Previously the lookup
  used the original casing, so capitalised stop words such as "The" or "I"
  were kept (and then lower-cased into the output).

  Args:
    text: Whitespace-separated string to filter.
    stopwords: Optional collection of lower-case words to remove. Defaults to
      the notebook-level `english_stopwords` set.

  Returns:
    The remaining lower-cased words joined by single spaces ("" if none remain).
  """
  if stopwords is None:
    stopwords = english_stopwords
  lowered_words = (word.lower() for word in text.split())
  return ' '.join(word for word in lowered_words if word not in stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Final training-text cleaning step: lower-case and drop English stop words.
train_df['text'] = train_df['text'].map(remove_stopwords)

In [None]:
# Preview the first rows of the raw validation frame.
valid_ds.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [None]:
# Keep the last two columns by position; per the `valid_ds.head()` output these
# are the sentiment label and the tweet text.
valid_df = valid_ds.iloc[:, [-2, -1]]

In [None]:
# Bare expression: not displayed because it is not the cell's last statement.
valid_df.columns

# Replace the first-record-derived column names with 'Sentiment' and 'text'
# in a single rename call.
valid_df = valid_df.rename(columns={
    'Irrelevant': 'Sentiment',
    'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text',
})

In [None]:
# Name the label column 'Sentiment' so the training frame matches valid_df.
train_df = train_df.rename(columns={'Positive': 'Sentiment'})

In [None]:
# Apply the same cleaning steps, in the same order, as for the training text.
for cleaning_step in (remove_URL, remove_punctuation, remove_stopwords):
    valid_df['text'] = valid_df['text'].map(cleaning_step)

In [None]:
# Pull the cleaned text and the label column out of each frame as NumPy arrays.
train_sentences, train_labels = (
    train_df[column].to_numpy() for column in ('text', 'Sentiment')
)
validation_sentences, validation_labels = (
    valid_df[column].to_numpy() for column in ('text', 'Sentiment')
)

In [None]:
# Number of training sentences.
train_sentences.shape

(73995,)

In [None]:
# Count missing values per column; no dropna() was applied to valid_df.
valid_df.isna().sum()

Unnamed: 0,0
Sentiment,0
text,0


In [None]:
# Number of validation sentences.
validation_sentences.shape

(999,)

In [None]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    """Count how often each whitespace-separated token occurs in `text_col`.

    Args:
        text_col: Object exposing `.values` as an iterable of strings
            (the notebook passes a DataFrame text column).

    Returns:
        A `collections.Counter` mapping token -> number of occurrences.
    """
    return Counter(
        token
        for entry in text_col.values
        for token in entry.split()
    )


# Token frequencies over the cleaned training text.
counter = counter_word(train_df.text)

# Vocabulary size; reused below as the Tokenizer's `num_words` and as the
# Embedding layer's input dimension.
num_unique_words = len(counter)

In [None]:
# `tf` is used from this cell onward but is not imported in any visible cell,
# so a fresh-kernel "Run All" fails here with NameError. Importing it in this
# cell is harmless if an import also exists elsewhere.
import tensorflow as tf

# Build the word -> integer index from the training sentences only.
# NOTE(review): per the Keras docs, `num_words=N` keeps the N-1 most frequent
# words — confirm this cap is what is intended.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)

In [None]:
# Convert each sentence into a list of integer word ids, using the index
# fitted on the training sentences for both splits.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(validation_sentences)

In [None]:
# Every sequence is padded or cut to this many tokens.
max_length = 40

# Shared settings: pad with zeros at the end and truncate from the end.
padding_options = dict(maxlen=max_length, padding='post', truncating='post')

train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, **padding_options)
val_padded = tf.keras.preprocessing.sequence.pad_sequences(val_sequences, **padding_options)
train_padded.shape, val_padded.shape

((73995, 40), (999, 40))

In [None]:
# Inspect one padded training sequence (word ids followed by zero padding).
train_padded[1]

array([  7,  95,  67, 350,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0], dtype=int32)

In [None]:
# Class distribution of the validation labels.
valid_df.Sentiment.value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,285
Positive,277
Negative,266
Irrelevant,171


In [None]:

# Integer class id for each sentiment name (same ids as the original cell).
LABEL_TO_ID = {'Negative': 0, 'Irrelevant': 1, 'Neutral': 2, 'Positive': 3}


def encode_labels(labels, mapping=LABEL_TO_ID):
    """Return a new integer NumPy array with each label replaced by its class id.

    The previous version assigned into the arrays obtained from
    `Series.to_numpy()`. Such an array may be a view of the DataFrame's data,
    so in-place writes can silently rewrite the 'Sentiment' column (or fail if
    the array is read-only). Labels missing from the mapping were also left
    as strings without any error at this point.

    Args:
        labels: Sequence of sentiment names (array, list or Series).
        mapping: Dict from sentiment name to integer class id.

    Returns:
        A freshly allocated integer array with one id per input label.

    Raises:
        ValueError: If any label is missing from `mapping`.
    """
    original = pd.Series(labels)
    encoded = original.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(original[unmapped].astype(str)))
        raise ValueError(f"Unknown sentiment label(s): {unknown}")
    return encoded.astype(int).to_numpy()


# Encode from the frames rather than from the previous arrays so that
# re-running this cell always starts from the string labels.
train_labels = encode_labels(train_df['Sentiment'])
validation_labels = encode_labels(valid_df['Sentiment'])


In [None]:
# Ensure both label arrays have an integer dtype before they are passed to
# `fit` with the sparse categorical cross-entropy loss.
train_labels = train_labels.astype(int)
validation_labels = validation_labels.astype(int)

In [None]:
# Architecture: word-id embedding -> single LSTM -> 4-way softmax
# (one output unit per sentiment class id 0..3).
embedding_layer = tf.keras.layers.Embedding(num_unique_words, 32, input_length=max_length)
recurrent_layer = tf.keras.layers.LSTM(64, dropout=0.1)
output_layer = tf.keras.layers.Dense(4, activation='softmax')
sentiment_model = tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
sentiment_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Stop after 4 epochs without any val_loss improvement and restore the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=4, restore_best_weights=True
)

history = sentiment_model.fit(
    train_padded,
    train_labels,
    validation_data=(val_padded, validation_labels),
    epochs=10,
    callbacks=[es],
)

Epoch 1/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 51ms/step - accuracy: 0.4038 - loss: 1.2497 - val_accuracy: 0.7928 - val_loss: 0.5897
Epoch 2/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 38ms/step - accuracy: 0.7535 - loss: 0.6668 - val_accuracy: 0.9159 - val_loss: 0.2778
Epoch 3/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 38ms/step - accuracy: 0.8583 - loss: 0.4087 - val_accuracy: 0.9439 - val_loss: 0.1841
Epoch 4/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 38ms/step - accuracy: 0.8925 - loss: 0.3079 - val_accuracy: 0.9489 - val_loss: 0.1682
Epoch 5/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 37ms/step - accuracy: 0.9074 - loss: 0.2541 - val_accuracy: 0.9520 - val_loss: 0.1766
Epoch 6/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 37ms/step - accuracy: 0.9193 - loss: 0.2178 - val_accuracy: 0.9560 - val_loss: 0.1783
