In [1]:
import pandas as pd
import numpy as np


Load dataset


In [2]:
data = pd.read_csv('/content/judge-1377884607_tweet_product_company.csv', encoding='latin-1')


In [3]:
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [4]:
data.shape

(9093, 3)

In [5]:
data.isna().sum()

Unnamed: 0,0
tweet_text,1
emotion_in_tweet_is_directed_at,5802
is_there_an_emotion_directed_at_a_brand_or_product,0


In [6]:
#eliminate missing values
data = data.dropna()

In [7]:
data.columns

Index(['tweet_text', 'emotion_in_tweet_is_directed_at',
       'is_there_an_emotion_directed_at_a_brand_or_product'],
      dtype='object')

In [8]:
#changing header name of columns
new_headers = ['tweet','brand','sentiment']
data.columns = new_headers
data.head()

Unnamed: 0,tweet,brand,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [9]:
data['sentiment'].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [10]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Positive emotion,2672
Negative emotion,519
No emotion toward brand or product,91
I can't tell,9


In [11]:
#rename elements of sentiment column
#rename 'No emotion toward brand or product as neutral

data['sentiment'] = data['sentiment'].replace( 'No emotion toward brand or product', 'neutral')
data['sentiment'] = data['sentiment'].replace( 'Positive emotion', 'positive')
data['sentiment'] = data['sentiment'].replace( 'Negative emotion', 'negative')
data['sentiment'] = data['sentiment'].replace( 'I can\'t tell', 'no idea')

data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,2672
negative,519
neutral,91
no idea,9


In [12]:
#label sentiment negative as 0,positive as 1,neutral as 2,no idea as 3
#label encoding for coulumn sentiment in another column
data['sentiment_label'] = data['sentiment'].factorize()[0]

data.head()


Unnamed: 0,tweet,brand,sentiment,sentiment_label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,positive,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,negative,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive,1


In [13]:
data['sentiment_label'].value_counts()

Unnamed: 0_level_0,count
sentiment_label,Unnamed: 1_level_1
1,2672
0,519
2,91
3,9


cleaning text data

In [14]:
import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words('english'))

def clean_tweet(tweet):
    # Handle potential float values (e.g., NaN)
    if isinstance(tweet, float):
        return '' # Or any other appropriate handling for float values
    #convert to lowercase
    tweet = tweet.lower()
    #remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    #remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    #remove hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    #remove numbers
    tweet = re.sub(r'\d+', '', tweet)
    #remove special characters
    tweet = re.sub(r'[^\w\s]', '', tweet)
    #remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    #remove stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    return tweet

data['cleaned_tweet'] = data['tweet'].apply(clean_tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
data.head()

Unnamed: 0,tweet,brand,sentiment,sentiment_label,cleaned_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,negative,0,g iphone hrs tweeting dead need upgrade plugin...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,positive,1,know awesome ipadiphone app youll likely appre...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,positive,1,wait also sale
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,negative,0,hope years festival isnt crashy years iphone app
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,positive,1,great stuff fri marissa mayer google tim oreil...


test train split

In [16]:
from sklearn.model_selection import train_test_split
x = data['cleaned_tweet']
y = data['sentiment_label']
#split dataset into train,test and validation
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

text vectorization

In [17]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#using tf and idf
tfidf_vectorizer = TfidfVectorizer()
x_train = tfidf_vectorizer.fit_transform(x_train)
x_test = tfidf_vectorizer.transform(x_test)
x_valid = tfidf_vectorizer.transform(x_valid)

In [18]:
#conver sparse matrices to array
x_train = x_train.toarray()
x_test = x_test.toarray()
x_valid = x_valid.toarray()

In [19]:
# Get the minimum and the maximum length of reviews (number of features after vectorization)
print("Max length of a review (number of features):: ", max(len(x_train[0]), len(x_test[0])))
print("Min length of a review (number of features):: ", min(len(x_train[0]), len(x_test[0])))

Max length of a review (number of features)::  4529
Min length of a review (number of features)::  4529


Simple RNN

In [20]:
# creating a RNN model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout
#model initialisation
RNN_model = Sequential()
#first RNN layer
RNN_model.add(SimpleRNN(units=128, input_shape=(x_train.shape[1], 1), activation='relu', return_sequences=True))
#first dropout layer for reducing overfitting
RNN_model.add(Dropout(0.2))
#second RNN layer
RNN_model.add(SimpleRNN(units=64, activation='relu'))
#second dropout layer
RNN_model.add(Dropout(0.2))
#output layer classifies input into 4 sentiment categories
RNN_model.add(Dense(units=4, activation='softmax'))
#printing model summary
RNN_model.summary()


  super().__init__(**kwargs)


Compile the model

In [21]:
RNN_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
y_train = tf.keras.utils.to_categorical(y_train, num_classes=4)
y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=4)
# Training the model
history = RNN_model.fit(x_train, y_train,batch_size=64, epochs=5,verbose=1,validation_data=(x_valid, y_valid))

Epoch 1/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 8s/step - accuracy: 0.7199 - loss: 1.0298 - val_accuracy: 0.8239 - val_loss: 0.5519
Epoch 2/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 7s/step - accuracy: 0.8057 - loss: 0.6224 - val_accuracy: 0.8239 - val_loss: 0.5458
Epoch 3/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 7s/step - accuracy: 0.8152 - loss: 0.6177 - val_accuracy: 0.8239 - val_loss: 0.5380
Epoch 4/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 7s/step - accuracy: 0.8083 - loss: 0.6231 - val_accuracy: 0.8239 - val_loss: 0.5444
Epoch 5/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 7s/step - accuracy: 0.8012 - loss: 0.6191 - val_accuracy: 0.8239 - val_loss: 0.5381


Evaluate

In [22]:
#test the model
y_test = tf.keras.utils.to_categorical(y_test, num_classes=4) # One-hot encode y_test
test_loss, test_accuracy = RNN_model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_accuracy}")


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 687ms/step - accuracy: 0.7980 - loss: 0.5972
Test Accuracy: 0.8056679964065552


Sample prediction


In [31]:
#sample prediction
sample_review1 = '.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

# Clean the sample review using the same function used for training data
sample_review1_cleaned = clean_tweet(sample_review1)

# Transform the cleaned review into a numerical representation using the same TfidfVectorizer
sample_review1_vec = tfidf_vectorizer.transform([sample_review1_cleaned])

# Convert to array (if necessary, based on your TfidfVectorizer output)
sample_review1_vec = sample_review1_vec.toarray()

# Make the prediction
prediction = RNN_model.predict(sample_review1_vec)
# Step 4: Interpret prediction
class_labels = ['negative', 'positive', 'neutral', 'no_idea']  # Same order as during training
predicted_class_index = prediction.argmax(axis=1)[0]  # Get the index of the highest probability
predicted_label = class_labels[predicted_class_index]

# Output
print(f"Prediction probabilities: {prediction}")
print(f"Predicted Sentiment: {predicted_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step
Prediction probabilities: [[0.13523446 0.8351434  0.02496624 0.00465586]]
Predicted Sentiment: positive
