In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('judge-1377884607_tweet_product_company.csv',encoding = "ISO-8859-1")

In [3]:
# Set the option to display the full text content of DataFrame columns

pd.set_option('display.max_colwidth',None)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


In [4]:
df = df.drop("emotion_in_tweet_is_directed_at", axis=1)

In [5]:
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


In [6]:
# Renaming columns 

df.columns = ['tweet', 'sentiment']

In [7]:
df.head()

Unnamed: 0,tweet,sentiment
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


In [8]:
df.shape

(9093, 2)

In [9]:
df['sentiment'].value_counts()

sentiment
No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: count, dtype: int64

In [10]:
# Count the number of missing values in the 'tweet' column

df['tweet'].isna().sum()

1

In [11]:
# handle missing values

df['tweet'].fillna('', inplace=True)

In [12]:
# remove puncuations

import string
import re
def remove_pun(text):
    text = ''.join([i for i in text if i not in string.punctuation])
    # removing URL
    text = re.sub(r'http\S+|www\S+|\S+\.com\S+', '', text, flags=re.MULTILINE)
    # removing the tags from the text
    text = re.sub(r'(@\S+) | (#\S+)', r'', text)
    # removing the RT from the text
    text = re.sub(r'\bRT\b', r'', text)
    # removing repeated characters
    return re.sub(r'(.)1+', r'1', text)


In [13]:
 # Tokenize the given text into words

import nltk
nltk.download('punkt')
def tokenize(text):
    words = nltk.word_tokenize(text)
    return words

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Remove stopwords from the given text

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def stemming(text):
  stem_text = [ps.stem(word) for word in text]
  return stem_text


In [16]:
# Lemmatize each word in the given text

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemm = WordNetLemmatizer()
def lemma(text):
    lemm_text = [wordnet_lemm.lemmatize(word) for word in text]
    return lemm_text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
def preprocess(df_col):
  corpus = []
  for item in df_col:
    new_item = remove_pun(item)
    new_item = new_item.lower()
    new_item = tokenize(new_item)
    new_item = remove_stopwords(new_item)
    new_item = lemma(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [18]:
corpus = preprocess(df['tweet'])


In [19]:
x = corpus
y = df['sentiment']

In [20]:
# Encode the target variable
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = to_categorical(y)

In [21]:
# Tokenize the text data using Keras Tokenizer

from keras.preprocessing import text
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(corpus))
tokenized_text = tokenizer.texts_to_sequences(corpus)

In [22]:
len(tokenized_text[0])

13

In [23]:
len(tokenized_text[1])

16

In [24]:
# Pad the tokenized_text to make all text sequences the same length (100)

from keras.utils import pad_sequences
x = pad_sequences(tokenized_text, maxlen=100)

In [25]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [26]:
# Define a sequential model with Embedding, SimpleRNN, Dropout, Dense, and Softmax layers
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,SimpleRNN,Dropout

# Create a Sequential model
model = Sequential()

# Add an Embedding layer
model.add(Embedding(input_dim = len(tokenizer.word_index)+1, output_dim=128, input_length=100))

# Add a SimpleRNN layer with 32 units
model.add(SimpleRNN(32))
model.add(Dropout(0.5))

# Add the final Dense layer with 4 units (for 4 classes)
model.add(Dense(4, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()
     

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1314176   
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                5152      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense (Dense)               (None, 4)                 132       
                                                                 
Total params: 1,319,460
Trainable params: 1,319,460
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.fit(x_train, y_train, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f62bd2ed70>

In [28]:
accuracy_SimpleRNN = model.evaluate(x_test, y_test)[1]
print(f'Test Accuracy: {accuracy_SimpleRNN  * 100:.2f}%')

Test Accuracy: 62.67%


## LSTM

In [29]:
# Create a Sequential model
model2 = Sequential()

# Add an Embedding layer
model2.add(Embedding(input_dim = len(tokenizer.word_index)+1, output_dim=128, input_length=100))

# Add a LSTM layer with 64 units
model2.add(LSTM(64, dropout = 0.2))

# Add the final Dense layer with 4 units (for 4 classes)
model2.add(Dense(4, activation='softmax'))

model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model2.summary() 
     

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 128)          1314176   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                                 
Total params: 1,363,844
Trainable params: 1,363,844
Non-trainable params: 0
_________________________________________________________________


In [30]:
model2.fit(x_train, y_train, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f62c08ccd0>

In [32]:
accuracy_LSTM = model2.evaluate(x_test, y_test)[1]
print(f'Test Accuracy: {accuracy_LSTM * 100:.2f}%')

Test Accuracy: 62.78%
