<a href="https://colab.research.google.com/github/abakm/AL-ML_Assignment-1/blob/master/Assignment_06_Deep_Learning_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [1]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load dataset

In [2]:
# Load the data
df = pd.read_csv('./judge-1377884607_tweet_product_company.csv', encoding='ISO-8859-1')

# Remove ‘emotion_in_tweet_is_directed_at’ column
df.drop(columns=['emotion_in_tweet_is_directed_at'], inplace=True)

# print dataset
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


# Preprocessing

In [3]:
STOPWORDS = stopwords.words('english')
def clean_text(text):
    # Convert text to lowercase
    text = str(text).lower().replace('\n',' ').replace(',','').replace('.','')

    # Remove links parts
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    text = re.sub(r'@\w+|#\w+', '', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove non alphabet and stopwords
    text = ' '.join([text for text in text.split() if text.isalpha() and text not in STOPWORDS])
    return text


In [4]:
df.dropna() # Remove stopwords
df.drop_duplicates() # Remove duplicates
df['tweet_text'] = df['tweet_text'].apply(lambda tweet_text:clean_text(text=tweet_text)) # clean data
emotion_mapping = {
    'I can\'t tell': 0,
    'Negative emotion': 1,
    'No emotion toward brand or product': 2,
    'Positive emotion': 3
}# Define integer of each output

y = df['is_there_an_emotion_directed_at_a_brand_or_product'].map(emotion_mapping) # Replace each emotion with its index
y

Unnamed: 0,is_there_an_emotion_directed_at_a_brand_or_product
0,1
1,3
2,3
3,1
4,3
...,...
9088,3
9089,2
9090,2
9091,2


# Tokenization

In [5]:
tweets = df['tweet_text'].values.tolist()
tokenizer.fit_on_texts(tweets)
tokenizer.word_index
total_words = len(tokenizer.word_index) + 1
print(total_words)

9054


# Padding

In [6]:
# Convert texts to sequences
x = tokenizer.texts_to_sequences(tweets)


print()
# Then pad the sequences
x= pad_sequences(x, maxlen=total_words, padding='pre')

[[7, 2190, 503, 714, 48, 876, 2191, 3415], [52, 77, 1184, 10, 316, 877, 1855, 67, 134, 259, 149, 23, 2649], [141, 134, 457], [224, 193, 408, 504, 3416, 193, 7, 10], [37, 346, 1443, 50, 49, 4, 645, 1084, 199, 3417, 11, 1085, 3418, 1444], [8, 3, 46, 622, 623, 389], [5139], [377, 79, 878, 2650, 2651, 1603, 57, 29, 740], [3419, 172, 1086, 305, 2, 930, 3, 10], [3420, 216, 595, 1303, 1185, 3421, 679, 543, 5, 1445], [277, 200, 84, 1604, 1087, 97, 741, 20], [122, 11, 433, 2192, 272, 117, 141, 776, 20, 10, 217], [273, 1605, 70, 29, 97, 1186, 218, 66, 132, 20, 10, 3422], [680, 62, 4, 681, 1088, 458, 272, 84, 570, 41, 2, 19, 209], [37, 3, 10], [526, 1606, 1000, 3, 10], [596, 597, 3, 183, 10, 6, 19], [3423, 3424, 98, 281, 135, 7, 82, 409, 434, 17, 53, 931, 282], [682, 1607, 2652, 35, 3425, 134, 932, 7, 10, 2653, 598], [306, 10, 2, 1856, 646, 2654, 17, 3, 10, 596, 597], [48, 106, 32, 9, 167, 156, 48, 1001, 9, 5, 6], [177, 683, 10, 3, 1608, 3426, 77, 1304, 3427, 274, 3, 317], [1857, 92, 933, 8, 3428

# Create Model

In [7]:
model = Sequential()
model.add(Embedding(total_words,10,input_shape=(total_words,)))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(60))
model.add(Dense(50,activation= 'relu'))
model.add(Dense(len(emotion_mapping),activation = 'softmax'))


  super().__init__(**kwargs)


# Compile the model

In [8]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# fit the model

In [9]:
model.fit(x,y,epochs = 10,batch_size=128, verbose=1)

Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 712ms/step - accuracy: 0.5572 - loss: 1.0575
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 710ms/step - accuracy: 0.5937 - loss: 0.9275
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 723ms/step - accuracy: 0.5908 - loss: 0.9156
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 721ms/step - accuracy: 0.6085 - loss: 0.8770
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 728ms/step - accuracy: 0.6935 - loss: 0.7645
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 721ms/step - accuracy: 0.7472 - loss: 0.6814
Epoch 7/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 727ms/step - accuracy: 0.7726 - loss: 0.6208
Epoch 8/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 735ms/step - accuracy: 0.7918 - loss: 0.5691
Epoch 9/10
[1m72/72[0m [32m━━

<keras.src.callbacks.history.History at 0x7e4c20bb50d0>

# Predict the model

In [15]:
tweet = "Just got the new iPhone, love it!"
tweet = clean_text(tweet)
tweet = tokenizer.texts_to_sequences([tweet])
tweet = pad_sequences(tweet, maxlen=total_words, padding='pre')
prediction = model.predict(tweet)
print(prediction)
predicted_index = prediction.argmax(axis=1)[0]
predicted_emotion = next((emotion for emotion, index in emotion_mapping.items() if index == predicted_index), None)
print(predicted_emotion)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step
[[0.03687102 0.12350205 0.10770477 0.7319222 ]]
Positive emotion
