<a href="https://colab.research.google.com/github/abakm/AL-ML_Assignment-1/blob/master/Assignment_06_Deep_Learning_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [23]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load dataset

In [24]:
# Load the data
df = pd.read_csv('./judge-1377884607_tweet_product_company.csv', encoding='ISO-8859-1')

# Remove ‘emotion_in_tweet_is_directed_at’ column
df.drop(columns=['emotion_in_tweet_is_directed_at'], inplace=True)

# print dataset
df.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


# Preprocessing

In [25]:
STOPWORDS = stopwords.words('english')
def clean_text(text):
    # Convert text to lowercase
    text = str(text).lower().replace('\n',' ').replace(',','').replace('.','')

    # Remove links parts
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    text = re.sub(r'@\w+|#\w+', '', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove non alphabet and stopwords
    text = ' '.join([text for text in text.split() if text.isalpha() and text not in STOPWORDS])
    return text


In [26]:
df.dropna() # Remove stopwords
df.drop_duplicates() # Remove duplicates
df['tweet_text'] = df['tweet_text'].apply(lambda tweet_text:clean_text(text=tweet_text)) # clean data
emotion_mapping = {
    'I can\'t tell': 0,
    'Negative emotion': 1,
    'No emotion toward brand or product': 2,
    'Positive emotion': 3
}# Define integer of each output

y = df['is_there_an_emotion_directed_at_a_brand_or_product'].map(emotion_mapping) # Replace each emotion with its index
y

Unnamed: 0,is_there_an_emotion_directed_at_a_brand_or_product
0,1
1,3
2,3
3,1
4,3
...,...
9088,3
9089,2
9090,2
9091,2


# Tokenization

In [27]:
tweets = df['tweet_text'].values.tolist()
tokenizer.fit_on_texts(tweets)
tokenizer.word_index
total_words = len(tokenizer.word_index) + 1
print(total_words)

9054


# Padding

In [28]:
# Convert texts to sequences
x = tokenizer.texts_to_sequences(tweets)

# Then pad the sequences
x= pad_sequences(x, maxlen=total_words, padding='pre')

# Create Model

In [29]:
model = Sequential()
model.add(Embedding(total_words,10,input_shape=(total_words,)))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(60))
model.add(Dense(50,activation= 'relu'))
model.add(Dense(len(emotion_mapping),activation = 'softmax'))


  super().__init__(**kwargs)


# Compile the model

In [30]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# fit the model

In [31]:
model.fit(x,y,epochs = 10,batch_size=128, verbose=1)

Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 705ms/step - accuracy: 0.5791 - loss: 1.0625
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 707ms/step - accuracy: 0.5895 - loss: 0.9143
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 715ms/step - accuracy: 0.6198 - loss: 0.8730
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 714ms/step - accuracy: 0.7060 - loss: 0.7423
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 715ms/step - accuracy: 0.7577 - loss: 0.6655
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 730ms/step - accuracy: 0.7920 - loss: 0.5847
Epoch 7/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 739ms/step - accuracy: 0.8088 - loss: 0.5369
Epoch 8/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 737ms/step - accuracy: 0.8105 - loss: 0.5221
Epoch 9/10
[1m72/72[0m [32m━━

<keras.src.callbacks.history.History at 0x7e4ba2595a10>

# Predict the model

In [32]:
tweet = "Just got the new iPhone, love it!"
tweet = clean_text(tweet)
tweet = tokenizer.texts_to_sequences([tweet])
tweet = pad_sequences(tweet, maxlen=total_words, padding='pre')
prediction = model.predict(tweet)
print(prediction)
predicted_index = prediction.argmax(axis=1)[0]
predicted_emotion = next((emotion for emotion, index in emotion_mapping.items() if index == predicted_index), None)
print(predicted_emotion)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397ms/step
[[0.03856758 0.08424331 0.14876638 0.72842276]]
Positive emotion
