In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Read the Data

In [None]:
import pandas as pd
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
column=['target','ids','date','flag','user','text']

In [None]:
path ='/content/drive/MyDrive/AI/NLP/twitter.csv'
df = pd.read_csv(path,encoding='ISO-8859-1',names=column)
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
df.tail(1)

Unnamed: 0,target,ids,date,flag,user,text
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [None]:
dataset = df[['text','target']]

In [None]:
dataset.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [None]:
dataset.target.unique()

array([0, 4])

In [None]:
dataset['target'] = dataset['target'].replace(4,1)
dataset.target.unique()

array([0, 1])

In [None]:
dataset.isna().sum()

Unnamed: 0,0
text,0
target,0


# Text Preprocessing

## 1.Remove URL

In [None]:
str(dataset['text'][0])

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [None]:
pattern = re.compile(r'http[s]?:\/\/\S+')
pattern.sub('',str(dataset['text'][0]))

"@switchfoot  - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [None]:
def remove_url(text):
  pattern = re.compile(r'http[s]?:\/\/\S+')
  return pattern.sub('',text)

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_url(x))

In [None]:
dataset['text'].head()

Unnamed: 0,text
0,"@switchfoot - Awww, that's a bummer. You sho..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


## 2.Remove HTML Tags

In [None]:
def remove_tag(text):
  pattern = re.compile(r'<.*?>')
  return pattern.sub('',text)

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_tag(x))

In [None]:
dataset['text'].head()

Unnamed: 0,text
0,"@switchfoot - Awww, that's a bummer. You sho..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


## 3.Handling Emoticons

In [None]:
# Emojis
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat',';D':'laughing'}


In [None]:
def remove_emoticons(text):
  for emoji in emojis:
    text = text.replace(emoji, "Emoji" + emojis[emoji])
  return text

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_emoticons(x))

In [None]:
! pip install emoji



In [None]:
import emoji

def remove_emoji(text):
  return emoji.demojize(text)

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_emoji(x))

In [None]:
text = 'Face with Hearts emoji 🥰 is often used to celebrate Valentine'
remove_emoji(text)

'Face with Hearts emoji :smiling_face_with_hearts: is often used to celebrate Valentine'

## 4.Handling User Names

In [None]:
def handle_username(text):
  pattern = re.compile(r'@[^\s]+')
  text = pattern.sub('Tuser',text)
  return text

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: handle_username(x))

In [None]:
dataset['text'].head()

Unnamed: 0,text
0,"Tuser - Awww, that's a bummer. You shoulda g..."
1,is upset that he can't update his Facebook by ...
2,Tuser I dived many times for the ball. Managed...
3,my whole body feels itchy and like its on fire
4,"Tuser no, it's not behaving at all. i'm mad. w..."


## 5.Remove Punctuation

In [None]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
punc = string.punctuation
def remove_punc(text):
  return text.translate(str.maketrans('','',punc))

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_punc(x))

In [None]:
remove_punc('Hi !, how are you?')

'Hi  how are you'

## 6.Remove Chat words or Slang Words

In [None]:
slang = '/content/drive/MyDrive/AI/NLP/slang.txt'

In [None]:
slang

'/content/drive/MyDrive/AI/NLP/slang.txt'

In [None]:
with open(slang,'r') as f:
  lines = f.readlines()

In [None]:
lines[0]

'AFAIK=As Far As I Know\n'

In [None]:
lines[0].split('=')

['AFAIK', 'As Far As I Know\n']

In [None]:
lines[0].split('=')[0]

'AFAIK'

In [None]:
lines[0].split('=')[1][:-1]

'As Far As I Know'

In [None]:
slang_dict = {}
for i in range(len(lines)):
  slang_dict[lines[i].split('=')[0]] = lines[i].split('=')[1][:-1]

In [None]:
slang_dict

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [None]:
def remove_chatwords(text):
  new_text = []
  for w in text.split():
    if w.upper() in slang_dict:
      new_text.append(slang_dict[w.upper()])
    else:
      new_text.append(w)
  return " ".join(new_text)

In [None]:
remove_chatwords('rofl ! This is so funny')

'Rolling On The Floor Laughing ! This is so funny'

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_chatwords(x))

## 7.Convert into lower case

In [None]:
dataset['text'] = dataset['text'].str.lower()

## 8.Spelling Correction




In [None]:
! pip install textblob



In [None]:
from textblob import TextBlob

str(TextBlob('I Luve Honey').correct())

'I Have Money'

In [None]:
text = 'I Luve Honey'
tl = text.split()

In [None]:
# " ".join([str(TextBlob(i).correct()) for i in text.tl()])

In [None]:
! pip install autocorrect



In [None]:
from autocorrect import Speller
spell = Speller(lang='en')
print([spell(i) for i in tl])

['I', 'Live', 'Honey']


In [None]:
! pip install pyspellchecker



In [None]:
from spellchecker import SpellChecker


In [None]:
def spell_crorect(text):
  tl = text.split()
  spell = SpellChecker()
  misspelled = spell.unknown(tl)
  return " ".join([spell.correction(i) for i in tl])

In [None]:
spell_crorect("Thes is not my shurt")

'the is not my hurt'

Note: Since none of the spell correcting module working properly therefore we are not applying it on our datset


## 9.Tokennization

In [None]:
! pip install nltk



In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
word_tokenize('Hi! How are you?')

['Hi', '!', 'How', 'are', 'you', '?']

In [None]:
def word_token(text):
  return word_tokenize(text)

In [None]:
dataset_copy = dataset.copy()

In [None]:
dataset_copy.head()

Unnamed: 0,text,target
0,tuser awww thats a bummer you shoulda got davi...,0
1,is upset that he cant update his facebook by t...,0
2,tuser i dived many times for the ball managed ...,0
3,my whole body feels itchy and like its on fire,0
4,tuser no its not behaving at all im mad why am...,0


In [None]:
dataset['text'] = dataset['text'].apply(lambda x: word_token(x))

## 10.Remove Stop Word

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
print(len(stopwords.words('english')))

179


In [None]:
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [None]:
stop_w = stopwords.words('english')

text_list = word_tokenize('i love pizza')
clean_text = [word for word in text_list if word not in stop_w]
clean_text

['love', 'pizza']

In [None]:
from functools import lru_cache

@lru_cache(maxsize=50000)
def remove_stopwords(text):
  stop_w = stopwords.words('english')
  text_list = text.split()
  clean_text = [word for word in text_list if word not in stop_w]
  return clean_text

In [None]:
remove_stopwords('i love pizza')

['love', 'pizza']

In [None]:
dataset = dataset_copy.copy()

In [None]:
dataset.head()

Unnamed: 0,text,target
0,tuser awww thats a bummer you shoulda got davi...,0
1,is upset that he cant update his facebook by t...,0
2,tuser i dived many times for the ball managed ...,0
3,my whole body feels itchy and like its on fire,0
4,tuser no its not behaving at all im mad why am...,0


In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_stopwords(x))

In [None]:
len(dataset['text'][0])

11

In [None]:
len(dataset_copy['text'][0])

88

## 11.Stemming

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
st=PorterStemmer()
stem = lru_cache(maxsize=50000)(st.stem)

def stemming_on_data(list_words):
  text = [stem(word) for word in list_words]
  return text

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: stemming_on_data(x))

In [None]:
dataset.head()

Unnamed: 0,text,target
0,"[tuser, awww, that, bummer, shoulda, got, davi...",0
1,"[upset, cant, updat, facebook, text, might, cr...",0
2,"[tuser, dive, mani, time, ball, manag, save, 5...",0
3,"[whole, bodi, feel, itchi, like, fire]",0
4,"[tuser, behav, im, mad, cant, see]",0


## 12.Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
def list_tosent(list_words):
  return " ".join(list_words)

  list_tosent(dataset['text'][0])


In [None]:
dataset['text'] = dataset['text'].apply(lambda x: list_tosent(x))

In [None]:
dataset.head()

Unnamed: 0,text,target
0,tuser awww that bummer shoulda got david carr ...,0
1,upset cant updat facebook text might cri resul...,0
2,tuser dive mani time ball manag save 50 rest g...,0
3,whole bodi feel itchi like fire,0
4,tuser behav im mad cant see,0


In [None]:
lm = WordNetLemmatizer()

@lru_cache(maxsize=50000)
def lemmatization_on_data(list_words):
  list_words = list_words.split()
  text = [lm.lemmatize(word) for word in list_words]
  return text

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: lemmatization_on_data(x))

In [None]:
new_dataset = dataset.copy()

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: list_tosent(x))

In [None]:
dataset.head()

Unnamed: 0,text,target
0,tuser awww that bummer shoulda got david carr ...,0
1,upset cant updat facebook text might cri resul...,0
2,tuser dive mani time ball manag save 50 rest g...,0
3,whole bodi feel itchi like fire,0
4,tuser behav im mad cant see,0


## 13.Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['target'], test_size = 0.2, random_state = 42)

In [None]:
tfidf = TfidfVectorizer(max_features=500000, ngram_range=(1,3), stop_words='english')

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
X_train_tfidf.shape

(1280000, 500000)

In [None]:
for i , f in enumerate(tfidf.get_feature_names_out()):
  print(i,f)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
495000 ye dont forget
495001 ye dont know
495002 ye dont like
495003 ye dont think
495004 ye dont want
495005 ye doubt
495006 ye download
495007 ye dream
495008 ye dress
495009 ye drink
495010 ye drive
495011 ye drop
495012 ye dude
495013 ye earli
495014 ye easi
495015 ye eat
495016 ye email
495017 ye emojiwink
495018 ye end
495019 ye english
495020 ye enjoy
495021 ye especi
495022 ye everi
495023 ye everybodi
495024 ye everyon
495025 ye everyth
495026 ye exactli
495027 ye exam
495028 ye excit
495029 ye exist
495030 ye expens
495031 ye extrem
495032 ye eye
495033 ye facebook
495034 ye fact
495035 ye fail
495036 ye famili
495037 ye fan
495038 ye fantast
495039 ye far
495040 ye favorit
495041 ye favourit
495042 ye feel
495043 ye feel better
495044 ye feel like
495045 ye fell
495046 ye figur
495047 ye final
495048 ye final got
495049 ye fine
495050 ye finish
495051 ye flight
495052 ye follow
495053 ye food
495054 ye forgot
4

## 14.Apply algorithm and predict the sentiment

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred = nb_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))

0.7732625


## 15. Prediction

In [None]:
def sentiment(list_of_tweets):
  new_tweet = tfidf.transform(list_of_tweets)
  if nb_model.predict(new_tweet)==1:
    return 'Happy'

  else:
    return 'Unhappy'

In [None]:
new_tweet=['i am sad']
sentiment(new_tweet)

'Unhappy'

# Sentiment Analysis with RNN

In [None]:
dataset.head()

Unnamed: 0,text,target
0,tuser awww that bummer shoulda got david carr ...,0
1,upset cant updat facebook text might cri resul...,0
2,tuser dive mani time ball manag save 50 rest g...,0
3,whole bodi feel itchi like fire,0
4,tuser behav im mad cant see,0


In [None]:
words=set()

for data in new_dataset['text']:
  for word in data:
    words.add(word)

In [None]:
number_of_words = len(words)
number_of_words

396196

In [None]:
new_dataset['text'] = new_dataset['text'].apply(lambda x: list_tosent(x))


In [None]:
new_dataset.to_csv('/content/drive/MyDrive/AI/NLP/sentiment_analysis.csv', index = False)

## 1.Import Libraries

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
max_features = 396196

## 2.Read the Data

In [5]:
new_dataset = pd.read_csv('/content/drive/MyDrive/AI/NLP/sentiment_analysis.csv')

In [6]:
new_dataset.head(1)

Unnamed: 0,text,target
0,tuser awww that bummer shoulda got david carr ...,0


In [7]:
new_dataset['text'] = new_dataset['text'].astype('str')

In [8]:
(new_dataset['text'].head())

Unnamed: 0,text
0,tuser awww that bummer shoulda got david carr ...
1,upset cant updat facebook text might cri resul...
2,tuser dive mani time ball manag save 50 rest g...
3,whole bodi feel itchi like fire
4,tuser behav im mad cant see


In [9]:
new_dataset['text'].values

array(['tuser awww that bummer shoulda got david carr third day emojilaugh',
       'upset cant updat facebook text might cri result school today also blah',
       'tuser dive mani time ball manag save 50 rest go bound', ...,
       'readi mojo makeov ask detail',
       'happi 38th birthday boo alll tear eye tupac amaru shakur',
       'happi charitytuesday tuser tuser tuser'], dtype=object)

## 3.Tokenize with the index

In [10]:
tokenizer_keras = Tokenizer(num_words=max_features, split=' ')
tokenizer_keras.fit_on_texts(new_dataset['text'].values)
X = tokenizer_keras.texts_to_sequences(new_dataset['text'].values)
X

[[1, 385, 52, 1078, 3041, 15, 721, 7461, 1663, 5, 1820],
 [607, 13, 228, 452, 372, 212, 243, 978, 84, 11, 195, 1073],
 [1, 3658, 229, 249, 879, 711, 515, 1159, 360, 3, 2803],
 [343, 668, 25, 2552, 8, 891],
 [1, 4071, 2, 470, 13, 24],
 [1, 343, 1936],
 [32, 401],
 [1, 91, 101, 17, 14, 24, 97, 105, 176, 176, 12, 21, 2, 435, 16, 685],
 [1, 687, 62],
 [1, 2125, 114911],
 [1308, 346, 2634, 489, 1428],
 [16535, 794],
 [1, 327, 1318, 31, 154, 12488, 1380, 2822],
 [1, 776, 741, 389, 99, 123, 324],
 [1, 2003, 106, 62, 2372, 27, 73, 3543, 25177, 114912],
 [1, 48, 15, 31, 20, 1, 2089],
 [2448, 929, 1608, 139, 1452, 31, 651, 25178, 4510, 456],
 [1246, 2285],
 [1, 597, 74, 127, 19, 24, 1537, 9, 3172],
 [1, 40, 522, 281, 2328, 1465, 281],
 [1, 5, 62, 4, 42, 125],
 [22, 65, 119, 302, 222, 3251, 3173, 7194, 74, 17, 14, 529],
 [1, 1270, 643, 573],
 [59, 3, 29],
 [30555, 294, 467, 47],
 [76, 119, 321, 83],
 [3, 243, 51, 31, 3018],
 [2, 55, 114913],
 [2804, 12, 21, 7281, 116, 121, 7281, 121, 4, 470],
 [1

In [11]:
type(X)

list

In [12]:
new_dataset['text'][0]

'tuser awww that bummer shoulda got david carr third day emojilaugh'

In [13]:
tokenizer_keras.word_index

{'tuser': 1,
 'im': 2,
 'go': 3,
 'get': 4,
 'day': 5,
 'good': 6,
 'work': 7,
 'like': 8,
 'love': 9,
 'dont': 10,
 'today': 11,
 'laugh': 12,
 'cant': 13,
 'eye': 14,
 'got': 15,
 'thank': 16,
 'tear': 17,
 'back': 18,
 'want': 19,
 'miss': 20,
 'loud': 21,
 'one': 22,
 'know': 23,
 'see': 24,
 'feel': 25,
 'think': 26,
 'realli': 27,
 'well': 28,
 'hope': 29,
 'night': 30,
 'watch': 31,
 'need': 32,
 'still': 33,
 'make': 34,
 'new': 35,
 'amp': 36,
 'home': 37,
 'look': 38,
 'come': 39,
 'oh': 40,
 '2': 41,
 'much': 42,
 'last': 43,
 'twitter': 44,
 'morn': 45,
 'great': 46,
 'tomorrow': 47,
 'wish': 48,
 'wait': 49,
 'ill': 50,
 'sleep': 51,
 'that': 52,
 'haha': 53,
 'way': 54,
 'sad': 55,
 'fun': 56,
 'tri': 57,
 'right': 58,
 'week': 59,
 'follow': 60,
 'happi': 61,
 'didnt': 62,
 'bad': 63,
 'would': 64,
 'friend': 65,
 'thing': 66,
 'sorri': 67,
 'tonight': 68,
 'say': 69,
 'take': 70,
 'nice': 71,
 'gonna': 72,
 'though': 73,
 'ive': 74,
 'better': 75,
 'hate': 76,
 'even': 

In [14]:
new_dataset['target'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1600000 entries, 0 to 1599999
Series name: target
Non-Null Count    Dtype
--------------    -----
1600000 non-null  int64
dtypes: int64(1)
memory usage: 12.2 MB


In [15]:
y=pd.get_dummies(new_dataset['target']).values
y

array([[ True, False],
       [ True, False],
       [ True, False],
       ...,
       [False,  True],
       [False,  True],
       [False,  True]])

In [16]:
y[:2]

array([[ True, False],
       [ True, False]])

## 4.Pad Sequnce

In [17]:
X = pad_sequences(X)

## 5.Split Data

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [19]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1120000, 38), (480000, 38), (1120000, 2), (480000, 2))

In [20]:
valid_size = 240000
X_valid = X_test[-valid_size:]
y_valid = y_test[-valid_size:]
X_test = X_test[:-valid_size]
y_test = y_test[:-valid_size]

In [21]:
X_test.shape

(240000, 38)

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, SimpleRNN, SpatialDropout1D,GRU, Bidirectional
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

## 6.Connect with TPU

In [23]:
embed_dim = 128

In [1]:
# To detect the TPU
import tensorflow as tf
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
print("All devices: ", tf.config.list_logical_devices('TPU'))

tpu_statergy = tf.distribute.TPUStrategy(tpu)

# with tpu_statergy.scope():
#   model = Sequential()
#   model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
#   model.add(SpatialDropout1D(0.4))
#   model.add(SimpleRNN(196, dropout=0.2, recurrent_dropout=0.2))
#   model.add(Dense(2,activation='softmax', kernel_regularizer=l2(0.001)))

#   model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
#   print(model.summary())


All devices:  [LogicalDevice(name='/device:TPU:0', device_type='TPU'), LogicalDevice(name='/device:TPU:1', device_type='TPU'), LogicalDevice(name='/device:TPU:2', device_type='TPU'), LogicalDevice(name='/device:TPU:3', device_type='TPU'), LogicalDevice(name='/device:TPU:4', device_type='TPU'), LogicalDevice(name='/device:TPU:5', device_type='TPU'), LogicalDevice(name='/device:TPU:6', device_type='TPU'), LogicalDevice(name='/device:TPU:7', device_type='TPU')]


##7.Build the Model

###7.1 Simple RNN Model

In [27]:
with tpu_statergy.scope():
  model_rnn = Sequential()
  model_rnn.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
  model_rnn.add(SpatialDropout1D(0.4))
  model_rnn.add(Bidirectional(SimpleRNN(196, dropout=0.2, recurrent_dropout=0.2, return_sequences = True)))
  model_rnn.add(Bidirectional(SimpleRNN(196, dropout=0.2, recurrent_dropout=0.2)))
  model_rnn.add(Dense(2,activation='softmax', kernel_regularizer=l2(0.001)))

  model_rnn.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
  print(model_rnn.summary())


# Compile RNN Model :
from keras import callbacks

earlyStopping = callbacks.EarlyStopping(monitor='val_loss',
                                        patience=5,
                                        mode='min',
                                        restore_best_weights=True)

model_rnn.fit(X_train, y_train, epochs = 20,
          batch_size=1024,
          verbose = 1,
          validation_data=(X_valid, y_valid),
          callbacks=[earlyStopping])

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 38, 128)           50713088  
                                                                 
 spatial_dropout1d_3 (Spati  (None, 38, 128)           0         
 alDropout1D)                                                    
                                                                 
 bidirectional_2 (Bidirecti  (None, 38, 392)           127400    
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 392)               230888    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 2)                 786       
                                                      

<keras.src.callbacks.History at 0x7a510c21aaa0>

###7.2 LSTM RNN Model

In [28]:
with tpu_statergy.scope():
  model_lstm = Sequential()
  model_lstm.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
  model_lstm.add(SpatialDropout1D(0.4))
  model_lstm.add(Bidirectional(LSTM(196, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
  model_lstm.add(Bidirectional(LSTM(196, dropout=0.2, recurrent_dropout=0.2)))
  model_lstm.add(Dense(2,activation='softmax', kernel_regularizer=l2(0.001)))

  model_lstm.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
  print(model_lstm.summary())


# Compile LSTM Model :
from keras import callbacks

earlyStopping = callbacks.EarlyStopping(monitor='val_loss',
                                        patience=5,
                                        mode='min',
                                        restore_best_weights=True)

model_lstm.fit(X_train, y_train, epochs = 20,
          batch_size=1024,
          verbose = 1,
          validation_data=(X_valid, y_valid),
          callbacks=[earlyStopping])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 38, 128)           50713088  
                                                                 
 spatial_dropout1d_4 (Spati  (None, 38, 128)           0         
 alDropout1D)                                                    
                                                                 
 bidirectional_4 (Bidirecti  (None, 38, 392)           509600    
 onal)                                                           
                                                                 
 bidirectional_5 (Bidirecti  (None, 392)               923552    
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 2)                 786       
                                                      

<keras.src.callbacks.History at 0x7a4e34586350>

###7.3 GRU RNN Model

In [29]:
with tpu_statergy.scope():
  model_GRU = Sequential()
  model_GRU.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
  model_GRU.add(SpatialDropout1D(0.4))
  model_GRU.add(Bidirectional(GRU(196, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
  model_GRU.add(Bidirectional(GRU(196, dropout=0.2, recurrent_dropout=0.2)))
  model_GRU.add(Dense(2,activation='softmax', kernel_regularizer=l2(0.001)))

  model_GRU.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
  print(model_GRU.summary())



# Compile GRU Model :
from keras import callbacks

earlyStopping = callbacks.EarlyStopping(monitor='val_loss',
                                        patience=5,
                                        mode='min',
                                        restore_best_weights=True)

model_GRU.fit(X_train, y_train, epochs = 20,
          batch_size=1024,
          verbose = 1,
          validation_data=(X_valid, y_valid),
          callbacks=[earlyStopping])

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 38, 128)           50713088  
                                                                 
 spatial_dropout1d_5 (Spati  (None, 38, 128)           0         
 alDropout1D)                                                    
                                                                 
 bidirectional_6 (Bidirecti  (None, 38, 392)           383376    
 onal)                                                           
                                                                 
 bidirectional_7 (Bidirecti  (None, 392)               693840    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 2)                 786       
                                                      

<keras.src.callbacks.History at 0x7a4dfc4b8c40>

In [None]:
X_test[0].size

38

In [None]:
pred = model_rnn.predict(X_test)



In [None]:
pred

array([[0.34587303, 0.65412694],
       [0.18184012, 0.8181599 ],
       [0.09331241, 0.90668756],
       ...,
       [0.46880502, 0.531195  ],
       [0.05761576, 0.9423843 ],
       [0.71405286, 0.2859471 ]], dtype=float32)

## 8.Prediction

In [None]:
def predict_sentiment_rnn(tweet):
  '''
  Predicts the sentiment of the tweet using the trained RNN model.

  Agrs:
      tweet = The tweet as a string.

  Return:
      'Happy' if the predicted sentiment is positive, 'Unhappy' otherwise

  '''

  # Preprocess to tweet

  tweet_seq = tokenizer_keras.texts_to_sequences([tweet])
  # Use tweet_pad which has been padded to match the expected shape of the model
  tweet_pad = pad_sequences(tweet_seq, maxlen=X.shape[1]) # X.shape[1] represents the maxlen used during training

  # Predict sentiment using the RNN Model
  # Use tweet_pad as an input to the predict function instead of tweet_seq
  prediction = model_GRU.predict(tweet_pad)[0]
  sentiment_label = np.argmax(prediction)

  # Return the sentiment label
  if sentiment_label == 1:
    return 'Happy'
  else:
    return 'Unhappy'

  # Example usage:
new_tweets = [
    "This is a great day ! I' am so happy.",
    "I'm feeling really down today.",
    "This product is amazing"
]

for tweet in new_tweets:
  predicted_sentiment = predict_sentiment_rnn(tweet)
  print(f"Tweet: '{tweet}'\nPredicted Sentiment: {predicted_sentiment}\n")

Tweet: 'This is a great day ! I' am so happy.'
Predicted Sentiment: Happy

Tweet: 'I'm feeling really down today.'
Predicted Sentiment: Unhappy

Tweet: 'This product is amazing'
Predicted Sentiment: Happy

