# Motivation and State of the Art
___

# Installing libraries
___

In [1]:
!pip install gensim --upgrade
!pip install tensorflow
!pip install numpy
!pip install keras --upgrade
!pip install pandas --upgrade
!pip install nltk
!pip install seaborn
!pip install mlxtend
!pip install textblob



In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob
import re
import string
import tensorflow
from sklearn.metrics  import classification_report ,confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/c2210475029/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/c2210475029/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2025-01-04 17:18:31.279432: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736011111.297696  554666 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736011111.303268  554666 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-04 17:18:31.321229: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructi

# Get the data
___

In [3]:
data = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", engine="python")
data.columns = ["label", "id", "date", "query", "username", "text"]

In [4]:
data.head()

Unnamed: 0,label,id,date,query,username,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Only the 'text' and 'label' columns are necessary.

In [5]:
data=data[['text','label']]

In [6]:
data.head()

Unnamed: 0,text,label
0,is upset that he can't update his Facebook by ...,0
1,@Kenichan I dived many times for the ball. Man...,0
2,my whole body feels itchy and like its on fire,0
3,"@nationwideclass no, it's not behaving at all....",0
4,@Kwesidei not the whole crew,0


In [7]:
data.columns

Index(['text', 'label'], dtype='object')

In [8]:
print('data length:', len(data))

data length: 1599999


In [9]:
data.shape

(1599999, 2)

In [10]:
data.info

<bound method DataFrame.info of                                                       text  label
0        is upset that he can't update his Facebook by ...      0
1        @Kenichan I dived many times for the ball. Man...      0
2          my whole body feels itchy and like its on fire       0
3        @nationwideclass no, it's not behaving at all....      0
4                            @Kwesidei not the whole crew       0
...                                                    ...    ...
1599994  Just woke up. Having no school is the best fee...      4
1599995  TheWDB.com - Very cool to hear old Walt interv...      4
1599996  Are you ready for your MoJo Makeover? Ask me f...      4
1599997  Happy 38th Birthday to my boo of alll time!!! ...      4
1599998  happy #charitytuesday @theNSPCC @SparksCharity...      4

[1599999 rows x 2 columns]>

In [11]:
data.dtypes

text     object
label     int64
dtype: object

Label '4' gets changed to label '1'.

In [12]:
data.loc[data['label']==4, 'label'] = 1

In [13]:
data.info

<bound method DataFrame.info of                                                       text  label
0        is upset that he can't update his Facebook by ...      0
1        @Kenichan I dived many times for the ball. Man...      0
2          my whole body feels itchy and like its on fire       0
3        @nationwideclass no, it's not behaving at all....      0
4                            @Kwesidei not the whole crew       0
...                                                    ...    ...
1599994  Just woke up. Having no school is the best fee...      1
1599995  TheWDB.com - Very cool to hear old Walt interv...      1
1599996  Are you ready for your MoJo Makeover? Ask me f...      1
1599997  Happy 38th Birthday to my boo of alll time!!! ...      1
1599998  happy #charitytuesday @theNSPCC @SparksCharity...      1

[1599999 rows x 2 columns]>

In [14]:
data["label"].value_counts()

label
1    800000
0    799999
Name: count, dtype: int64

In [15]:
data_pos = data[data['label'] == 1]
data_neg = data[data['label'] == 0]

## Dataset reduction for testing

In [16]:
data_pos = data_pos.iloc[:int(70000)]
data_neg = data_neg.iloc[:int(70000)]
data["label"].value_counts()

label
1    800000
0    799999
Name: count, dtype: int64

In [17]:
data = pd.concat([data_pos, data_neg])

# Text cleaning
___

In [18]:
data.info

<bound method DataFrame.info of                                                      text  label
799999       I LOVE @Health4UandPets u guys r the best!!       1
800000  im meeting up with one of my besties tonight! ...      1
800001  @DaRealSunisaKim Thanks for the Twitter add, S...      1
800002  Being sick can be really cheap when it hurts t...      1
800003    @LovesBrooklyn2 he has that effect on everyone       1
...                                                   ...    ...
69995                                            Wow....       0
69996   @Dainfam0uszd  NO UR NOT.....U R MY ONLY P.E.O...      0
69997   I will be so much happier when I don't have to...      0
69998   @mariancall buy it and bring it up here I woul...      0
69999   @jenniferrr_gee we haven't had it yet  he's 't...      0

[140000 rows x 2 columns]>

Abbreviations are changed to whole words.

In [19]:
chat_word = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': "For What It's Worth",
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you (also a chat program)',
    'ILU': 'ILU: I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My A.. Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'PITA': 'Pain In The A..',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A.. Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'R': 'ARE',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The F...',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait...',
    '7K': 'Sick:-D Laugher',
    'TFW': 'That feeling when',
    'MFW': 'My face when',
    'MRW': 'My reaction when',
    'IFYP': 'I feel your pain',
    'TNTL': 'Trying not to laugh',
    'JK': 'Just kidding',
    'IDC': "I don't care",
    'ILY': 'I love you',
    'IMU': 'I miss you',
    'ADIH': 'Another day in hell',
    'ZZZ': 'Sleeping, bored, tired',
    'WYWH': 'Wish you were here',
    'TIME': 'Tears in my eyes',
    'BAE': 'Before anyone else',
    'FIMH': 'Forever in my heart',
    'BSAAW': 'Big smile and a wink',
    'BWL': 'Bursting with laughter',
    'BFF': 'Best friends forever',
    'CSL': "Can't stop laughing"
}

In [20]:
def short_conv(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_word:
            new_text.append(chat_word[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)
data['text'] = data['text'].apply(lambda text: short_conv(text))
data['text'].head()

799999      I LOVE @Health4UandPets You guys ARE the best!!
800000    im meeting up with one of my besties tonight! ...
800001    @DaRealSunisaKim Thanks for the Twitter add, S...
800002    Being sick can be really cheap when it hurts t...
800003       @LovesBrooklyn2 he has that effect on everyone
Name: text, dtype: object

Spelling Correction

In [21]:
#data['text'] = data['text'].apply(lambda text: TextBlob(text).correct().string)

In [22]:
#data['text'].head()

LOWERCASE

In [23]:
data['text']=data['text'].str.lower()
data['text'].head()

799999      i love @health4uandpets you guys are the best!!
800000    im meeting up with one of my besties tonight! ...
800001    @darealsunisakim thanks for the twitter add, s...
800002    being sick can be really cheap when it hurts t...
800003       @lovesbrooklyn2 he has that effect on everyone
Name: text, dtype: object

In [24]:
stopwords_list = stopwords.words('english')
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [25]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text))
data['text'].head()

799999                    love @health4uandpets guys best!!
800000    im meeting one besties tonight! cant wait!! - ...
800001    @darealsunisakim thanks twitter add, sunisa! g...
800002    sick really cheap hurts much eat real food plu...
800003                      @lovesbrooklyn2 effect everyone
Name: text, dtype: object

REMOVING USERNAME, URL, HASHTAGS and Non-Alphabets

In [26]:
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = r"@[^\s]+"
hastagPattern = r"#[^\s]+"
alphaPattern = r"[^a-zA-Z]"

In [27]:
data['text'] = data['text'].apply(lambda text: re.sub(userPattern,'', text))
data['text'] = data['text'].apply(lambda text: re.sub(hastagPattern,'',text))
data['text'] = data['text'].apply(lambda text: re.sub(urlPattern,'',text))
data['text'] = data['text'].apply(lambda text: re.sub(alphaPattern,' ', text))
data['text'].head()

799999                                    love  guys best  
800000    im meeting one besties tonight  cant wait     ...
800001     thanks twitter add  sunisa  got meet hin show...
800002    sick really cheap hurts much eat real food plu...
800003                                      effect everyone
Name: text, dtype: object

REMOVE PUNCTUATIONS

In [28]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [29]:
data['text']= data['text'].apply(lambda x: cleaning_punctuations(x))
data['text'].head()

799999                                    love  guys best  
800000    im meeting one besties tonight  cant wait     ...
800001     thanks twitter add  sunisa  got meet hin show...
800002    sick really cheap hurts much eat real food plu...
800003                                      effect everyone
Name: text, dtype: object

Removing Sequences of 3 or more Identical Characters

In [30]:
sequencePattern = r"(.)\1\1+"
seqReplacePattern = r"\1\1"
def cleaning_repeating_char(text):
    return re.sub(sequencePattern, seqReplacePattern, text)

In [31]:
data['text'] = data['text'].apply(lambda x: cleaning_repeating_char(x))
data['text'].head()

799999                                    love  guys best  
800000    im meeting one besties tonight  cant wait  gir...
800001     thanks twitter add  sunisa  got meet hin show...
800002    sick really cheap hurts much eat real food plu...
800003                                      effect everyone
Name: text, dtype: object

TOKENIZATION

In [32]:
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)
data['text'].head()

799999                                   [love, guys, best]
800000    [im, meeting, one, besties, tonight, cant, wai...
800001    [thanks, twitter, add, sunisa, got, meet, hin,...
800002    [sick, really, cheap, hurts, much, eat, real, ...
800003                                   [effect, everyone]
Name: text, dtype: object

STEMMING

In [33]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data

data['text']= data['text'].apply(lambda x: stemming_on_text(x))

LEMMATIZATION

In [34]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

data['text'] = data['text'].apply(lambda x: lemmatizer_on_text(x))
data['text'].head()

799999                                   [love, guys, best]
800000    [im, meeting, one, besties, tonight, cant, wai...
800001    [thanks, twitter, add, sunisa, got, meet, hin,...
800002    [sick, really, cheap, hurts, much, eat, real, ...
800003                                   [effect, everyone]
Name: text, dtype: object

CREATING TRAINING,VALIDATION SET

In [35]:
X=data.text
y=data.label

In [36]:
max_len = 500
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
sequences_matrix.shape

(140000, 500)

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, y, test_size=0.3, random_state=2)

MODEL

In [38]:
model = Sequential()
model.add(Embedding(2000,50,input_length=max_len))

# Add the first LSTM layer
model.add(LSTM(100, return_sequences=True,recurrent_dropout=0.2,dropout=0.2))  # Return sequences to pass to the next LSTM layer
model.add(Dropout(0.3))  # Dropout to prevent overfitting

# Add a second LSTM layer
model.add(LSTM(100, return_sequences=True,recurrent_dropout=0.2,dropout=0.2))
model.add(Dropout(0.3))

# Add a third LSTM layer
model.add(LSTM(100,recurrent_dropout=0.2,dropout=0.2))
model.add(Dropout(0.3))

# Add a fully connected layer
model.add(Dense(64, activation='relu'))  # Intermediate dense layer for more learning capacity
model.add(BatchNormalization())
model.add(Dropout(0.3))  # Dropout for dense layer

# Output layer
model.add(Dense(1, activation='sigmoid'))  # For classification

# Compile the model
model.compile(
    loss='binary_crossentropy',  # Use sparse if labels are integers
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()

W0000 00:00:1736011149.234699  554666 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [39]:
checkpoint = EarlyStopping(monitor='val_accuracy',patience = 10 ,mode='max')
callbacks_lst = [checkpoint]

MODEL TRAINING

In [40]:
print("Training on GPU...") if tensorflow.config.list_physical_devices('GPU') else print("Training on CPU...")

Training on CPU...


In [None]:
history = model.fit(X_train,Y_train,batch_size=128,epochs=10, validation_split=0.1, callbacks=callbacks_lst)

Epoch 1/10
[1m197/690[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m15:23[0m 2s/step - accuracy: 0.5357 - loss: 0.6895

In [None]:

fig_loss = plt.figure()
plt.plot(history.history['loss'], color='teal', label='loss')
plt.plot(history.history['val_loss'], color='orange', label='val_loss')
fig_loss.suptitle('Loss', fontsize=20)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc="upper left")
plt.show()

In [None]:

fig_accuracy = plt.figure()
plt.plot(history.history['accuracy'], color='blue', label='accuracy')
plt.plot(history.history['val_accuracy'], color='green', label='val_accuracy')
fig_accuracy.suptitle('Accuracy', fontsize=20)
plt.legend(loc="upper left")
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()

In [None]:
eval = model.evaluate(X_test,Y_test)

In [None]:
print(eval)

In [None]:
model.save('model_d30000_b128_e7.keras')

In [None]:
Y_pred = model.predict(X_test) #getting predictions on the trained model

In [None]:
print(Y_pred)