In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import tensorflow as tf
from nltk.corpus import stopwords
import keras.backend as K
from keras.models import Sequential
import string
import re

from tensorflow.keras.preprocessing import sequence
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense,Input,Embedding,Dropout,Conv1D,MaxPooling1D,GlobalMaxPooling1D,Dropout,Bidirectional,Flatten,BatchNormalization,SimpleRNN,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
#import transformers
#import tokenizers

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
from transformers import pipeline

from sklearn.metrics import accuracy_score
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
from tensorflow.keras.layers import Concatenate

In [3]:
# The column names are supplied explicitly via `names=` rather than read from the file.
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
DATASET_ENCODING = "ISO-8859-1"

data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Report the number of rows loaded into `data`.
print('length of data is', len(data))

length of data is 1600000


In [5]:
# Summarise `data`: per-column dtype, non-null count and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [6]:
# Count missing values per column.
data.isna().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [7]:
# Count rows per distinct `target` value to check class balance.
data.target.value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [8]:
# Relabel the positive class from 4 to 1 so `target` is a 0/1 binary label.
data['target'] = data['target'].replace(4,1)

# Keep only the columns used for modelling. The explicit .copy() makes the
# result an independent frame, so later column assignments (data['text'] = ...)
# act on a real DataFrame rather than on a slice derived from the original.
data = data[['text','target']].copy()
data.target.value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [9]:
# Lower-case stop words to drop from the tweets before tokenisation.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

# Set form of the list for O(1) membership tests.
STOPWORDS = set(stopwordlist)

def cleaning_stopwords(text):
    """Remove stop words from ``text``.

    The input is converted with ``str()`` and split on whitespace. A token is
    dropped when its lower-cased form is in ``STOPWORDS``; because every entry
    in the list is lower-case, this also removes capitalised occurrences such
    as "The" or "I". Surviving tokens keep their original casing and are joined
    with single spaces.

    Parameters
    ----------
    text : object
        Text to filter; non-string values are stringified first.

    Returns
    -------
    str
        The remaining tokens joined by a single space ("" if none remain).
    """
    return " ".join(
        word for word in str(text).split() if word.lower() not in STOPWORDS
    )

def clean_text(s):
    """Strip URLs, retweet markers, @-mentions and the ``&amp;`` entity from a tweet.

    The substitutions run in this order:

    1. anything starting with ``http`` up to the next whitespace is removed;
    2. ``RT``/``via`` followed by one or more ``@name`` mentions is replaced
       by a single space;
    3. any remaining ``@...`` token is removed;
    4. ``&amp`` together with its optional trailing ``;`` is replaced by a
       single space, so no stray ``;`` is left in the text.

    Whitespace is not collapsed afterwards.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'(RT|via)((?:\b\W*@\w+)+)', ' ', s)
    s = re.sub(r'@\S+', '', s)
    # ';?' consumes the entity terminator as well as the bare '&amp' form.
    s = re.sub(r'&amp;?', ' ', s)
    return s

# Stop-word removal runs first, then URL / retweet / mention / '&amp' stripping.
data['text'] = data['text'].apply(cleaning_stopwords).apply(clean_text)

In [10]:
#Train test split
x = data.text
y = data.target

from sklearn.model_selection import train_test_split

# First split: hold out 5% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.05,
    shuffle=True,
    random_state=8,
)

# Second split: carve 10% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.1,
    random_state=8,
)

In [11]:
# Keep the raw test texts in a DataFrame; the tokenisation cell later replaces
# X_test with integer sequences, so this is the only copy of the original strings.
X_test_df = pd.DataFrame(X_test) #for vader and transformer

In [12]:
#tokenize
max_features = 40000
tokenizer = Tokenizer(num_words=max_features)

# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(list(X_train))

# Convert every split from raw text to lists of integer word ids.
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

In [13]:
#sequencing for lstm
max_words = 100

# Bring every sequence to exactly max_words ids, padding at the end ('post').
X_train, X_val, X_test = (
    sequence.pad_sequences(seqs, maxlen=max_words, padding='post')
    for seqs in (X_train, X_val, X_test)
)

In [14]:
#Evaluating proper format and shape
# Convert the label containers to NumPy arrays for Keras.
y_train, y_val, y_test = (np.array(labels) for labels in (y_train, y_val, y_test))
print(X_train.shape,X_val.shape, X_test.shape)

(1368000, 100) (152000, 100) (80000, 100)


In [17]:
# CNN + LSTM binary sentiment classifier over padded token-id sequences.
model = Sequential()

# The embedding vocabulary size is tied to the Tokenizer's num_words
# (max_features) so the two cannot drift apart; 10 is the embedding width.
model.add(Embedding(max_features, 10, input_length=X_train.shape[1]))

# Five identical Conv1D -> MaxPooling1D stages; each pooling stage halves the
# time axis (100 -> 50 -> 25 -> ...).
for conv_stage in range(5):
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))

# NOTE(review): a non-zero recurrent_dropout is documented to rule out the
# fused cuDNN LSTM kernel — confirm whether training speed matters here.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# Single sigmoid unit: probability of the positive class (target == 1).
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 10)           400000    
                                                                 
 conv1d (Conv1D)             (None, 100, 32)           992       
                                                                 
 max_pooling1d (MaxPooling1  (None, 50, 32)            0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 50, 32)            3104      
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 25, 32)            0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 25, 32)            3

In [18]:
# Train for 10 epochs, reporting loss/accuracy on the validation split after each one.
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_data=(X_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x23c9b446a10>

In [19]:
# Score the trained model on the held-out test split; `result` is
# [loss, accuracy], matching the loss and metric given to compile().
result = model.evaluate(X_test,y_test)
result



[0.4903734028339386, 0.8018875122070312]

In [21]:
import nltk

In [23]:
# Fetch the 'vader_lexicon' NLTK resource.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\samar\AppData\Roaming\nltk_data...


True

In [15]:
class MySentences(object):
    """Re-iterable corpus: one space-split token list per line of a text file.

    Every call to ``iter()`` re-reads ``data_path`` from disk and yields the
    same sentences, so the object can be iterated any number of times (for
    example once per training pass) without its contents growing.

    Attributes
    ----------
    data_path : str
        Path of the text file to read.
    sentences : list[list[str]]
        Token lists produced by the most recent iteration (empty before the
        first one).
    """

    def __init__(self, data_path):
        self.data_path = data_path
        self.sentences = []

    def __iter__(self):
        """Read the file and return an iterator over its tokenised lines."""
        with open(self.data_path, 'r') as f:
            # Rebuild the list from scratch on every pass instead of appending
            # to the previous one; appending would duplicate the whole corpus
            # each time the object is iterated again.
            self.sentences = [line.strip().split(' ') for line in f]

        return iter(self.sentences)

In [16]:
# Corpus iterable over 'sentiment140.csv'; construction only stores the path,
# the file is opened when the object is iterated.
my_sentences = MySentences('sentiment140.csv')

In [17]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [18]:
import logging, os

In [19]:
# Input GloVe vectors file and the output path for its word2vec-format conversion.
glove_file = "glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.word2vec"

In [20]:
# Convert the GloVe file to word2vec format, writing word2vec_output_file.
# NOTE(review): newer gensim releases reportedly deprecate this script in favour of
# KeyedVectors.load_word2vec_format(..., no_header=True) — confirm against the installed version.
glove2word2vec(glove_file, word2vec_output_file)

(400000, 100)

In [21]:
# Load the converted vectors from the non-binary (binary=False) word2vec file.
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [22]:
def word2vector(embedding_size=50,window_size=5,training_epochs=5,initial_lr=0.025,min_lr=0.0001, data_path='sentiment140.csv'):
    """Train skip-gram word vectors on a text file and save the model.

    The corpus is read through ``MySentences(data_path)`` (one space-split
    sentence per line). The trained model is written to
    ``word2vec/model-<embedding_size>``; the ``word2vec`` directory is created
    if it does not exist.

    Parameters
    ----------
    embedding_size : int
        Dimensionality of the word vectors.
    window_size : int
        Context window size.
    training_epochs : int
        Number of passes over the corpus.
    initial_lr : float
        Starting learning rate (``alpha``).
    min_lr : float
        Final learning rate (``min_alpha``).
    data_path : str
        Path of the corpus file.

    Returns
    -------
    gensim.models.word2vec.Word2Vec
        The trained model (also saved to disk).
    """
    # Local imports: the notebook only imports KeyedVectors and glove2word2vec
    # from gensim, so the `word2vec` module is not otherwise in scope.
    import gensim
    from gensim.models import word2vec

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`; pick the
    # spelling the installed major version accepts.
    if int(gensim.__version__.split('.')[0]) >= 4:
        version_kwargs = {'vector_size': embedding_size, 'epochs': training_epochs}
    else:
        version_kwargs = {'size': embedding_size, 'iter': training_epochs}

    model = word2vec.Word2Vec(MySentences(data_path),
                              window=window_size,
                              alpha=initial_lr, min_alpha=min_lr,
                              sg=1, min_count=2, workers=4, hs=0, negative=10,
                              **version_kwargs)

    # Create the target directory up front so save() cannot fail on a missing folder.
    os.makedirs("word2vec", exist_ok=True)
    model_path = os.path.join("word2vec", "model-" + str(embedding_size))
    model.save(model_path)
    return model

In [23]:
import numpy as np

embedding_dim = 100
max_features = 40000  # Number of unique words

# Row r holds the vector for the word with tokenizer index r; rows that are
# never assigned stay all-zero.
embedding_matrix = np.zeros((max_features, embedding_dim))

for token, row in tokenizer.word_index.items():
    # Only indices below max_features have a row in the matrix.
    if row >= max_features:
        continue
    try:
        vector = glove_model[token]
    except KeyError:
        # No GloVe entry for this word: fall back to a random N(0, 1) vector.
        vector = np.random.normal(0, 1, embedding_dim)
    embedding_matrix[row] = vector


In [78]:
import tensorflow as tf
import os
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, Bidirectional, Attention, Concatenate, Flatten

In [79]:
class HANModel(Model):
    """Two stacked attention-augmented BiLSTM stages followed by a dense classifier.

    Pipeline implemented by ``call``:

    1. frozen embedding lookup initialised from ``embedding_matrix``;
    2. "word" stage: BiLSTM -> self-attention over its outputs -> concatenate
       BiLSTM outputs with the attention output -> tanh projection;
    3. "sentence" stage: the same pattern applied to the word-stage output;
    4. mean over the time axis -> Dense(128) -> Dense(64) -> softmax.

    Parameters
    ----------
    max_words : int
        Sequence length passed to the embedding layer as ``input_length``.
    max_sentences : int
        Stored on the instance; not read by ``call``.
    embedding_matrix : array of shape (vocab_size, embedding_dim)
        Initial (non-trainable) embedding weights.
    word_hidden_units : int
        Units per direction of the word-stage LSTM and width of its projection.
    sentence_hidden_units : int
        Units per direction of the sentence-stage LSTM and width of its projection.
    num_classes : int
        Size of the softmax output.

    NOTE(review): padding positions are neither masked in the attention layers
    nor excluded from the final mean — confirm this is intended.
    """

    def __init__(self, max_words, max_sentences, embedding_matrix, word_hidden_units, sentence_hidden_units, num_classes):
        super(HANModel, self).__init__()

        self.max_words = max_words
        self.max_sentences = max_sentences
        self.word_hidden_units = word_hidden_units
        self.sentence_hidden_units = sentence_hidden_units

        # Word-level attention
        self.word_attention = Attention(use_scale=True)
        self.word_context = Dense(word_hidden_units, activation='tanh')

        # Sentence-level attention
        self.sentence_attention = Attention(use_scale=True)
        self.sentence_context = Dense(sentence_hidden_units, activation='tanh')

        # Word embedding layer (weights taken from embedding_matrix and frozen)
        self.embedding = Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=max_words,
            trainable=False,
        )

        # Word-level LSTM
        self.word_lstm = Bidirectional(LSTM(word_hidden_units, return_sequences=True))

        # Sentence-level LSTM
        self.sentence_lstm = Bidirectional(LSTM(sentence_hidden_units, return_sequences=True))

        # Fully connected layers for classification
        self.fc1 = Dense(128, activation='relu')
        self.fc2 = Dense(64, activation='relu')
        self.output_layer = Dense(num_classes, activation='softmax')

        # Merge layers are built once here rather than on every forward pass.
        self.word_concat = Concatenate(axis=-1)
        self.sentence_concat = Concatenate(axis=-1)

    def call(self, inputs):
        """Map a batch of token-id sequences to class probabilities.

        Parameters
        ----------
        inputs : integer tensor
            Token ids fed to the embedding layer.

        Returns
        -------
        Tensor with ``num_classes`` softmax probabilities per example.
        """
        # Word stage: embed, encode, then attend over the encoder's own outputs
        # (query and value are the same tensor).
        word_states = self.embedding(inputs)
        word_states = self.word_lstm(word_states)

        word_attended = self.word_attention([word_states, word_states])
        word_states = self.word_concat([word_states, word_attended])
        word_states = self.word_context(word_states)

        # Sentence stage: same pattern on top of the word-stage projection.
        sentence_states = self.sentence_lstm(word_states)

        sentence_attended = self.sentence_attention([sentence_states, sentence_states])
        sentence_states = self.sentence_concat([sentence_states, sentence_attended])
        sentence_states = self.sentence_context(sentence_states)

        # Classification head: average over the time axis, then two dense
        # layers and the softmax output.
        pooled = tf.reduce_mean(sentence_states, axis=1)
        x = self.fc1(pooled)
        x = self.fc2(x)
        outputs = self.output_layer(x)

        return outputs

In [80]:
# Hyperparameters for the HANModel built in the next cell.
max_words = 100  # sequence length; same value the inputs were padded to
max_sentences = 10  # stored by HANModel but not read in its call()
embedding_dim = 100  # not passed to HANModel; the embedding width comes from embedding_matrix
word_hidden_units = 64  # units per direction of the word-level BiLSTM
sentence_hidden_units = 64  # units per direction of the sentence-level BiLSTM
num_classes = 2  # size of the softmax output layer

In [81]:
# Build the model; arguments are passed by keyword so the mapping to the
# constructor parameters is explicit.
han_model = HANModel(
    max_words=max_words,
    max_sentences=max_sentences,
    embedding_matrix=embedding_matrix,
    word_hidden_units=word_hidden_units,
    sentence_hidden_units=sentence_hidden_units,
    num_classes=num_classes,
)

In [82]:
# Sparse categorical cross-entropy: labels are integer class ids, the model
# emits one softmax probability per class.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
han_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [83]:
# Train for 10 epochs, scoring the validation split after each one.
han_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22b0effa890>

In [85]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the accuracy metric configured in compile().
test_loss, test_accuracy = han_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.8167999982833862
