In [24]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import warnings
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt

In [25]:
data_path = pd.read_csv('/kaggle/input/twitter-sentiment-dataset/Twitter_Data.csv')
df = data_path
print(df)

                                               clean_text  category
0       when modi promised “minimum government maximum...      -1.0
1       talk all the nonsense and continue all the dra...       0.0
2       what did just say vote for modi  welcome bjp t...       1.0
3       asking his supporters prefix chowkidar their n...       1.0
4       answer who among these the most powerful world...       1.0
...                                                   ...       ...
162975  why these 456 crores paid neerav modi not reco...      -1.0
162976  dear rss terrorist payal gawar what about modi...      -1.0
162977  did you cover her interaction forum where she ...       0.0
162978  there big project came into india modi dream p...       0.0
162979  have you ever listen about like gurukul where ...       1.0

[162980 rows x 2 columns]


In [26]:
df['clean_text'].fillna('', inplace=True)
all_tweets = ' '.join(df['clean_text'])
# Tokenize words
tokenized_text = [word_tokenize(text.lower()) for text in df['clean_text']]

# Word2Vec model
model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Most similar words
similar_words = model.wv.most_similar('happy', topn=5)

# Define input and target variables
X = df['clean_text']
y = df['category']

# Encode target variable
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = utils.to_categorical(y)

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Max sequence length
max_seq_length = max([len(seq) for seq in sequences])

# Pad sequences
X_pad = pad_sequences(sequences, maxlen=max_seq_length)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)


In [27]:
from tensorflow.keras import layers, backend as K, initializers
from keras import backend as K
from keras.layers import Layer

def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale

class CapsuleLayer(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, **kwargs):
        super(CapsuleLayer, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings

    def build(self, input_shape):
        self.W = self.add_weight(name='capsule_kernel', 
                                 shape=(input_shape[1], self.num_capsule * self.dim_capsule),
                                 initializer='glorot_uniform',
                                 trainable=True)

    def call(self, inputs):
        inputs_expand = K.expand_dims(inputs, 1)
        inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])
        inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W), elems=inputs_tiled)
        inputs_hat = K.reshape(inputs_hat, (-1, inputs.shape[1], self.num_capsule, self.dim_capsule))

        b = K.zeros_like(inputs_hat[:, :, :, 0])  # Shape [batch, input_length, num_capsule]

        for i in range(self.routings):
            c = K.softmax(b, axis=-1)
            c_expand = K.expand_dims(c, -1)
            c_tiled = K.tile(c_expand, [1, 1, 1, self.dim_capsule])
            outputs = squash(K.sum(c_tiled * inputs_hat, axis=2))  # Sum across the input_length axis
            if i < self.routings - 1:
                b += K.sum(inputs_hat * K.expand_dims(outputs, 2), axis=-1)

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, input_shape[1], self.num_capsule, self.dim_capsule)

In [28]:
vocab_size = 20000  # example vocab size
max_seq_length = 500  # e
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_shape=(max_seq_length,)))
model.add(SpatialDropout1D(0.5))
model.add(Bidirectional(LSTM(units=128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(CapsuleLayer(num_capsule=10,dim_capsule=16,routings=3))
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

ValueError: Kernel shape must have the same length as input, but received kernel of shape (3, 16, 64) and input of shape (None, 500, 10, 16).

In [None]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.001)
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.1, callbacks=[reduce_lr, early_stop])