In [5]:
import os
from sklearn.model_selection import train_test_split
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras import layers

In [6]:
raw_data_folder='sentiment labelled sentences'

In [7]:
yelp_filepath=os.path.join(raw_data_folder,"yelp_labelled.txt")

In [8]:
import pandas as pd
df=pd.read_csv(yelp_filepath,names=['sentence','label'],sep='\t')
df.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [9]:
sentences=df["sentence"].values
y=df["label"].values

In [10]:
sentences_train,sentences_test,y_train,y_test=train_test_split(
    sentences,y,test_size=0.25,random_state=1000
)

In [11]:
tokenizer=Tokenizer(num_words=5000)

In [12]:
#Tokens formed from raw sentences
tokenizer.fit_on_texts(sentences)

#Vocabulary
tokenizer.index_word

{1: 'the',
 2: 'and',
 3: 'i',
 4: 'was',
 5: 'a',
 6: 'to',
 7: 'is',
 8: 'this',
 9: 'it',
 10: 'of',
 11: 'food',
 12: 'not',
 13: 'for',
 14: 'in',
 15: 'place',
 16: 'good',
 17: 'service',
 18: 'we',
 19: 'very',
 20: 'my',
 21: 'with',
 22: 'great',
 23: 'had',
 24: 'that',
 25: 'be',
 26: 'so',
 27: 'were',
 28: 'are',
 29: 'but',
 30: 'have',
 31: 'back',
 32: 'you',
 33: 'here',
 34: 'they',
 35: 'on',
 36: 'at',
 37: 'like',
 38: 'go',
 39: 'all',
 40: 'time',
 41: 'our',
 42: 'will',
 43: 'there',
 44: 'as',
 45: 'really',
 46: 'just',
 47: 'an',
 48: 'their',
 49: 'if',
 50: 'best',
 51: 'would',
 52: 'ever',
 53: 'also',
 54: 'friendly',
 55: 'up',
 56: 'only',
 57: 'never',
 58: 'one',
 59: 'no',
 60: 'your',
 61: 'restaurant',
 62: 'out',
 63: 'nice',
 64: "don't",
 65: 'been',
 66: 'what',
 67: 'amazing',
 68: 'again',
 69: 'from',
 70: 'delicious',
 71: 'vegas',
 72: 'did',
 73: 'by',
 74: 'which',
 75: 'pretty',
 76: 'some',
 77: 'me',
 78: 'came',
 79: 'when',
 80: 

In [13]:
X_train=tokenizer.texts_to_sequences(sentences_train)
X_test=tokenizer.texts_to_sequences(sentences_test)

#Adding 1 because of reserved 0 index for padding
vocab_size=len(tokenizer.word_index)+1

#padding the text
maxlen=100

X_train=pad_sequences(X_train,padding="post",maxlen=maxlen)
X_test=pad_sequences(X_test,padding="post",maxlen=maxlen)

In [14]:
def create_embedding_matrix(filepath,word_index,embedding_dim):
    vocab_size=len(word_index)+1 #Adding 1 because of reserved 0 index
    embedding_matrix=np.zeros((vocab_size,embedding_dim))
    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx=word_index[word]
                embedding_matrix[idx]=np.array(
                    vector,dtype=np.float32)[:embedding_dim]
    return embedding_matrix

In [15]:
embedding_dim=50
embedding_matrix=create_embedding_matrix('../glove.6B.50d.txt',tokenizer.word_index,embedding_dim)
embedding_matrix.shape

(2072, 50)

In [16]:
nonzero_elements=np.count_nonzero(np.count_nonzero(embedding_matrix,axis=1))
nonzero_elements/vocab_size

0.9507722007722008

In [17]:
#Building the network
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size,embedding_dim,weights=[embedding_matrix],trainable=False))

2025-10-01 23:03:03.459733: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-10-01 23:03:03.460234: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-10-01 23:03:03.460537: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-10-01 23:03:03.460597: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-01 23:03:03.460968: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [18]:
model.add(tf.keras.layers.GlobalMaxPool1D()) #To downsample the incoming feature vectors
model.add(tf.keras.layers.Dense(10,activation='relu'))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [19]:
history=model.fit(X_train,y_train,
                  epochs=10,
                  verbose=False,
                  validation_data=(X_test,y_test),
                  batch_size=10)

2025-10-01 23:03:30.789663: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [20]:
loss,accuracy=model.evaluate(X_train,y_train,verbose=False)
print("Training accuracy: {:.4f}".format(accuracy))

Training accuracy: 0.6813


In [21]:
loss,accuracy=model.evaluate(X_test,y_test,verbose=False)
print("Testing accuracy: {:.4f}".format(accuracy))

Testing accuracy: 0.6480


In [22]:
#Additionally training the word embeddings
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size,embedding_dim,weights=[embedding_matrix],trainable=True))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dense(10,activation="relu"))
model.add(tf.keras.layers.Dense(1,activation="sigmoid"))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy']
             )
model.summary()

In [23]:
history=model.fit(X_train,y_train,
                  epochs=50,
                  verbose=False,
                  validation_data=(X_test,y_test),
                  batch_size=10)

In [24]:
loss,accuracy=model.evaluate(X_train,y_train,verbose=False)
print("Training accuracy {:.4f}".format(accuracy))
loss,accuracy=model.evaluate(X_test,y_test,verbose=False)
print("Testing accuracy {:.4f}".format(accuracy))

Training accuracy 1.0000
Testing accuracy 0.7840
