In [4]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers import LSTM

from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

### Read data

In [6]:
data = pd.read_csv('/kaggle/input/imdb-sentiment/imdb_labelled.txt', header = None, delimiter='\t')

In [7]:
data.columns = ['Text', 'Label']

In [8]:
data.head()

Unnamed: 0,Text,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [9]:
data.Label.unique()

array([0, 1])

In [10]:
data.shape

(748, 2)

In [11]:
pos = []
neg = []
for l in data.Label:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)

In [12]:
data['Pos']= pos
data['Neg']= neg

In [13]:
data.head()

Unnamed: 0,Text,Label,Pos,Neg
0,"A very, very, very slow-moving, aimless movie ...",0,0,1
1,Not sure who was more lost - the flat characte...,0,0,1
2,Attempting artiness with black & white and cle...,0,0,1
3,Very little music or anything to speak of.,0,0,1
4,The best scene in the movie was when Gerardo i...,1,1,0


### Clean data

In [14]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', text)
    return text_nopunct

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))

In [15]:
from nltk import word_tokenize, WordNetLemmatizer
tokens = [word_tokenize(sen) for sen in data.Text_Clean] 

In [16]:
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens] 

In [17]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

In [18]:
def remove_stop_words(tokens): 
    return [word for word in tokens if word not in stoplist]

In [19]:
filtered_words = [remove_stop_words(sen) for sen in lower_tokens] 

In [20]:
result = [' '.join(sen) for sen in filtered_words] 

In [21]:
data['Text_Final'] = result

In [22]:
data['tokens'] = filtered_words

In [23]:
data = data[['Text_Final', 'tokens', 'Label', 'Pos', 'Neg']]

In [24]:
data[:4]

Unnamed: 0,Text_Final,tokens,Label,Pos,Neg
0,slowmoving aimless movie distressed drifting y...,"[slowmoving, aimless, movie, distressed, drift...",0,0,1
1,sure lost flat characters audience nearly half...,"[sure, lost, flat, characters, audience, nearl...",0,0,1
2,attempting artiness black white clever camera ...,"[attempting, artiness, black, white, clever, c...",0,0,1
3,little music anything speak,"[little, music, anything, speak]",0,0,1


### Split data into test and train

In [25]:
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [26]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

7218 words total, with a vocabulary size of 2881
Max sentence length is 789


In [27]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

580 words total, with a vocabulary size of 457
Max sentence length is 24


### Load Google News Word2Vec model

In [33]:
word2vec_path = '/kaggle/input/googlenewsvectors/GoogleNews-vectors-negative300.bin'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [34]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

### Get Embeddings

In [35]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [36]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

### Tokenize and Pad sequences

In [37]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 2881 unique tokens.


In [38]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [39]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(2882, 300)


In [40]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### Define CNN

In [41]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [42]:
label_names = ['Pos', 'Neg']

In [43]:
y_train = data_train[label_names].values

In [44]:
x_train = train_cnn_data
y_tr = y_train

In [45]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))



### Train CNN

In [46]:
num_epochs = 10
batch_size = 34

In [47]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/10


I0000 00:00:1733120543.298034     110 service.cc:145] XLA service 0x5b5994f505c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733120543.298096     110 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m12/18[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 5ms/step - acc: 0.5472 - loss: 0.7057 

I0000 00:00:1733120548.468668     110 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 275ms/step - acc: 0.5788 - loss: 0.6891 - val_acc: 0.6765 - val_loss: 0.5818
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - acc: 0.8490 - loss: 0.4198 - val_acc: 0.7647 - val_loss: 0.5172
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - acc: 0.9250 - loss: 0.2280 - val_acc: 0.7941 - val_loss: 0.3977
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - acc: 0.9784 - loss: 0.1137 - val_acc: 0.8382 - val_loss: 0.4244
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - acc: 0.9861 - loss: 0.0573 - val_acc: 0.7941 - val_loss: 0.5544
Epoch 6/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - acc: 0.9993 - loss: 0.0274 - val_acc: 0.8088 - val_loss: 0.4810
Epoch 7/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - acc: 0.9931 - loss: 0.

### Test CNN

In [48]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 690ms/step


In [49]:
labels = [1, 0]

In [50]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [51]:
sum(data_test.Label==prediction_labels)/len(prediction_labels)

0.7733333333333333

In [52]:
data_test.Label.value_counts()

Label
0    44
1    31
Name: count, dtype: int64

In [53]:
def predict_sentiment(sentence, tokenizer, model, max_sequence_length=50):
    import re
    import string
    from keras.preprocessing.sequence import pad_sequences

    # Text preprocessing
    def preprocess_text(text):
        text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
        text = text.lower()  # Lowercase
        return text

    # Preprocess the input sentence
    clean_sentence = preprocess_text(sentence)

    # Tokenize and pad
    sequence = tokenizer.texts_to_sequences([clean_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length)

    # Predict sentiment
    prediction = model.predict(padded_sequence)
    sentiment = "Positive" if prediction[0][0] > 0.5 else "Negative"

    return sentiment


In [61]:
sentence = "This movie was fantastic and uplifting!"
sentiment = predict_sentiment(sentence, tokenizer, model)
print(f"Sentiment: {sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Sentiment: Positive


In [59]:
# Save the trained model
model.save("sentiment_model_CNN.h5")


In [60]:
import pickle

# Save the tokenizer
with open("tokenizer_CNN.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
