In [1]:
import tensorflow as tf
import numpy as np
import os
import csv
import pandas as pd

In [2]:
# Install the Kaggle CLI used below to download the dataset.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel; `-q` keeps the installer log out of the notebook output.
%pip install -q kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Create the directory that will hold kaggle.json.
# `-p` makes the cell safe to re-run: it does not error if the directory already exists.
! mkdir -p ~/.kaggle

In [5]:
# Copy kaggle.json from the current working directory into ~/.kaggle/.
# The file must already be present in the working directory before this cell runs.
! cp kaggle.json ~/.kaggle/

In [6]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
# Per the captured output, the zip lands in /content.
! kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s] 19% 5.00M/25.7M [00:00<00:00, 50.7MB/s]
100% 25.7M/25.7M [00:00<00:00, 164MB/s] 


In [8]:
# Extract the downloaded archive (produces "IMDB Dataset.csv").
# `-o` overwrites existing files without prompting, so re-running the cell
# does not stall on unzip's interactive "replace?" question.
! unzip -o "/content/imdb-dataset-of-50k-movie-reviews.zip"

Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [9]:
# Load the extracted CSV into a DataFrame and show its first rows as a sanity check.
df = pd.read_csv(filepath_or_buffer="/content/IMDB Dataset.csv")
preview = df.head()
print(preview)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [10]:
# Boolean mask of rows whose `review` is missing; print the matching rows
# (an empty frame means there are none).
null = df["review"].isnull()
print(df.loc[null])

Empty DataFrame
Columns: [review, sentiment]
Index: []


In [11]:
# Boolean mask of rows whose `sentiment` is missing; print the matching rows
# (an empty frame means there are none).
null = df["sentiment"].isnull()
print(df.loc[null])

Empty DataFrame
Columns: [review, sentiment]
Index: []


In [12]:
# Pull the text and label columns out of the DataFrame as plain Python lists.
reviews, sentiments = (df[column].tolist() for column in ('review', 'sentiment'))

In [13]:
# Encode the string labels as floats: 'positive' -> 1.0, anything else -> 0.0.
# The array is built in a single pass; calling np.append inside a loop copies
# the whole array on every iteration, which is quadratic in the number of rows.
sentiments_final = np.array(
    [1.0 if sentiment == 'positive' else 0.0 for sentiment in sentiments],
    dtype=np.float64,  # keeps float64 even when `sentiments` is empty
)

In [14]:
# Sanity check: one encoded label per review.
print(np.shape(sentiments_final))

(50000,)


In [15]:
# Positional (unshuffled) split: rows [0, 40000) train, [40000, 48000) validation,
# [48000, end) test. Reviews and labels are sliced with the same boundaries so
# they stay aligned.
train_reviews, val_reviews, test_reviews = (
    reviews[:40000],
    reviews[40000:48000],
    reviews[48000:],
)

# .copy() gives each label split its own array rather than a view into sentiments_final.
train_sentiments = sentiments_final[:40000].copy()
val_sentiments = sentiments_final[40000:48000].copy()
test_sentiments = sentiments_final[48000:].copy()

In [16]:
# Fit the tokenizer on the training reviews only, so validation and test text
# do not influence the vocabulary. num_words caps the vocabulary at 10000 when
# converting text to sequences; out-of-vocabulary words map to the "<OOV>" token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token="<OOV>",
    num_words=10000,
)
tokenizer.fit_on_texts(train_reviews)

# word_index holds every distinct word seen during fitting, not just the top num_words.
words = tokenizer.word_index
print(len(words))

112174


In [17]:
# Number of rows in the embedding table; same value as the tokenizer's num_words.
vocab_size = 10_000

In [18]:
# Convert each training review into its list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(texts=train_reviews)

In [19]:
# Bucket the training sequences by token count to help choose a padding length.
# Buckets: >1500, (1000, 1500], (500, 1000], (100, 500], (0, 100].
# Empty sequences (length 0) are not counted in any bucket.
len_1500 = len_1000 = len_500 = len_100 = len_rem = 0
for sequence in train_sequences:
  n_tokens = len(sequence)
  if n_tokens > 1500:
    len_1500 += 1
  elif n_tokens > 1000:
    len_1000 += 1
  elif n_tokens > 500:
    len_500 += 1
  elif n_tokens > 100:
    len_100 += 1
  elif n_tokens > 0:
    len_rem += 1

print(len_1500, len_1000, len_500, len_100, len_rem)

9 156 3091 32046 4698


In [20]:
# Fixed sequence length that every review is padded or truncated to.
# The length histogram above shows roughly 92% of training reviews have <= 500 tokens.
maxlen = int(600)

In [21]:
# Bring every split to shape (n_samples, maxlen): shorter reviews are
# zero-padded at the end, longer ones are cut at the end.
# The validation and test texts are tokenized with the tokenizer fitted on the training set.
_pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

train_sequences = _pad_sequences(
    train_sequences, maxlen=maxlen, padding='post', truncating='post'
)

val_sequences = _pad_sequences(
    tokenizer.texts_to_sequences(val_reviews),
    maxlen=maxlen, padding='post', truncating='post',
)

test_sequences = _pad_sequences(
    tokenizer.texts_to_sequences(test_reviews),
    maxlen=maxlen, padding='post', truncating='post',
)

In [22]:
# Sanity check: the padded inputs and the labels have the same number of rows.
print(train_sequences.shape, train_sentiments.shape, sep="\n")

(40000, 600)
(40000,)


In [25]:
# Bag-of-embeddings binary classifier:
#   token ids -> Embedding -> mean over the sequence axis -> two dense layers
#   with dropout -> a single sigmoid unit.
embedding_dim = 32

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
# Average the per-token embeddings into one vector per review.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [26]:
# Binary cross-entropy matches the single sigmoid output and the 0.0/1.0 labels.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Stop when the validation loss has not improved for 3 consecutive epochs and
# roll back to the best weights seen. The earlier settings (monitor='loss',
# patience=10) could never trigger within a 10-epoch run, and they tracked
# training loss, which keeps falling even while the model overfits.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    train_sequences,
    train_sentiments,
    epochs=10,
    validation_data=(val_sequences, val_sentiments),
    callbacks=[callback],
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 600, 32)           320000    
                                                                 
 global_average_pooling1d_1   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_3 (Dense)             (None, 128)               4224      
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                      