In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from keras.datasets import imdb


**Load the IMDb Dataset from Keras**

In [2]:
# Load the dataset from Keras
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# View a sample of the data
print(f"Number of training samples: {len(X_train)}")
print(f"Number of test samples: {len(X_test)}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Number of training samples: 25000
Number of test samples: 25000


Since Keras provides the IMDb dataset as integer-encoded sequences (where each integer represents a word index in a predefined vocabulary), you will need to decode these sequences back to text. After that, you can tokenize, remove stopwords, and lemmatize the text.

Here’s how to decode the sequences and preprocess the data:

In [3]:
# Dictionary for decoding
word_index = imdb.get_word_index()

# Reverse the word index to get word -> integer mapping
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Function to decode sequences back to text
def decode_review(encoded_review):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review])  # Subtract 3 to adjust for special tokens

In [5]:
# Decode the training and testing data
X_train_text = [decode_review(sequence) for sequence in X_train]
X_test_text = [decode_review(sequence) for sequence in X_test]

# Example of decoded review
print(X_train_text[0])  # A decoded review from the training set

? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

**Tokenization, Lemmatization, and Stopword Removal**

In [9]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
lemmatizer = WordNetLemmatizer()

In [7]:
# Preprocess the text: tokenization, lemmatization, and stopword removal
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic characters
    # stop_words = set(stopwords.words('english'))  # Get a set of English stop words
    # tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens if not word in stopwords.words('english')]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize words
    return tokens

In [12]:
X_train_processed = [preprocess_text(review) for review in X_train_text]
X_test_processed = [preprocess_text(review) for review in X_test_text]


print(X_train_processed[0])

['film', 'brilliant', 'casting', 'location', 'scenery', 'story', 'direction', 'everyone', 'really', 'suited', 'part', 'played', 'could', 'imagine', 'robert', 'amazing', 'actor', 'director', 'father', 'came', 'scottish', 'island', 'loved', 'fact', 'real', 'connection', 'film', 'witty', 'remark', 'throughout', 'film', 'great', 'brilliant', 'much', 'bought', 'film', 'soon', 'released', 'would', 'recommend', 'everyone', 'watch', 'fly', 'fishing', 'amazing', 'really', 'cried', 'end', 'sad', 'know', 'say', 'cry', 'film', 'must', 'good', 'definitely', 'also', 'two', 'little', 'boy', 'played', 'norman', 'paul', 'brilliant', 'child', 'often', 'left', 'list', 'think', 'star', 'play', 'grown', 'big', 'profile', 'whole', 'film', 'child', 'amazing', 'praised', 'done', 'think', 'whole', 'story', 'lovely', 'true', 'someone', 'life', 'shared', 'u']


**Train Word2Vec Model**

In [13]:
# Train a Word2Vec model using the processed text
word2vec_model = Word2Vec(X_train_processed, vector_size=100, window=5, min_count=2, workers=4)

# Save the trained Word2Vec model for future use
word2vec_model.save("word2vec_imdb.model")

# Check vector for a sample word (e.g., "movie")
print(word2vec_model.wv['movie'])


[-7.0100147e-01  3.3571888e-02 -6.5474308e-01  7.5278234e-01
 -1.7766446e-01  3.0636993e-01 -2.6455063e-01  1.4769996e+00
  1.9885531e-01  1.4200774e+00  3.5958952e-01 -9.9265796e-01
 -3.4149426e-01  1.2615637e+00  7.2078621e-01  1.0112547e+00
  2.4706569e+00  1.6615695e+00 -2.5543730e+00  3.8968956e-01
  8.5993171e-01 -7.5668710e-01  1.5998696e-03 -3.3721652e-02
  6.5366036e-01  4.1717997e-01  1.1104670e+00  8.6769027e-01
 -5.1387459e-01  5.0137514e-01  1.0212207e+00 -1.1259012e+00
  1.0457695e+00 -1.9565071e+00 -8.1409270e-01 -7.5399530e-01
 -4.0384975e-01  1.4104832e+00  4.7384375e-01  1.4285653e+00
  7.2376698e-01  2.1295300e-01 -8.7697989e-01 -3.4063476e-01
 -3.7501824e-01  7.9737914e-01  3.2376841e-01  1.3028572e-01
 -6.0997355e-01  8.7747037e-01  1.5011286e+00  6.0768968e-01
  6.4694488e-01 -3.6116379e-01  4.8106751e-01 -2.0336757e+00
  7.5223124e-01 -8.8873461e-02 -1.2028288e+00  1.9037361e-01
 -7.1483362e-01  1.3986726e-01  6.8058372e-01 -1.2948154e+00
 -3.2808474e-01 -5.77976

**Create Feature Vectors for Reviews**

In [14]:
# Function to get average Word2Vec vector for a review
def get_average_word2vec(tokens_list, model, vector_size=100):
    # Filter out words not in the model's vocabulary
    valid_tokens = [model.wv[word] for word in tokens_list if word in model.wv]

    # If no valid tokens are found, return a vector of zeros
    if valid_tokens:
        return np.mean(valid_tokens, axis=0)
    else:
        return np.zeros(vector_size)

# Apply this function to both training and testing data
X_train_vectors = np.array([get_average_word2vec(review, word2vec_model) for review in X_train_processed])
X_test_vectors = np.array([get_average_word2vec(review, word2vec_model) for review in X_test_processed])

# Example shape of the resulting vector
print(X_train_vectors.shape)  # Should print (number of reviews, vector size)


(25000, 100)


**Random Forest Classifier**

In [16]:
# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectors, y_train)

# Predict on the test data
y_pred_rf = rf_model.predict(X_test_vectors)

# Evaluate the model
print("Random Forest Accuracy: ", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy:  0.82368
              precision    recall  f1-score   support

           0       0.82      0.82      0.82     12500
           1       0.82      0.83      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



**Using BERT for Sentiment Analysis**

**Step 1: Install Necessary Libraries**

In [17]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

**Step 2: Fine-tune BERT on IMDb Dataset**

In [18]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load IMDb dataset from Hugging Face
dataset = load_dataset("imdb")

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=tokenized_datasets["train"],         # training dataset
    eval_dataset=tokenized_datasets["test"],           # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 