### **IMDB Sentiment Analysis** - Binary Classification
- **Date**: Mar 8, 2024  
- **Task**: Create a model that classifies reviews as positive or negative using the attention mechanism 
- **Procedure**: Analyze the data with pandas, build a neural network model in TensorFlow, and implement a transformer encoder
- **Dataset source**: https://www.kaggle.com/datasets/columbine/imdb-dataset-sentiment-analysis-in-csv-format/data   
- **References**: https://github.com/PhilChodrow/PIC16B/blob/7d12d32e070e7ff3840b971c0ce4185ef1911796/discussion/tmdb.ipynb#L758

In [1]:
# Step 0. Load libraries and custom functions
# Matrices and datasets ------------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Text processors
import re
import string
#import nltk
#from nltk.corpus import stopwords
#nltk.download('stopwords')
from wordcloud import WordCloud
# Machine Learning -----------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
# Deep Learning --------------------------------------------------------
import keras
import tensorflow as tf
from keras import layers
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Custom functions -----------------------------------------------------
def sentence_fixed_split(x: str, words: int):
    """
    Split a sentence into consecutive chunks of at most `words` words.

    The sentence is tokenized on single spaces (``x.split(' ')``), so
    punctuation stays attached to its word and repeated spaces produce
    empty tokens.

    Parameters
    ----------
    x: str
        sentence to split (a single string, not a list of words)
    words: int
        maximum number of words per chunk

    Returns
    -------
    list or str
        a list of chunks when `words` > 1 and the sentence has more than
        `words` words; otherwise the original string `x` unchanged
        (this keeps `DataFrame.explode` working on mixed results)

    Example
    -------
        df = pd.DataFrame({'text':['In our darkest hour, I will prevail as always']})
        df['text'] = df['text'].apply(lambda x: sentence_fixed_split(x, 4))
        df = df.explode('text')
        text
        ----
        In our darkest hour,
        I will prevail as
        always
    """
    # Tokenize once and reuse the result for both the check and the chunks.
    tokens = x.split(' ')
    # Nothing to split: chunk size is degenerate or the sentence already fits.
    if words <= 1 or len(tokens) <= words:
        return x
    return [' '.join(tokens[i:i + words]) for i in range(0, len(tokens), words)]
    
def plot_accuracy_loss_tfmodel(model, epochs: int):
    '''
    Draws training vs. validation accuracy and loss side by side

    Parameters
    ----------
    model
        Object whose `history` attribute can be indexed with the keys
        'accuracy', 'val_accuracy', 'loss' and 'val_loss'
        (presumably the object returned by `fit` - TODO confirm)
    epochs
        Number of epochs plotted on the x axis (1..epochs); it has to
        match the length of each history series

    Returns
    -------
    None. Shows a 1 row x 2 columns figure (accuracy, loss) and closes it
    '''
    # (train key, validation key, train legend, validation legend) per panel
    panels = (
        ('accuracy', 'val_accuracy', 'Training accuracy', 'Validation accuracy'),
        ('loss', 'val_loss', 'Training loss', 'Validation loss'),
    )
    x_axis = range(1, epochs + 1)

    plt.figure(figsize=(8, 4))
    for position, (train_key, val_key, train_legend, val_legend) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        # Solid line for training, dashed line for validation
        plt.plot(x_axis, model.history[train_key], 'b', label=train_legend)
        plt.plot(x_axis, model.history[val_key], 'b--', label=val_legend)
        plt.legend()
    plt.show()
    plt.close()

def plot_confusion_matrix(y_true, y_pred):
    '''
    Plots the confusion matrix and prints precision/recall metrics

    Parameters
    ----------
    y_true
        True labels
    y_pred
        Predicted labels

    Returns
    -------
    None. Shows a heatmap (rows = true labels, columns = predicted
    labels) and prints the classification report
    '''
    # scikit-learn expects (y_true, y_pred); the reversed order transposes
    # the matrix and swaps precision with recall in the report.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt='g')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()
    # Must be called (with parentheses) to actually release the figure.
    plt.close()
    print(classification_report(y_true, y_pred))

def analyze_wrong_predictions(dataset, y_pred, samples):
    '''
    Prints a random sample of the rows whose prediction differs from the label

    Parameters
    ----------
    dataset
        DataFrame with `text` and `label` columns. NOTE: a `prediction`
        column is added to (or overwritten on) this same object
    y_pred
        predictions, one per row of `dataset`, comparable to `label`
    samples
        maximum number of wrong predictions to print; if fewer wrong
        predictions exist, all of them are printed

    Returns
    -------
    None. Prints the true label and the review text of each sampled row
    '''
    # Kept as an in-place assignment so callers that read
    # dataset['prediction'] afterwards keep working.
    dataset['prediction'] = y_pred
    wrong = dataset[dataset.label != dataset.prediction]
    # DataFrame.sample raises ValueError when asked for more rows than
    # exist (without replacement), so clamp the request.
    n_samples = min(samples, len(wrong))
    for _, row in wrong.sample(n_samples).iterrows():
        print(f'label: {row.label}, {row.text}')
        print('--------------------------------')

In [2]:
# Step 1. Load data
# 1.1 Read csv and get basic info
# The three splits are read in order: train, validation, test.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/01_IMDB_Train.csv',
        '../data/01_IMDB_Valid.csv',
        '../data/01_IMDB_Test.csv',
    )
)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [3]:
# 1.2 Preprocess data based on observed information
# Keep only the first occurrence of each review text in the train and
# validation splits, then renumber the rows from 0. The test split is
# left untouched here.
df_train = df_train.drop_duplicates(subset='text').reset_index(drop=True)
df_val = df_val.drop_duplicates(subset='text').reset_index(drop=True)

In [4]:
# Step 2. Create a neural network using transformers
# 2.1 Create tensorflow dataset
def make_data(dataset):
    '''
    Wraps a dataframe into a tf.data.Dataset of (features, label) pairs

    Parameters
    ----------
    dataset
        object indexable by 'text' and 'label' (the dataframes loaded above)

    Returns
    -------
    A tf.data.Dataset whose elements are ({'text': ...}, label)
    '''
    # Features are passed as a dict keyed by 'text'
    features = {'text': dataset['text']}
    targets = dataset['label']
    return tf.data.Dataset.from_tensor_slices((features, targets))
# Build the three datasets in one pass, each batched in groups of 32.
train, val, test = (
    make_data(frame).batch(32) for frame in (df_train, df_val, df_test)
)

2024-03-09 10:48:10.489953: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-03-09 10:48:10.489985: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-03-09 10:48:10.490012: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-03-09 10:48:10.490300: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-09 10:48:10.490680: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# 2.2 Create transformer layer
class TransformerEncoder(layers.Layer):
    '''
    Transformer encoder block: self-attention followed by a two-layer
    dense projection, each wrapped in a residual connection and layer
    normalization.

    Parameters
    ----------
    embed_dim
        Size used as `key_dim` of the attention layer and as the output
        size of the dense projection (so it can be added back to the input)
    dense_dim
        Number of units of the hidden (relu) dense layer
    num_heads
        Number of attention heads
    **kwargs
        Forwarded to `layers.Layer`
    '''

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # The keyword is `num_heads`; the misspelled `num_head` makes the
        # MultiHeadAttention constructor raise a TypeError.
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation='relu'),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        '''
        Applies self-attention and the dense projection to `inputs`.

        `inputs` is presumably (batch, sequence, embed_dim) - TODO confirm
        against the embedding layer feeding this block.
        '''
        if mask is not None:
            # Insert an axis after the first one so the mask can be used
            # as `attention_mask` (assumes a rank-2 mask - TODO confirm).
            mask = mask[:, tf.newaxis, :]
        # Self-attention: the same tensor is used as query and value.
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        # Residual connection + normalization around the attention output.
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        # Residual connection + normalization around the dense projection.
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        '''
        Returns the base layer config extended with the constructor
        arguments, so the layer can be rebuilt from its config.
        '''
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'dense_dim': self.dense_dim
        })
        return config