In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Load Dataset

In [None]:
# Read the labeled CSV, keeping only the 'text' and 'emotion' columns.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/youtube-sentiments/youtube_labeled.csv',
    usecols=['text', 'emotion'],
)

df

# Process Data

In [None]:
# Features are the raw comment text; targets are the emotion labels.
x, y = df['text'], df['emotion']

In [None]:
# Distinct values of the 'emotion' column and how many of them there are.
# N_EMOTIONS is used further down as the width of the model's output layer.
EMOTIONS = df['emotion'].unique()
N_EMOTIONS = len(EMOTIONS)
N_EMOTIONS

## Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Import from the public `tensorflow.keras.utils` location: the private
# `keras.utils.np_utils` module path is not importable in recent Keras releases.
from tensorflow.keras.utils import to_categorical

# Map every emotion string to an integer id and one-hot encode the ids.
# The encoding is computed from df['emotion'] (the same values `y` was created
# from) instead of from `y` itself, so re-running this cell never tries to
# re-encode an already one-hot `y`.
encoder = LabelEncoder()
y = to_categorical(encoder.fit_transform(df['emotion']))

In [None]:
# Labels known to the fitted encoder; the position of a label is its integer id.
encoder.classes_

In [None]:
# Integer class id -> emotion label, used to turn an argmax index back into text.
# NOTE(review): hard-coded; presumably mirrors `encoder.classes_` - confirm.
decode_map = dict(enumerate([
    'constructive feedback/idea',
    'negative',
    'neutral/other',
    'positive',
    'sadness',
]))

In [None]:
# Preview the first five rows of the encoded targets.
y[0:5]

## Train and Test Dataset

In [None]:
# Hold out 20% of the labeled data for testing.
# A fixed random_state makes the split (and therefore every metric reported
# below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42
)

# Model

In [None]:
# TensorFlow Hub handle of the sentence-encoder model used as the first layer
model_hub_path = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"

# Wrap the hub model as a Keras layer: string input, weights frozen
# (trainable=False) so only the layers stacked on top of it are trained.
hub_layer = hub.KerasLayer(
    model_hub_path,
    trainable=False,
    dtype=tf.string,
    input_shape=[],
)

In [None]:
# Switch between the dense classification head (True) and the
# Conv1D + bidirectional-LSTM head (False).
BEST = True

# Both heads end in a softmax over N_EMOTIONS classes: the targets are one-hot
# vectors with exactly one active class per sample, so the outputs should form
# a single probability distribution rather than independent per-class sigmoids.
if BEST:
    model = tf.keras.models.Sequential([
        hub_layer,
        tf.keras.layers.Dense(256, activation='relu'),
        # Dropout rate 0.8: 80% of the activations are dropped during training.
        tf.keras.layers.Dropout(0.8),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(N_EMOTIONS, activation='softmax')
    ])
else:
    model = tf.keras.models.Sequential([
        hub_layer,
        # Add a trailing axis so the flat sentence embedding can be consumed
        # by the Conv1D / LSTM layers as a sequence.
        tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1), input_shape=[None]),
        tf.keras.layers.Conv1D(filters=32, kernel_size=5, strides=1, padding='causal', activation='relu'),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.5)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.5)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(N_EMOTIONS, activation='softmax')
    ])


# The output layer already emits probabilities, so the loss is left at its
# default from_logits=False; asking the loss to treat them as logits would
# squash them a second time. Categorical cross-entropy matches the one-hot
# single-label targets and makes the 'accuracy' metric compare argmax classes.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy']
)

model.summary()

## Fit Model

### Cyclical Learning

Resource: https://github.com/bckenstler/CLR

In [None]:
from tensorflow.keras.callbacks import *
from tensorflow.keras import backend as K

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
        cycle iteration.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super().__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            # Built-in policies: each one fixes both the scaling function and
            # whether it is evaluated per cycle or per iteration.
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction time instead of leaving scale_fn and
                # scale_mode undefined until the first batch ends.
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2' or "
                    "'exp_range' when scale_fn is None, got %r" % (mode,)
                )
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        # 1-based index of the cycle the current iteration falls into
        # (one cycle spans 2 * step_size iterations).
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Distance from the peak of the current cycle, measured in half cycles.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        # Triangular wave scaled to the (max_lr - base_lr) amplitude.
        amplitude = (self.max_lr-self.base_lr)*np.maximum(0, (1-x))
        if self.scale_mode == 'cycle':
            return self.base_lr + amplitude*self.scale_fn(cycle)
        else:
            return self.base_lr + amplitude*self.scale_fn(self.clr_iterations)

    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate when training starts.

        Starts from base_lr on a fresh schedule; when the callback is reused
        (clr_iterations is non-zero) the schedule resumes where it stopped.
        """
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the finished batch in `history` and set the next learning rate.

        The first positional argument keeps its original name `epoch`; it is
        not used in this method.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # Log the learning rate that was in effect for the batch that just ended.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

In [None]:
# Cyclical learning-rate schedule between 0.0003 and 0.003 using the
# 'triangular2' policy, with 2000 iterations per half cycle.
clr = CyclicLR(
    base_lr=0.0003,
    max_lr=0.003,
    step_size=2000.,
    mode='triangular2',
)

# Train on the training split; 10% of it is held out for validation.
results = model.fit(
    X_train,
    y_train,
    batch_size=8,
    epochs=50,
    shuffle=True,
    validation_split=0.1,
    callbacks=[clr],
)

## Test Model

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(X_test, y_test)

## Plot Accuracy and Loss

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_results(model):
    """Draw two figures from a training history: training curves, then validation curves.

    Args:
        model: object whose ``history`` attribute is a mapping containing the
            keys 'accuracy', 'loss', 'val_accuracy' and 'val_loss' (the cell
            below passes the value returned by ``model.fit``).
    """
    history = model.history
    # Read every series up front so a missing key fails before anything is drawn.
    acc = history['accuracy']
    loss = history['loss']
    val_acc = history['val_accuracy']
    val_loss = history['val_loss']
    epochs = range(len(acc))

    # One figure per split; loss and accuracy share the same axes.
    panels = (
        ('Training Loss', loss, 'Training Accuracy', acc),
        ('Validation Loss', val_loss, 'Validation Accuracy', val_acc),
    )
    for loss_label, loss_curve, acc_label, acc_curve in panels:
        plt.figure(figsize=(10, 5), dpi=100)
        plt.plot(epochs, loss_curve, label=loss_label)
        plt.plot(epochs, acc_curve, label=acc_label)
        plt.legend()
        plt.show()

In [None]:
# Plot the curves recorded by the initial training run.
plot_results(results)

# Label Unlabeled Dataset

In [None]:
# Read the unlabeled CSV; only the 'text' column is loaded.
unlabeled_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/youtube-sentiments/data.csv',
    usecols=['text'],
)

unlabeled_df

## Label and update both the model and the dataset

In [None]:
# Function to label the dataset using the model previously trained and a threshold
def get_labels(df, model, threshold=0.80, label_encoder=None):
    """Pseudo-label the rows of ``df`` that the model classifies confidently.

    All texts are scored in a single ``model.predict`` call. A row is accepted
    when the score of its top class is at least ``threshold``.

    Args:
        df: DataFrame with a 'text' column. It is not modified.
        model: object whose ``predict`` takes an array of strings and returns
            one row of class scores per input.
        threshold: minimum top-class score for a row to be accepted.
        label_encoder: object with an ``inverse_transform`` method mapping class
            indices to labels. Defaults to the notebook-level ``encoder``.

    Returns:
        Tuple ``(texts, emotions, remaining_df)``: the accepted texts, their
        predicted labels (same order), and a new DataFrame holding the rows of
        ``df`` that were not accepted (original index preserved).
    """
    if label_encoder is None:
        label_encoder = encoder

    texts = df['text'].tolist()
    if not texts:
        # Nothing to score; avoid calling predict on an empty batch.
        return [], [], df.copy()

    scores = np.asarray(model.predict(np.array(texts)))
    best_idx = scores.argmax(axis=1)
    best_score = scores[np.arange(len(texts)), best_idx]
    confident = best_score >= threshold

    result_text = [text for text, keep in zip(texts, confident) if keep]
    # The argmax index is already the encoder's class id.
    result_emotion = list(label_encoder.inverse_transform(best_idx[confident]))

    # Return the leftover rows as a new frame instead of dropping rows from
    # the caller's frame while it is being iterated.
    remaining_df = df.loc[~confident].copy()

    return result_text, result_emotion, remaining_df


# Function to automate the update of the model and the data through labeling
def self_learning(labeled_df, unlabeled_df, model, max_iterations=10):
    """Iteratively pseudo-label ``unlabeled_df`` and retrain ``model`` on the growing set.

    Each iteration calls ``get_labels`` on the rows that are still unlabeled,
    appends the accepted rows to the labeled data, re-splits the labeled data
    70/30, fits the model on the training part and prints a short report.
    The loop stops after ``max_iterations`` iterations or as soon as no
    unlabeled rows remain.

    Relies on the notebook-level names ``get_labels``, ``encoder``,
    ``to_categorical``, ``train_test_split`` and ``clr``.

    Args:
        labeled_df: DataFrame with 'text' and 'emotion' columns; not modified.
        unlabeled_df: DataFrame with a 'text' column; not modified.
        model: compiled Keras model. It is fitted in place, so the returned
            model is the very same object.
        max_iterations: upper bound on the number of label/retrain rounds.

    Returns:
        Tuple ``(model, new_df)`` where ``new_df`` is the labeled data extended
        with every row accepted during the loop.
    """
    new_df = labeled_df.copy()
    # Work on a private copy so the caller's frame keeps all of its rows even
    # if the labeling step removes rows from the frame it is given.
    remaining_df = unlabeled_df.copy()

    for epoch in range(max_iterations):
        if len(remaining_df) == 0:
            break

        new_labeled_text, new_labeled_emotion, remaining_df = get_labels(remaining_df, model)

        if new_labeled_text:
            # Create a dataframe with the new data and append it to the labeled set
            model_labeled_data = pd.DataFrame(data={ 'text': new_labeled_text, 'emotion': new_labeled_emotion })
            new_df = pd.concat([new_df, model_labeled_data])

        # Feature and one-hot encoded label
        x = new_df['text']
        y = to_categorical(encoder.transform(new_df['emotion']))

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=0.3
        )

        model.fit(
            X_train,
            y_train,
            epochs=10,
            validation_split=0.3,
            shuffle=True,
            batch_size=1,
            verbose=0,
            callbacks=[clr]
        )

        print('Epoch: ',epoch)
        print('Data added: ', len(new_labeled_text))
        print('Data remaining: ', len(remaining_df))
        print('Updated dataset dimension: ', len(new_df))
        print('Model performance:')
        model.evaluate(X_test, y_test)
        print('')

    return model, new_df

In [None]:
# Run the self-learning loop, starting from the labeled frame `df` and the current model.
updated_model, updated_df = self_learning(labeled_df=df, unlabeled_df=unlabeled_df, model=model)

## Label the entire dataset at the same time

In [None]:
# Label the entire dataset and create a new dataframe
def label_dataset(df, model, label_encoder=None):
    """Predict an emotion label for every row of ``df``.

    All texts are scored in a single ``model.predict`` call and each row
    receives the label of its highest-scoring class.

    Args:
        df: DataFrame with a 'text' column. It is not modified.
        model: object whose ``predict`` takes an array of strings and returns
            one row of class scores per input.
        label_encoder: object with an ``inverse_transform`` method mapping class
            indices to labels. Defaults to the notebook-level ``encoder``.

    Returns:
        Tuple ``(texts, emotions)`` of equal-length lists, in row order.
    """
    if label_encoder is None:
        label_encoder = encoder

    result_text = df['text'].tolist()
    if not result_text:
        # Nothing to score; avoid calling predict on an empty batch.
        return [], []

    scores = np.asarray(model.predict(np.array(result_text)))
    # The argmax index is already the encoder's class id.
    result_emotion = list(label_encoder.inverse_transform(scores.argmax(axis=1)))

    return result_text, result_emotion

# Run the model over `unlabeled_df` and collect each text with its predicted emotion.
result_text, result_emotion = label_dataset(unlabeled_df, model)

# Assemble the predictions into a two-column frame and preview the first rows.
labeled_by_model_df = pd.DataFrame({ 'text': result_text, 'emotion': result_emotion })
labeled_by_model_df.head()

#### Save dataframe labeled by the model

In [None]:
# Write the model-labeled frame to a CSV file, without the index column.
labeled_by_model_df.to_csv('/tmp/labeled_by_model_df.csv', index=False)

# Test Model with New Predictions

### Starting model prediction

In [None]:
sentence = 'you are great'

# Score the single sentence; the argmax over the scores is the predicted class id.
class_scores = model.predict(np.array([sentence]))
prediction = np.argmax(class_scores)

# Translate the class id into its label.
decode_map[prediction]

### Updated model test

In [None]:
# NOTE(review): `self_learning` fits the model object it receives and returns
# that same object, and it was called with `model=model`, so `updated_model`
# and `model` are one and the same model here.
sentence = 'you are great'

prediction = np.argmax(updated_model.predict(np.array([sentence])))

decode_map[prediction]