---
# **Sentiment Analysis**
---

---

# **1. Installation**

---

## i. Generating a response


In [1]:
import sys
import logging
from psutil import virtual_memory

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
ram_gb = virtual_memory().total / 1e9

In [3]:
# Summary of the runtime environment. Every field starts out "unknown"
# (None); the three fields below get their own neutral placeholder.
tf_response = dict.fromkeys(('error', 'TF version', 'COLAB', 'GPU', 'ram_gb'))
tf_response['TF version'] = ''
tf_response['GPU'] = False
tf_response['ram_gb'] = ''

In [None]:
try:
    # drive
    from google.colab import drive
    IN_COLAB = 'google.colab' in sys.modules

    # updating tensorflow version
    %tensorflow_version 2.x

    # tensorflow-gpu
    !pip install tensorflow-gpu # !pip install tensorflow_text # I could use BERT
    
    # NLP (nltk, stanza, spacy)
    !pip install nltk 
    !pip install stanza
    !pip install spacy
    !spacy download en_core_web_sm # sm md lg
    !python -m spacy download en
except OSError as error:
    # debugging error
    response['error'] = logging.debug('You are not using your specify version of TensorFlow')
    IN_COLAB = False

    # install requirements
    !pip install -r '../requirements.txt'
finally:
    tf_response['COLAB'] = IN_COLAB
    
    # Importing tensroflow core
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

    from sklearn.model_selection import train_test_split
    
    # GPU and RAM response
    if tf.config.list_physical_devices('GPU'):
        GPU = True
        tf_response['GPU'] = GPU
        tf_response['TF_version'] = tf.__version__
        
        if tf_response['COLAB'] == True:
            if gpu_info.find('failed') >= 0:
                print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator')
                print('Re-execute this cell.')
            else:
                print(gpu_info)
            
            if ram_gb < 20:
                print('To enable a high-RAM runtime, select the Runtime > "Change runtime type menu"')
                print('Select high-RAM in the runtime shape dropdown')
                print('Re-execute this cell')
                tf_response['ram_gb'] = 'low-RAM runtime'
            else:
                tf_response['ram_gb'] = 'high-RAM runtime'
            print('\nRuntime {:.2f} GB of available RAM\n'.format(ram_gb))



In [None]:
# Display the environment summary dictionary as the cell output.
tf_response

## ii. Importing modules


In [None]:
# Data analysis
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import re

%matplotlib inline

# Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator # to create a Word Cloud
from PIL import Image # Pillow with WordCloud to image manipulation

In [None]:
# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed for stop-word filtering and WordNet
# lemmatization.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# NOTE: this rebinds the STOPWORDS name imported from `wordcloud` earlier in
# the notebook to NLTK's English stop-word set.
STOPWORDS = {*stopwords.words('english')}

In [None]:
# Stanza NLP
import stanza

# Download the English models (package 'ewt') for the processors used below.
stanza.download(
    'en',
    package='ewt',
    processors='tokenize,mwt,pos,lemma',
    verbose=True,
)

# Pipeline that tokenizes, expands multi-word tokens, tags and lemmatizes;
# GPU execution is requested.
stNLP = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma',
    use_gpu=True,
)

In [None]:
# Smoke test for the Stanza pipeline: print each word next to its lemma.
doc = stNLP('Barack Obama was born in Hawai.')
print('\n')
print('\n'.join(
    f'word: {word.text+" "}\tlemma: {word.lemma}'
    for sent in doc.sentences
    for word in sent.words
))

In [None]:
# Spacy NLP
import spacy
# Load the small English spaCy model.
spNLP = spacy.load('en_core_web_sm')

# Raise spaCy's maximum accepted text length (or higher if needed).
spNLP.max_length = 103_950_039

# spacy.prefer_gpu() is deliberately left disabled: it will not work with stanza.


---

# **2. Hyperparameters**

---

In [None]:
# --- Training schedule ---
EPOCHS = 30
training_portion = 0.8  # leading share of the samples used for training

# --- Text vectorisation ---
vocab_size = 5000       # tokenizer `num_words` and embedding input size
embedding_dim = 64      # embedding width, also reused as the LSTM unit count
max_lenght = 200        # padded / truncated sequence length
padding_type = 'post'
trunc_type = 'post'
oov_tok = '<OOV>'       # placeholder token for out-of-vocabulary words

# --- Data ---
FILE = 'datasets/categories_dataset.csv'
main_labels = [
    'confident', 'unconfident',
    'pos_hp', 'neg_hp',
    'interested', 'uninterested',
    'happy', 'unhappy',
    'friendly', 'unfriendly',
]

---

# **3. Lemmatization**

---

In [None]:
# lemmatization
# stanza
def stanza_lemma(text):
    """Return `text` with every word replaced by its Stanza lemma.

    The text is run through the notebook-level `stNLP` pipeline and the
    lemmas of all words of all sentences are joined with single spaces.
    """
    lemmas = []
    for sentence in stNLP(text).sentences:
        for word in sentence.words:
            lemmas.append(word.lemma)
    return ' '.join(lemmas)

In [None]:
def nltk_lemma(text):
    """Return `text` with every whitespace-separated token WordNet-lemmatized.

    The original implementation discarded the lemmatizer result and therefore
    always returned None. `WordNetLemmatizer.lemmatize` works on single words,
    so the text is split on whitespace, each token is lemmatized, and the
    lemmas are joined with single spaces (the same output shape as
    `stanza_lemma`). A single word without surrounding whitespace yields
    exactly `lemmatizer.lemmatize(text)`.

    Args:
        text: the string to lemmatize.

    Returns:
        The lemmatized string; an empty string for empty/blank input.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

---
# **4. Load dataset**
---

In [None]:
def load_clean_dataset():
    !mkdir -p datasets
    !wget -nc https://raw.githubusercontent.com/Y4rd13/sentiment-analysis/master/datasets/results/categories_dataset.csv -P datasets
    df = pd.read_csv('./datasets/categories_dataset.csv', encoding='utf-8')
    x, y = df.word, df.category
    return x, y

---
# **5. Prepare dataset**
---

In [None]:
def prepare_dataset(test_size=0.2, validation_size=0.2, random_state=None):
    """Load the dataset and split it into train / validation / test arrays.

    Args:
        test_size: share of the whole dataset held out as the test split.
        validation_size: share of the *remaining* training data (after the
            test split was removed) held out as the validation split.
        random_state: seed forwarded to both `train_test_split` calls. The
            default None keeps the previous unseeded behaviour; pass an int
            to make the splits reproducible across runs.

    Returns:
        `(x_train, y_train), (x_validation, y_validation), (x_test, y_test)`,
        each element converted to a numpy array.
    """
    print('preparing the dataset...\n')

    # load dataset: two pandas Series (inputs and labels)
    x, y = load_clean_dataset()

    # first cut: hold out the test split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    # second cut: carve the validation split out of what is left
    x_train, x_validation, y_train, y_validation = train_test_split(
        x_train, y_train, test_size=validation_size, random_state=random_state)

    # pandas.core.series.Series to numpy array
    x_train, y_train = np.array(x_train), np.array(y_train)
    x_validation, y_validation = np.array(x_validation), np.array(y_validation)
    x_test, y_test = np.array(x_test), np.array(y_test)

    return (x_train, y_train), (x_validation, y_validation), (x_test, y_test)

In [None]:
# test prepare_dataset function
# The three (inputs, labels) pairs are unpacked into notebook-level variables.
(x_train, y_train), (x_validation, y_validation), (x_test, y_test) = prepare_dataset()

In [None]:
import csv
def prepare_dataset_testing():
    """Read the categories CSV and turn it into padded sequences and label ids.

    Steps:
      1. Read `(label, text)` rows with the `csv` module, skipping the header,
         and drop stop words that appear surrounded by spaces.
      2. Split sequentially: the first `training_portion` of the rows is the
         training set, the rest is the validation set (no shuffling).
      3. Fit a word tokenizer on the training texts only, then convert both
         splits to sequences padded/truncated to `max_lenght`.
      4. Fit a second tokenizer on all labels and convert the labels to ids.

    Returns:
        `(train_padded, training_label_seq, validation_padded,
        validation_label_seq)`. The label arrays hold one tokenizer id per
        sample (the ids are printed via `label_tokenizer.word_index`).
    """
    print('preparing the dataset...\n')

    # test with csv module
    labels, texts = [], []
    # newline='' is what the csv module expects for files it parses itself.
    with open('./datasets/categories_dataset.csv', 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if len(row) < 2:
                # blank or malformed line: nothing to index, skip it
                continue
            labels.append(row[0])
            text = row[1]
            for stop_word in STOPWORDS:
                text = text.replace(' ' + stop_word + ' ', ' ')
                # collapse the double space a removal can leave behind
                # (the original replaced ' ' with ' ', which did nothing)
                text = text.replace('  ', ' ')
            texts.append(text)

    # split
    train_size = int(len(texts) * training_portion)

    train_texts = texts[:train_size]
    train_labels = labels[:train_size]

    validation_texts = texts[train_size:]
    validation_labels = labels[train_size:]

    # tokenizer: vocabulary is learned from the training texts only
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_texts)

    # text to sequences, then padding and truncating: train_texts
    train_sequences = tokenizer.texts_to_sequences(train_texts)
    train_padded = pad_sequences(sequences=train_sequences, maxlen=max_lenght,
                                 padding=padding_type, truncating=trunc_type)

    # same process: validation_texts
    validation_sequences = tokenizer.texts_to_sequences(validation_texts)
    validation_padded = pad_sequences(validation_sequences, maxlen=max_lenght,
                                      padding=padding_type, truncating=trunc_type)

    # tokenize & sequences to train and validation LABELS
    # filters='' keeps punctuation inside a label: the default filter set
    # contains '_', which would split a label such as 'pos_hp' into two tokens
    # and produce ragged label sequences.
    label_tokenizer = Tokenizer(filters='')
    label_tokenizer.fit_on_texts(labels)

    training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
    validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

    print(label_tokenizer.word_index)

    return train_padded, training_label_seq, validation_padded, validation_label_seq

---
# **4. Build model**
---

In [None]:
def build_model(learning_rate=0.0001, opt='adam', loss='sparse_categorical_crossentropy', num_classes=2):
    """Build and compile the Embedding -> Dropout -> BiLSTM -> softmax classifier.

    The original body overwrote both `opt` and `loss`, so the arguments had no
    effect. They are now honoured; the `loss` default was changed to the value
    the function always used in practice, so a call with default arguments
    compiles exactly the same model as before.

    Args:
        learning_rate: learning rate applied when `opt` is given by name.
        opt: optimizer name (e.g. 'adam') or an optimizer instance. An
            instance is used as-is, including its own learning rate.
        loss: loss passed to `model.compile`. The sparse default expects
            integer class ids in `[0, num_classes)`.
        num_classes: number of units of the softmax output layer
            (previously hard-coded to 2).

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    print('building the model...\n')

    # model & layers
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(embedding_dim)))

    # softmax output layer
    model.add(Dense(units=num_classes, activation='softmax'))

    # optimizer: resolve a name to an instance and apply the learning rate
    if isinstance(opt, str):
        optimizer = tf.keras.optimizers.get(opt)
        optimizer.learning_rate = learning_rate
    else:
        optimizer = opt

    # compile model
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=['accuracy'])
    model.summary()

    return model

---
# **6. Train model** 
---

In [None]:
def train(model, x_train, y_train, x_validation, y_validation,
          epochs, batch_size=32, patience=5,
          verbose=2, monitor='accuracy'):
    """Fit `model` with early stopping and return the Keras History object.

    Training stops once `monitor` (also try 'val_loss') has not improved by at
    least 1e-3 for `patience` epochs; the best weights are then restored.
    """
    print('training...\n')

    # callback
    stopper = tf.keras.callbacks.EarlyStopping(
        monitor=monitor,
        patience=patience,
        min_delta=1e-3,
        mode='auto',
        restore_best_weights=True,
        verbose=1,
    )

    # train model
    fit_options = dict(
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        validation_data=(x_validation, y_validation),  # x_test, y_test
        callbacks=[stopper],
    )
    return model.fit(x_train, y_train, **fit_options)

---
# **7. Plotting history**
---

In [None]:
def plot_history_(history):
    """Plot training vs. validation accuracy per epoch on a 15x8 figure."""
    metrics = history.history
    train_acc = metrics['accuracy']
    validation_acc = metrics['val_accuracy']
    epoch_axis = range(1, len(train_acc) + 1)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.plot(epoch_axis, train_acc, 'bo', label='Training acc')
    ax.plot(epoch_axis, validation_acc, 'b', label='Validation acc')
    ax.set_title('Training and validation accuracy')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')
    ax.set_ylim((0.5, 1))

    plt.show()

def plot_history(history, string='accuracy'):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose `.history` dict maps metric names to per-epoch
            value lists (as returned by `model.fit`).
        string: metric name, e.g. 'accuracy' or 'loss'; the curve stored under
            'val_' + string is drawn next to it. Defaults to 'accuracy' so the
            function can also be called with the history alone.
    """
    fitModel_dict = history.history
    plt.plot(fitModel_dict[string])
    # The original wrote `plt.plot/fitModel_dict[...]`, i.e. a division of the
    # function by a list, which raises TypeError instead of drawing the curve.
    plt.plot(fitModel_dict['val_' + string])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()

# test
#plot_history(history, 'accuracy')
#plot_history(history, 'loss')

---
# **8. Main**
---

In [None]:
def main():
    """Run the pipeline: prepare data, build, train, plot, evaluate, save.

    The trained model is written to 'model.h5' in the working directory.
    """
    # prepare the dataset
    #x_train, y_train, x_validation, y_validation, x_test, y_test = prepare_dataset()
    train_padded, training_label_seq, validation_padded, validation_label_seq = prepare_dataset_testing()

    # build the model
    model = build_model()

    # train the model
    history = train(model=model, x_train=train_padded, y_train=training_label_seq,
                    x_validation=validation_padded, y_validation=validation_label_seq,
                    epochs=EPOCHS, verbose=1)

    # plot the training (the metric name is a required argument)
    plot_history(history, 'accuracy')

    # evaluate the model
    # prepare_dataset_testing() yields no separate test split, so the held-out
    # validation data is used here. The notebook-level x_test / y_test come
    # from a different preparation path and are not padded sequences.
    test_loss, test_accuracy = model.evaluate(validation_padded, validation_label_seq)
    # .format() must be applied to the string, not to print()'s None result,
    # and must use the names actually assigned above.
    print('\nTest:\nLoss: {}\nAccuracy: {}'.format(test_loss, test_accuracy * 100))

    # save the model (the path is a string, not an attribute of the model)
    model.save('model.h5')

---
# **9. __name__ == "__main__"**
---

In [None]:
# Run the whole pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main()