<a href="https://colab.research.google.com/github/axel-sirota/tf-dev-nlp/blob/main/module3/TF_Developer_NLP_Module3_Demo1_Analysing_Sentiment_With_OHE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysing Sentiment

Let's first import everything and load the dataset

In [36]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
import multiprocessing
import tensorflow as tf
import sys
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Input
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from textblob import TextBlob, Word
from keras_preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.nn import leaky_relu
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk

fraction_of_negatives_to_downsample=0.3
TRACE = False

def set_seeds_and_trace():
  """Seed the RNGs used by the notebook and optionally trace device placement.

  Seeds Python's ``random``, NumPy and TensorFlow with 42 and sets the
  ``PYTHONHASHSEED`` environment variable to ``'0'``. When the module-level
  ``TRACE`` flag is truthy, TensorFlow device-placement logging is enabled.
  """
  seed = 42
  os.environ['PYTHONHASHSEED'] = '0'
  random.seed(seed)
  np.random.seed(seed)
  tf.random.set_seed(seed)
  if not TRACE:
    return
  tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  """Create a TF1-compat session and register it with the Keras backend.

  The session config declares every CPU core and every physical GPU that
  TensorFlow reports, while limiting both intra-op and inter-op parallelism
  to a single thread.
  """
  cpu_total = multiprocessing.cpu_count()
  gpu_total = len(tf.config.list_physical_devices('GPU'))
  session_config = tf.compat.v1.ConfigProto(
      device_count={'GPU': gpu_total, 'CPU': cpu_total},
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
  )
  K.set_session(tf.compat.v1.Session(config=session_config))

# Seed the random number generators before anything stochastic runs.
set_seeds_and_trace()
# Register a single-threaded TF1-compat session with the Keras backend.
set_session_with_gpus_and_cores()
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
# Fetch the NLTK 'punkt' data (presumably needed by TextBlob's tokenizer - confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Now we will download the dataset and explore it

In [38]:
%%writefile get_data.sh
# Download the complaints CSV unless a non-empty copy is already on disk.
# -s (instead of -f) also re-downloads when an interrupted wget left an empty file.
if [ ! -s consumer_complaints.csv ]; then
  # The URL is quoted so the shell never treats '?' as a glob character.
  wget -O consumer_complaints.csv "https://www.dropbox.com/s/tp39uf1jgxfrfn2/comcast_consumeraffairs_complaints.csv?dl=0"
fi

Overwriting get_data.sh


In [39]:
!bash get_data.sh


--2022-11-10 18:12:55--  https://www.dropbox.com/s/tp39uf1jgxfrfn2/comcast_consumeraffairs_complaints.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6035:18::a27d:5512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/tp39uf1jgxfrfn2/comcast_consumeraffairs_complaints.csv [following]
--2022-11-10 18:12:56--  https://www.dropbox.com/s/raw/tp39uf1jgxfrfn2/comcast_consumeraffairs_complaints.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... ^C


In [40]:
# Load the CSV that get_data.sh writes into a DataFrame and preview its first rows.
path = './consumer_complaints.csv'
complaints = pd.read_csv(path)
complaints.head()


EmptyDataError: ignored

In [None]:
complaints.rating.value_counts()

In [None]:
complaints[complaints.rating == 0].text.iloc[0]

In [None]:
complaints[complaints.rating == 5].text.iloc[0]

As one can see, most of the reviews are negative, which makes sense because people usually write a review when they are angry or disappointed. We will therefore aggregate all 2-5 star reviews as "not negative / positive" and all 0-1 star reviews as negative. Because the dataset is imbalanced, we will downsample the negatives, and we will need a good metric to evaluate our models later.

In [None]:
# Collapse the star ratings into a binary label:
# 0-1 stars -> 0 (negative), 2-5 stars -> 1 (not negative / positive).
complaints['transformed_rating'] = complaints.rating.map({0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1})
positives = complaints[complaints['transformed_rating'] == 1]
# Keep only a fraction of the negative reviews to reduce the class imbalance.
negatives = complaints[complaints['transformed_rating'] == 0].sample(frac=fraction_of_negatives_to_downsample)
# Merge both classes, keep only text + label, then shuffle and re-index the rows.
downsampled_complaints = (
    pd.concat([positives, negatives], axis=0)[['text', 'transformed_rating']]
    .rename(columns={'transformed_rating': 'rating'})
    .sample(frac=1)
    .reset_index(drop=True)
)

downsampled_complaints

In [None]:
# Define X and y.
X = downsampled_complaints.text
y = downsampled_complaints.rating

In [None]:
y.value_counts()

## Creating the corpus and tokenizing

In [None]:
# Normalise every review to its space-joined TextBlob words, keeping the original
# index so the labels can be re-aligned afterwards. Non-string entries and reviews
# with three words or fewer are skipped.
corpus_with_ix = []
for ix, sentence in X.items():  # Series.items() replaces the removed iteritems()
    if not isinstance(sentence, str):
        continue
    words = TextBlob(sentence).words  # parse each review only once
    if len(words) > 3:
        corpus_with_ix.append((ix, ' '.join(words)))

In [None]:
corpus_with_ix[0]

In [None]:
corpus_df = pd.DataFrame(corpus_with_ix, columns=['index', 'text'])

In [None]:
corpus_df.head()

In [None]:
# Select the labels of the reviews kept in corpus_df, in the same order, as a NumPy array.
y_filtered = y[corpus_df['index']].to_numpy()

In [None]:
corpus = [sentence for ix, sentence in corpus_with_ix]

In [None]:
# Build the vocabulary from the cleaned corpus and map each review to a list of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
tokenized_corpus = tokenizer.texts_to_sequences(corpus)
# Count tokens, not characters: iterate the tokenized reviews rather than the raw strings.
nb_samples = sum(len(s) for s in tokenized_corpus)
# +1 because index 0 is not assigned to any word (it is used as the padding value later).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
nb_samples, vocab_size

In [None]:
tokenized_corpus[0][:6]

In [None]:
def get_maximum_review_length(tokenized_corpus):
    """Return the length of the longest tokenized review.

    Args:
        tokenized_corpus: iterable of sized sequences, one per review.

    Returns:
        The largest ``len(sentence)`` found, or 0 when the corpus is empty.
    """
    return max((len(sentence) for sentence in tokenized_corpus), default=0)
max_review_length = get_maximum_review_length(tokenized_corpus)

In [None]:
# Right-pad every review with zeros up to max_review_length. Writing into a slice of
# the pre-allocated matrix leaves the lists inside tokenized_corpus untouched.
final_X = np.zeros((len(tokenized_corpus), max_review_length))
for ix, tokenized_sentence in enumerate(tokenized_corpus):
    final_X[ix, :len(tokenized_sentence)] = tokenized_sentence

## Doing the train_test split and defining model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the samples for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(final_X, y_filtered, test_size = 0.4, random_state=42)

In [None]:
# Wrap the NumPy train/test splits as constant TensorFlow tensors.
X_train_tensor = tf.constant(X_train)
X_test_tensor = tf.constant(X_test)
y_train_tensor = tf.constant(y_train)
y_test_tensor = tf.constant(y_test)

In [None]:
X_train_tensor.shape

In [None]:
# Feed-forward binary classifier: the padded sequence of word ids is fed directly
# into two leaky-ReLU Dense layers followed by a single sigmoid output unit.
model = Sequential()
model.add(Input(shape = (max_review_length,)))
model.add(Dense(100, activation=leaky_relu))
model.add(Dense(50, activation=leaky_relu))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Name the metrics explicitly: Keras otherwise auto-numbers them (precision_1, ...)
# when this cell is re-run, which would break the 'precision' key that the
# EarlyStopping callback monitors.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        "accuracy",
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)
model.summary()

In [None]:
# Call the model (before fitting) on one hand-made input: the word ids of
# 'Horrible', followed by zeros up to max_review_length.
test_point = np.zeros((1, max_review_length))
for ix, token in enumerate(tokenizer.texts_to_sequences(['Horrible'])[0]):
  test_point[0,ix] = token
model(tf.constant(test_point))

In [None]:
# Precision must be maximised, so state the direction explicitly instead of relying
# on mode='auto' to infer it from the metric name.
callback = tf.keras.callbacks.EarlyStopping(monitor='precision', mode='max', patience=5, min_delta=1e-4, restore_best_weights=True)

In [None]:
# Hold out 20% of the training data so Keras records the val_loss / val_accuracy
# series that the plotting cells read from history.history.
history = model.fit(X_train_tensor, y_train_tensor, epochs=100, validation_split=0.2, workers=5, callbacks=[callback])

## Some plots

In [None]:
import matplotlib.pyplot as plt
# function for plotting loss
def plot_metrics(train_metric, val_metric=None, metric_name=None, title=None, ylim=5):
    """Plot a training metric and, optionally, its validation counterpart.

    Args:
        train_metric: sequence of per-epoch training values (drawn in blue).
        val_metric: optional sequence of per-epoch validation values (drawn in green).
        metric_name: legend label for the training curve; the validation curve is
            labelled ``'val_' + metric_name``. When None, no legend is drawn.
        title: plot title.
        ylim: upper bound of the y axis (the lower bound is 0).
    """
    plt.title(title)
    plt.ylim(0, ylim)
    plt.plot(train_metric, color='blue', label=metric_name)
    if val_metric is not None:
        # Only build the 'val_' label when there is a name to prefix.
        val_label = None if metric_name is None else 'val_' + metric_name
        plt.plot(val_metric, color='green', label=val_label)
    if metric_name is not None:
        # A legend without any labelled curve would be empty.
        plt.legend(loc="upper right")

# Plot the loss history. .get() yields None when fit() ran without validation data,
# in which case plot_metrics draws only the training curve.
plot_metrics(history.history['loss'], history.history.get('val_loss'), "Loss", "Loss", ylim=1.0)


In [None]:
# Plot the accuracy history. .get() yields None when fit() ran without validation
# data, in which case plot_metrics draws only the training curve.
plot_metrics(history.history['accuracy'], history.history.get('val_accuracy'), "Accuracy", "Accuracy", ylim=1.0)


# Some manual validation

In [None]:
# Encode a hand-written review as word ids, followed by zeros up to max_review_length.
test_point = np.zeros((1, max_review_length))
for ix, token in enumerate(tokenizer.texts_to_sequences(['Horrible service and internet'])[0]):
  test_point[0,ix] = token
# True when the sigmoid output is below 0.5, i.e. the review is classified as label 0 (negative).
model.predict(tf.constant(test_point)) < 0.5

In [None]:
# Pick one random test review and check the thresholded prediction against its label.
i = np.random.randint(0, X_test_tensor.shape[0])
# Slicing i:i+1 keeps the batch dimension without rebuilding an array from a list of tensors.
probability = model.predict(X_test_tensor[i:i + 1])
prediction = probability > 0.5
# Compare plain Python booleans so a bool array is never compared with an integer tensor.
bool(prediction[0, 0]) == bool(int(y_test_tensor[i]))

In [None]:
model.evaluate(X_test_tensor, y_test_tensor)

Therefore, the model predicted correctly that the review was positive!