In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import utils

In [2]:
import nltk

# Fetch every NLTK resource the cleaning step relies on, so the notebook
# survives a fresh environment:
#   - 'punkt'     : sentence/word tokenizer models used by word_tokenize
#   - 'wordnet'   : lexical database used by WordNetLemmatizer
#   - 'stopwords' : kept because `stopwords` is imported above (its use inside
#                   clean_data is currently commented out)
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt';
# add it to this tuple if word_tokenize raises LookupError.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lukemoberly/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from LondonEmotions.utils import create_embedding_matrix
from tensorflow.python.lib.io import file_io

In [4]:
def clean_data(data):
    """
    Clean and preprocess the review text held in ``data['Text']``.

    The frame is modified in place and the same object is returned. Columns
    written:
      - 'Text'           : cast to ``str`` (so purely numeric reviews work)
      - 'clean_text'     : digits removed, lower-cased, stripped, ASCII
                           punctuation removed, tokenized with nltk,
                           lemmatized, and re-joined with single spaces
      - 'tokenized_text' : ``gensim.utils.simple_preprocess(..., deacc=True)``
                           applied to each 'clean_text' string
    """
    lemmatizer = WordNetLemmatizer()
    # Translation table that deletes every character in string.punctuation.
    # NOTE(review): punctuation is deleted rather than replaced by a space, so
    # words separated only by punctuation fuse ("hotel....room" -> "hotelroom").
    # Left as-is because the saved tokenizer was presumably fitted on text
    # cleaned the same way -- confirm before changing.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _normalise(raw_text):
        # Same step order as before: digits -> lowercase -> strip -> punctuation
        without_digits = ''.join(
            filter(lambda ch: not ch.isdigit(), raw_text)
        )
        lowered = without_digits.lower().strip()
        no_punct = lowered.translate(strip_punctuation)
        # Tokenize with nltk, lemmatize each token, then rebuild one string
        tokens = word_tokenize(no_punct)
        return ' '.join(map(lemmatizer.lemmatize, tokens))

    # Process reviews that are numbers
    data['Text'] = data['Text'].astype(str)

    # Stop-word removal is intentionally not applied here.
    data['clean_text'] = data['Text'].apply(_normalise)

    # Tokenizing text
    data['tokenized_text'] = [
        utils.simple_preprocess(cleaned, deacc=True)
        for cleaned in data['clean_text']
    ]
    return data

In [5]:
# Load the raw reviews and expose the review text under the 'Text' column
# name that clean_data expects.
review_df = (
    pd.read_csv('../raw_data/prediction.csv')
    .rename(columns={'review': 'Text'})
)
review_df.head()

Unnamed: 0,place_id,lat,lng,Text
0,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,Nice budget hotel....room is clean and well ma...
1,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,This hotel is well managed with great staff on...
2,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,"10 our of 10 lovely service, clean and homely...."
3,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,"Very Dirty, carpet never been cleaned, Curry s..."
4,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,Even 1 star is too much for this terrible hote...


In [6]:
# Run the cleaning pipeline. clean_data writes 'clean_text' and
# 'tokenized_text' onto the frame it is given and returns that same frame.
review_df = clean_data(review_df)
review_df.head()

Unnamed: 0,place_id,lat,lng,Text,clean_text,tokenized_text
0,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,Nice budget hotel....room is clean and well ma...,nice budget hotelroom is clean and well mainta...,"[nice, budget, hotelroom, is, clean, and, well..."
1,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,This hotel is well managed with great staff on...,this hotel is well managed with great staff on...,"[this, hotel, is, well, managed, with, great, ..."
2,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,"10 our of 10 lovely service, clean and homely....",our of lovely service clean and homely with lo...,"[our, of, lovely, service, clean, and, homely,..."
3,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,"Very Dirty, carpet never been cleaned, Curry s...",very dirty carpet never been cleaned curry sta...,"[very, dirty, carpet, never, been, cleaned, cu..."
4,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,Even 1 star is too much for this terrible hote...,even star is too much for this terrible hotel ...,"[even, star, is, too, much, for, this, terribl..."


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.lib.io import file_io
import pickle

In [9]:
# Model / preprocessing settings
num_classes = 5
embed_num_dims = 300
max_seq_len = 300
# NOTE(review): this label order differs from the order implied by the
# `encoding` dicts defined further down (anger, joy, worry, neutral, sad).
# class_names is not read by any cell visible here -- confirm which is right.
class_names = ['joy', 'worry', 'anger', 'sad', 'neutral']

reviews = review_df['tokenized_text']

# Plain-list copy of each token list, then one space-joined string per review
sentences_pred = [list(sentence) for sentence in reviews]
texts_pred = [' '.join(tokens) for tokens in sentences_pred]

# Load the pickled tokenizer.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
filepath = '../raw_data/tokenizer.pickle'
with file_io.FileIO(filepath, mode='rb') as handle:
    tokenizer = pickle.load(handle)

index_of_words = tokenizer.word_index

# vocab size is number of unique words + reserved 0 index for padding
vocab_size = len(index_of_words) + 1

# Tokenize text (convert to integers), then pad every sequence to max_seq_len
sequence_pred = tokenizer.texts_to_sequences(texts_pred)
X_pred_pad = pad_sequences(sequence_pred, maxlen=max_seq_len)

In [10]:
# Inspect the padded integer sequences that are later passed to model.predict
X_pred_pad

array([[   0,    0,    0, ...,  197,   84,  537],
       [   0,    0,    0, ..., 3334,  541,  876],
       [   0,    0,    0, ...,  876,   19,   23],
       ...,
       [   0,    0,    0, ..., 8558,  271,  684],
       [   0,    0,    0, ..., 5649,   73, 1052],
       [   0,    0,    0, ...,  749,  294,  211]], dtype=int32)

In [17]:
# Encode target: emotion label -> integer class id
encoding = {
    label: class_id
    for class_id, label in enumerate(
        ('anger', 'joy', 'worry', 'neutral', 'sad')
    )
}

# Create embedding matrix from the pre-trained word vectors.
# NOTE(review): no later cell visible here reads embedd_matrix; the model is
# loaded from disk instead -- confirm whether this step is still needed.
file_path = '../embeddings/wiki-news-300d-1M.vec'

embedd_matrix = create_embedding_matrix(
    file_path,
    index_of_words,
    embed_num_dims,
)
print(embedd_matrix.shape)

(44733, 300)


In [18]:
# Load the previously trained model from disk.
# NOTE(review): the path ends in '.pb'; load_model is normally given a
# SavedModel directory or an HDF5/Keras file -- confirm what this path is.
from tensorflow.keras.models import load_model
model = load_model('../raw_data/saved_model_2.pb')

In [19]:
# Re-display the padded input right before prediction (same array as shown
# in the earlier cell; nothing has modified it in between).
X_pred_pad

array([[   0,    0,    0, ...,  197,   84,  537],
       [   0,    0,    0, ..., 3334,  541,  876],
       [   0,    0,    0, ...,  876,   19,   23],
       ...,
       [   0,    0,    0, ..., 8558,  271,  684],
       [   0,    0,    0, ..., 5649,   73, 1052],
       [   0,    0,    0, ...,  749,  294,  211]], dtype=int32)

In [20]:
# Score every padded review with the loaded model. The printed result shows
# one row of five floats per review (presumably class probabilities --
# confirm against the model's output layer).
preds = model.predict(X_pred_pad)

In [21]:
# Show the raw per-class scores returned by model.predict
print(preds)

[[0.02905648 0.19406627 0.50860006 0.0276215  0.24065568]
 [0.12818776 0.07312929 0.45927614 0.07705467 0.2623521 ]
 [0.05074719 0.5071262  0.15375517 0.19909781 0.08927361]
 ...
 [0.08875266 0.20280546 0.5838012  0.05758075 0.06705988]
 [0.05030318 0.08474547 0.2960779  0.5027143  0.06615911]
 [0.09284611 0.13587722 0.32487068 0.05110097 0.3953051 ]]


In [22]:
# Predicted class id for each review = column index of the highest score in
# that row of `preds`.
# This uses the array's own argmax method, so the cell does not depend on
# `np`, which this notebook never imports (the old np.argmax loop raised
# NameError on a fresh kernel). .tolist() keeps preds_categorical a list.
preds_categorical = preds.argmax(axis=1).tolist()

In [23]:
# Inverse lookup: integer class id -> emotion label.
# Note: this rebinds `encoding`, which an earlier cell defined in the
# opposite direction (label -> id).
encoding = dict(enumerate(['anger', 'joy', 'worry', 'neutral', 'sad']))

# Wrap the predicted class ids in a Series so they can be mapped to labels
pred_series = pd.Series(preds_categorical)

In [24]:
# Translate each integer class id into its emotion label via `encoding`
review_predictions = pred_series.map(encoding)

In [25]:
# Distribution of predicted emotions across all reviews
review_predictions.value_counts()

worry      794
joy        749
neutral    214
sad        202
anger       86
dtype: int64

In [26]:
# Attach the predicted label to each review. Assigning a Series aligns on
# index, so this relies on review_predictions and review_df sharing the same
# index (presumably both the default 0..n-1 -- confirm if rows are ever
# filtered or re-indexed upstream).
review_df['emotion'] = review_predictions

In [27]:
# Spot-check the first few reviews with their predicted emotion
review_df.head()

Unnamed: 0,place_id,lat,lng,Text,clean_text,tokenized_text,emotion
0,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,Nice budget hotel....room is clean and well ma...,nice budget hotelroom is clean and well mainta...,"[nice, budget, hotelroom, is, clean, and, well...",worry
1,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,This hotel is well managed with great staff on...,this hotel is well managed with great staff on...,"[this, hotel, is, well, managed, with, great, ...",worry
2,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,"10 our of 10 lovely service, clean and homely....",our of lovely service clean and homely with lo...,"[our, of, lovely, service, clean, and, homely,...",joy
3,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,"Very Dirty, carpet never been cleaned, Curry s...",very dirty carpet never been cleaned curry sta...,"[very, dirty, carpet, never, been, cleaned, cu...",joy
4,ChIJiwVttYym2EcRUdFHMteOfCo,51.563524,0.070761,Even 1 star is too much for this terrible hote...,even star is too much for this terrible hotel ...,"[even, star, is, too, much, for, this, terribl...",joy


In [28]:
# Persist the labelled reviews.
# NOTE(review): to_csv writes the index by default, and the frame already
# carries an 'Unnamed: 0' column from the input file, so the output ends up
# with two index-like columns. Consider index=False once downstream readers
# of this file have been checked.
review_df.to_csv('../raw_data/review_predictions.csv')