In [1]:
import json
import pandas as pd
import numpy as np
import re
# display all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth',1000)
# filter warningsb
import warnings
warnings.filterwarnings('ignore')

In [3]:
# generate query dataframe
with open('friends.json') as f:
    data = json.load(f)
df = pd.DataFrame()
for i in range(1000):
    for record in data[i]:
        series = pd.Series({"sentence":record['utterance'],"emotion":record['emotion']})
        df = df.append(series, ignore_index=True)
# choose queries more than 5 words, which provide more information
df2=df[df['sentence'].apply(lambda x: len(str(x).split())>=5)]
df2.reset_index(drop=True, inplace=True)

In [4]:
# query  = df2[df2.index.isin([66,157,160,189,149,380,386,32,91,155,176,272,659,25,127,152,174,1303,2184,7419])]
query  = df2[df2.index.isin([174,176,189,272,380,386,659,1301,3509,7419,25,32,66,91,127,149,152,155,157,160])]
# 174,176,189,272,380,386,659,1301,3509,7419,25,32,66,91,127,149,152,155,157,160
# 1301,3509,

## Data pre- processing

In [5]:
# classify emotions into 3 categories
df2=df2[df2['emotion']!='non-neutral']
sent_to_id  = {"anger":0,'sadness':0,'disgust':0,'fear':0,'surprise':1, "joy":1,"neutral":2}
df2["sentiment_id"] = df2['emotion'].map(sent_to_id)
df2.head()

Unnamed: 0,emotion,sentence,sentiment_id
1,neutral,"Or! Or, we could go to the bank, close our accounts and cut them off at the source.",2
2,sadness,"Aww, man, now we wont be bank buddies!",0
3,neutral,"Ohh, you guys, remember that cute client I told you about? I bit him.",2
5,neutral,"Well, next time your massaging him, you should try and distract yourself.",2
6,joy,Yeah! Yeah! Yeah! Like-like when Im doing something exciting and I dont wanna get,1


In [6]:
# data cleaning
from bs4 import BeautifulSoup
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
df2['cleaned_sentence'] = df2['sentence'].apply(cleanText)

## LSTM Model Building

### Data preperation

In [7]:
# import pacakges
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import stopwords
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import gensim
from gensim.models.doc2vec import TaggedDocument

In [8]:
# split train/test dataset
train, test = train_test_split(df2, test_size=0.000001 , random_state=42)
# tokenize the words
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) <= 0:
                continue
            tokens.append(word.lower())
    return tokens
# tagged datasets
train_tagged = train.apply(lambda x: TaggedDocument(words=tokenize_text(x['cleaned_sentence']), tags=[x.sentiment_id]), axis=1)
test_tagged = test.apply(lambda x: TaggedDocument(words=tokenize_text(x['cleaned_sentence']), tags=[x.sentiment_id]), axis=1)

# The maximum number of words to be used. (most frequent)
max_fatures = 500000

# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 50

#tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer = Tokenizer(num_words=max_fatures, split=' ', filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df2['cleaned_sentence'].values)
X = tokenizer.texts_to_sequences(df2['cleaned_sentence'].values)
X = pad_sequences(X,maxlen=MAX_SEQUENCE_LENGTH)
print('Found %s unique tokens.' % len(X))

Found 7100 unique tokens.


### build doc2vev model for embeddings

In [9]:
d2v_model = Doc2Vec(dm=1, dm_mean=1, size=20, window=8, min_count=1, workers=1, alpha=0.065, min_alpha=0.065)
d2v_model.build_vocab([x for x in tqdm(train_tagged.values)])
print(d2v_model)

100%|██████████| 7099/7099 [00:00<00:00, 2372538.97it/s]


Doc2Vec(dm/m,d20,n5,w8,s0.001)


In [10]:
# save the vectors in a new matrix
embedding_matrix = np.zeros((len(d2v_model.wv.vocab)+ 1, 20))
for i, vec in enumerate(d2v_model.docvecs.vectors_docs):
    while i in vec <= 1000:
          embedding_matrix[i]=vec

### Model building

In [11]:
# import pacakges
import keras
from keras.layers import LSTM, Dense, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [12]:
## initiate the model
# init layer
model = Sequential()

# emmbed word vectors
model.add(Embedding(len(d2v_model.wv.vocab)+1,20,input_length=X.shape[1],weights=[embedding_matrix],trainable=True))

# learn the correlations
def split_input(sequence):
     return sequence[:-1], tf.reshape(sequence[1:], (-1,1))
model.add(LSTM(50,return_sequences=False))
model.add(Dense(3,activation="softmax"))

# output model skeleton
model.summary()
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['acc'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 20)            120080    
_________________________________________________________________
lstm (LSTM)                  (None, 50)                14200     
_________________________________________________________________
dense (Dense)                (None, 3)                 153       
Total params: 134,433
Trainable params: 134,433
Non-trainable params: 0
_________________________________________________________________


In [13]:
# split train/test dataset for LSTM model
Y = pd.get_dummies(df2['sentiment_id']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.15, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(6035, 50) (6035, 3)
(1065, 50) (1065, 3)


In [14]:
# model fit
batch_size = 32
history=model.fit(X_train, Y_train, epochs =50, batch_size=batch_size, verbose = 2)

Epoch 1/50
189/189 - 2s - loss: 0.5819 - acc: 0.5508
Epoch 2/50
189/189 - 2s - loss: 0.5211 - acc: 0.6013
Epoch 3/50
189/189 - 2s - loss: 0.4456 - acc: 0.6590
Epoch 4/50
189/189 - 2s - loss: 0.3792 - acc: 0.7312
Epoch 5/50
189/189 - 2s - loss: 0.3068 - acc: 0.7978
Epoch 6/50
189/189 - 2s - loss: 0.2445 - acc: 0.8469
Epoch 7/50
189/189 - 2s - loss: 0.2026 - acc: 0.8766
Epoch 8/50
189/189 - 2s - loss: 0.1787 - acc: 0.8915
Epoch 9/50
189/189 - 2s - loss: 0.1574 - acc: 0.9080
Epoch 10/50
189/189 - 2s - loss: 0.1423 - acc: 0.9158
Epoch 11/50
189/189 - 2s - loss: 0.1266 - acc: 0.9279
Epoch 12/50
189/189 - 2s - loss: 0.1195 - acc: 0.9304
Epoch 13/50
189/189 - 2s - loss: 0.1076 - acc: 0.9364
Epoch 14/50
189/189 - 2s - loss: 0.1018 - acc: 0.9408
Epoch 15/50
189/189 - 2s - loss: 0.0951 - acc: 0.9422
Epoch 16/50
189/189 - 2s - loss: 0.0857 - acc: 0.9501
Epoch 17/50
189/189 - 2s - loss: 0.0855 - acc: 0.9490
Epoch 18/50
189/189 - 2s - loss: 0.0785 - acc: 0.9553
Epoch 19/50
189/189 - 2s - loss: 0.07

In [15]:
# evaluate the model
_, train_acc = model.evaluate(X_train, Y_train, verbose=2)
_, test_acc = model.evaluate(X_test, Y_test, verbose=2)
print('Train: %.3f, Test: %.4f' % (train_acc, test_acc))

189/189 - 1s - loss: 0.0223 - acc: 0.9881
34/34 - 0s - loss: 2.5057 - acc: 0.5296
Train: 0.988, Test: 0.5296


In [16]:
# validation
validation_size = 500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)

print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 2.33
acc: 0.55


In [17]:
model.save('Mymodel.h5')

In [18]:
# get query emotion
def get_label(x):
    seq = tokenizer.texts_to_sequences([x])
    padded = pad_sequences(seq, maxlen=50, dtype='int32', value=0)
    pred = model.predict(padded)
    labels = ['0','1','2']
    return labels[np.argmax(pred)]

In [19]:
# checking
query['new']=  query['sentence'].apply(lambda x: get_label(x))
query

Unnamed: 0,emotion,sentence,new
25,anger,You had no right to tell me you ever had feelings for me.,0
32,joy,"Oh, it's so romantic to send people off on their honeymoon.",1
66,neutral,This witness won't return my calls so we're gonna see if we can surprise him coming home.,2
91,joy,"Okay, okay, come on, you can do it. You can do it!",1
127,anger,My fault?! You threatened the judge!,0
149,neutral,Everyone knows who you are.,2
152,sadness,"I mean, well, 'cause when I was growing up, you know my dad left, and my mother died, and my stepfather went to jail, so I barely had enough pieces of parents to make one whole one.",0
155,joy,"It's just, it's just the luckiest baby in the whole world.",1
157,neutral,"Okay, it's a typical New York City apartment. Two girls are just hanging out.",2
160,neutral,It's my new perfume. Why don't you come closer where you can,2


In [20]:
#### if using the function. e.g+
message='''it's just, it's just the luckiest baby in the whole world.'''
emotion_category=get_label(message)

In [21]:
emotion_category

'1'

In [22]:
emotion_category

'1'