# Sentiment Classifier - (Unsupervised Learning)

> In this notebook we build an unsupervised sentiment classifier from YouTube comments.

In [1]:
from pathlib import Path
from david.tokenizers import WordTokenizer, YTCommentsDataset

In [3]:
def save_train_test(datasets, out_dir, files=('train.txt', 'test.txt')):
    """Write each dataset to its own UTF-8 text file, one sequence per line.

    Args:
        datasets: Sequence of datasets; ``datasets[i]`` is written to
            ``files[i]``. Each dataset must be iterable and support ``len()``
            (the length is only used for the progress message).
        out_dir: Destination directory (``str`` or ``Path``). It is created,
            including missing parents, when it does not exist yet.
        files: File names matched positionally to ``datasets``. Any indexable
            sequence of names works (tuple or list).

    Raises:
        IndexError: If there are more datasets than file names.
    """
    out_dir = Path(out_dir)
    # `exist_ok=True` replaces the racy "check exists(), then mkdir()" pattern.
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, dataset in enumerate(datasets):
        out_file = out_dir.joinpath(files[i])
        print(f'saving {len(dataset)} samples to file in {out_file}')
        with out_file.open('w', encoding='utf8') as f:
            for sequence in dataset:
                f.write(f'{sequence}\n')

In [4]:
# Request 1000 samples with subset=0.8 (the output below reports 800 train /
# 200 test samples), then persist both splits as text files under yt_dataset/.
train_dataset, test_dataset = YTCommentsDataset.split_train_test(1000, subset=0.8)
save_train_test([train_dataset, test_dataset], out_dir='yt_dataset')

saving 800 samples to file in yt_dataset/train.txt
saving 200 samples to file in yt_dataset/test.txt


In [5]:
# Build the tokenizer from the training split only; its repr reports the vocabulary size.
tokenizer = WordTokenizer(document=train_dataset)
print(tokenizer)

< WordTokenizer(vocab_size=5103) >


## embeddings with keras

> In order to use the sequence models for embeddings in Keras, our `vocab_index` dictionary needs to be indexed by token frequency. In short, sort the vocabulary by token frequency while assigning an index `i` to each token (the first index must start at `i = 1`).

In [8]:
# Rank tokens by descending frequency and give them ids starting at 1
# (0 is the value the padding step fills with, so no token uses it).
# The (token, count) pairs are unpacked inside the generator so the top-level
# name `_` -- which IPython uses for the previous cell output -- is not
# rebound, and an empty vocabulary yields an empty mapping instead of raising.
vocab_tokens = tuple(token for token, _count in tokenizer.vocab_count.most_common())
vocab_embeddings = {token: index for index, token in enumerate(vocab_tokens, start=1)}

# our vocabulary is now in the correct format
list(vocab_embeddings.items())[:5]

[('.', 1), ('the', 2), (',', 3), ('i', 4), ('to', 5)]

In [9]:
# this is the original vocab order (now let's update the tokenizer)
list(tokenizer.vocab_index.items())[:5]

[('this', 1), ('is', 2), ('very', 3), ('good', 4), ('way', 5)]

In [10]:
# update the tokenizer's vocabulary with the frequency-ranked ids.
tokenizer.vocab_index = vocab_embeddings

# let's test a string by converting it to ids:
indexed_string = tokenizer.convert_string_to_ids('hello, world!')
print(indexed_string)

[469, 3, 124, 18]


In [11]:
# now let's decode the ids back to a string. (It worked!)
print(tokenizer.convert_ids_to_string(indexed_string))

hello, world!


In [25]:
def doc_to_sequences(document):
    """Convert every string in ``document`` into a list of token ids.

    Uses the notebook-level ``tokenizer``. Strings for which
    ``tokenizer.tokenize`` returns ``None`` are skipped, so the result may
    contain fewer entries than ``document``.
    """
    sequences = []
    for text in document:
        tokens = tokenizer.tokenize(text)
        if tokens is None:
            continue
        sequences.append(tokenizer._encode(tokens))
    return sequences

# encode the whole training split and peek at the first sequence of ids
doc_sequences = doc_to_sequences(train_dataset)
print(doc_sequences[0])

[17, 9, 83, 66, 132, 5, 1417, 95, 264, 51, 2207, 2208, 246, 1, 842, 2209, 76, 1]


In [26]:
# round-trip check: decode the first encoded sequence back into text
tokenizer.convert_ids_to_string(doc_sequences[0])

'this is very good way to wake up myself from dreaming fairy life. feeling energetic now.'

## GloVe Embeddings

> Now that we have our document transformed, we will use GloVe's pre-trained embeddings and fit them to the vocabulary behind our `doc_sequences`.

In [30]:
from david.models import GloVe
GloVe.vocab_files  # available pre-trained files; the 100d vectors are used below

{'300d': '/home/ego/david_models/glove/glove.6B/glove.6B.300d.txt',
 '50d': '/home/ego/david_models/glove/glove.6B/glove.6B.50d.txt',
 '200d': '/home/ego/david_models/glove/glove.6B/glove.6B.200d.txt',
 '100d': '/home/ego/david_models/glove/glove.6B/glove.6B.100d.txt'}

In [31]:
# Build the embedding matrix for our vocabulary from the 100-dimensional GloVe vectors.
# NOTE(review): vocab ids start at 1, yet the resulting matrix has exactly
# vocab_size rows (see the shape below) -- confirm which row each id maps to.
glove_embeddings = GloVe.fit_embeddings(tokenizer.vocab_index, vocab_dim="100d")

Loading vocab file from /home/ego/david_models/glove/glove.6B/glove.6B.100d.txt
num-dim:(100), vocab-size: 5103 
*** embedding vocabulary...



In [32]:
# our vocabulary is now embedded with GloVe's vectors: (vocab_size, dimensions).
glove_embeddings.shape

(5103, 100)

In [44]:
def largest_string_sequence(document: list, tokenizer: callable) -> int:
    """Return the token count of the longest sequence in a document.

    Args:
        document: Iterable of strings.
        tokenizer: Callable mapping a string to a sized collection of tokens.
            It can be as simple as:
            >>> def tokenizer(string): return string.split()

    Returns:
        The largest number of tokens produced for any string in ``document``,
        or ``0`` when ``document`` is empty.
    """
    # Every string is tokenized exactly once; `default` covers an empty document.
    return max((len(tokenizer(sequence)) for sequence in document), default=0)

# longest training comment, measured in tokens; reused below as the padded input length
largest_string = largest_string_sequence(train_dataset, tokenizer.tokenize)
print(largest_string)

362


In [48]:
from keras.models import Sequential
# `Embedding` is imported from the public `keras.layers` namespace: the
# `keras.layers.embeddings` submodule path is not available in newer Keras
# releases, while `keras.layers.Embedding` works across versions.
from keras.layers import Dense, Embedding, Flatten

# creating a single embedding layer with keras Sequential model.
model = Sequential()

vocab_size, dimensions = glove_embeddings.shape
# every input is padded to the length of the longest training sequence
input_length = largest_string
# The GloVe matrix is loaded as fixed (non-trainable) weights, so only the
# Dense layer below is learned.
# NOTE(review): token ids start at 1 and run up to vocab_size, but an
# Embedding with input_dim=vocab_size only has rows 0..vocab_size-1 -- confirm
# how `GloVe.fit_embeddings` aligns rows with ids (the usual Keras recipe
# allocates vocab_size + 1 rows and keeps row 0 for padding).
embedding_layer = Embedding(vocab_size, dimensions,
                            weights=[glove_embeddings],
                            input_length=input_length,
                            trainable=False)
model.add(embedding_layer)
model.add(Flatten())
# single sigmoid unit: one score in [0, 1] per input sequence
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 362, 100)          510300    
_________________________________________________________________
flatten_2 (Flatten)          (None, 36200)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 36201     
Total params: 546,501
Trainable params: 36,201
Non-trainable params: 510,300
_________________________________________________________________


In [49]:
from keras.preprocessing.sequence import pad_sequences

# Pad every encoded sequence with trailing zeros ("post") up to the length of
# the longest training sequence, so all rows share the model's input length.
padded_doc_sequences = pad_sequences(
    doc_sequences,
    maxlen=input_length,
    padding="post",
)
padded_doc_sequences[0]

array([  17,    9,   83,   66,  132,    5, 1417,   95,  264,   51, 2207,
       2208,  246,    1,  842, 2209,   76,    1,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [58]:
from david.text.prep import get_sentiment_polarity
# Labels were not prepared before getting this far, so they are derived here
# from a polarity score in order to have targets for training the keras model.
# (This is just for testing anyway.)
def get_doc_sentiment_labels(doc_sequences):
    """Derive a binary sentiment label for each encoded sequence.

    Every sequence of token ids is decoded back to text with the
    notebook-level ``tokenizer`` and scored with ``get_sentiment_polarity``.
    The label is 1 when the polarity is strictly greater than 0 and 0
    otherwise (so a polarity of exactly 0 is labelled 0).
    """
    def label_for(token_ids):
        text = tokenizer.convert_ids_to_string(token_ids)
        return 1 if get_sentiment_polarity(text) > 0 else 0

    return [label_for(token_ids) for token_ids in doc_sequences]

In [59]:
original_text0 = train_dataset[0]  # checking the index order from both sets.
# decode the first encoded sequence so it can be compared with the raw comment
embedded_text0 = tokenizer.convert_ids_to_string(doc_sequences[0])
# polarity of the decoded text, i.e. the value the labels are derived from
text0_sentiment = get_sentiment_polarity(embedded_text0)
print(original_text0)
print(embedded_text0)
print('polarity:', text0_sentiment)

This is very Good Way to Wake up myself from dreaming Fairy Life. Feeling Energetic Now.
this is very good way to wake up myself from dreaming fairy life. feeling energetic now.
polarity: 0.705


In [60]:
# one polarity-derived label per encoded training sequence
y_doc_sentiment = get_doc_sentiment_labels(doc_sequences)
y_doc_sentiment[:5]  # 1 for positive polarity (> 0), 0 otherwise.

[1, 1, 0, 0, 0]

In [61]:
# okay, we are good: there is one label per sequence, so we can use these
# sentiment labels to test the keras model!
len(y_doc_sentiment), len(doc_sequences)

(800, 800)

## training the model

In [62]:
# Train on the padded training sequences with the polarity-derived labels.
# NOTE: no validation data is passed, so the log only reflects the training set.
model.fit(padded_doc_sequences, y_doc_sentiment, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x7f16500c7d50>

In [63]:
# of course this is not a proper evaluation: the model is scored on the very
# same sequences and labels it was trained on.
loss, accuracy = model.evaluate(padded_doc_sequences, y_doc_sentiment, verbose=1)
print(f"accuracy: {round(accuracy*100, 2)}%")

accuracy: 100.0%


In [88]:
def pad_string_input(text: str, maxlen=largest_string):
    """Encode one string as a padded batch holding a single id sequence.

    The text is tokenized and mapped to ids with the notebook-level
    ``tokenizer``; the ids are then wrapped in a one-element batch and passed
    to ``pad_sequences`` with ``padding="post"``. The default ``maxlen`` is
    the value ``largest_string`` had when this function was defined.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    return pad_sequences([token_ids], maxlen=maxlen, padding="post")

# encode a sample sentence, then map the padded ids back to tokens as a sanity check
embedd_input = pad_string_input("hello world this is a new text")
string_input = tokenizer.convert_ids_to_tokens(embedd_input.tolist()[0])
print(embedd_input)
print(string_input)

[[ 469  124   17    9    7   98 1078    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [93]:
def predict_sentiment(text, k=0.60):
    """Print the model's sentiment prediction for a single string.

    ``text`` is encoded with ``pad_string_input`` and scored by the
    notebook-level ``model``. A score greater than or equal to ``k`` is
    reported as 'positive', anything lower as 'negative'. The printed
    percentage is the same raw model score in both cases.
    """
    score = model.predict(pad_string_input(text))[0][0]
    label = 'positive' if score >= k else 'negative'
    print("{} -> {}-score : ({})%".format(text, label, round(score * 100, 2)))

In [94]:
# sanity check with a sentence that uses positive wording
predict_sentiment("hello world this is a super happy text!")

hello world this is a super happy text! -> positive-score : (87.99)%


In [95]:
# sanity check with a sentence that uses negative wording
predict_sentiment("I hate you!!, and everything about it")

I hate you!!, and everything about it -> negative-score : (23.84)%


In [96]:
# Compare with the polarity function that produced our training labels.
# NOTE(review): the original claim "We have improved our sentiment classifier!"
# is not backed by a held-out evaluation in this notebook.
get_sentiment_polarity("I hate you!!, and everything about it")

-1.0