In [19]:
from typing import Tuple, List, Sequence

from david.text import normalize_whitespace
from david.text import get_sentiment_polarity
from david.text import remove_punctuation
from david.text import unicode_to_ascii
from david.server import CommentsSql

from wasabi import msg
import spacy

In [5]:
def preprocess_texts(batch, minlen=20):
    batch_size = len(batch)
    msg.warn(f"* Preprocessing batch with {batch_size} samples...")
    comments = []
    for sequence in batch:
        text = normalize_whitespace(unicode_to_ascii(sequence))
        if len(text) > minlen and text not in comments:
            comments.append(text) 
    msg.good(f'* Removed {batch_size-len(comments)} comments'
             f' from original batch size of {len(comments)}')
    return comments

def texts_to_sentences(texts, spacy_model='en_core_web_sm'):
    msg.warn('* Transforming texts to sentences...')
    sentences = []
    nlp = spacy.load(spacy_model)
    for idx, doc in enumerate(nlp.pipe(texts)):
        for sent in doc.sents:
            text = sent.text
            polar = get_sentiment_polarity(remove_punctuation(text))
            sentences.append((text, polar))
    msg.good('* Done! here is some information about what happened.')
    msg.info(f'* Before: {len(texts)} & After: {len(sentences)} 🤖')
    return sentences

def load_train_data(sentences) -> Tuple[List[str], List[str], List[Tuple[str, float]]]:
    msg.warn('Converting sentences as training data...')
    y_test = [] # comments with no sentiment score
    x_train, x_labels = [], []
    for sent, sentiment in sentences:
        if sentiment == .0:
            y_test.append((sent, sentiment))
        else:
            x_labels.append(1 if sentiment > 0 else 0)
            x_train.append(sent)
    return (x_train, x_labels, y_test)
    msg.good(f'* Done! texts: {len(train_texts)}, labels: {len(train_labels)}.')

In [6]:
QUERY = "%make a video%"

# fetch from a unbox-DB w/comments scrapped from all unbox-therapy's channel.
db1 = CommentsSql('unbox')
texts1 = [q.text for q in db1.fetch_comments(QUERY)]

# fetch from v1-DB with random scrapped videos from various categories.
db2 = CommentsSql('v1')
texts2 = [q.text for q in db2.fetch_comments(QUERY)]

# chaining the preprocessing pipeline on combined texts.
train_data, train_labels, test_data = load_train_data(
    texts_to_sentences(preprocess_texts(texts1+texts2)))

[38;5;3m⚠ * Preprocessing batch with 1483 samples...[0m
[38;5;2m✔ * Removed 46 comments from original batch size of 1437[0m
[38;5;3m⚠ * Transforming texts to sentences...[0m
[38;5;2m✔ * Done! here is some information about what happened.[0m
[38;5;4mℹ * Before: 1437 & After: 3141 🤖[0m
[38;5;3m⚠ Converting sentences as training data...[0m


In [7]:
from david.tokenizers import Tokenizer
tokenizer = Tokenizer(document=train_data)
print(tokenizer)

<Tokenizer(vocab_size=2551)>


In [8]:
tokenizer.index_vocab_to_frequency()
tokenizer.bag_of_tokens(5)

[('a', 1), ('video', 2), ('make', 3), ('the', 4), ('.', 5)]

In [9]:
tokenizer.most_common(5)

[('a', 731), ('video', 538), ('make', 518), ('the', 490), ('.', 390)]

In [10]:
string = "hello, world! this a text from yt comments :)"
str2idx = tokenizer.convert_string_to_ids(string)
idx2tok = tokenizer.convert_ids_to_tokens(str2idx)
tok2str = tokenizer.convert_tokens_to_string(idx2tok)

# you can convert from any input to another (all possible states available).
for example in (str2idx, idx2tok, tok2str): print(f"* {example}")

* [373, 7, 335, 23, 22, 1, 1015, 59, 1674, 451, 681]
* ['hello', ',', 'world', '!', 'this', 'a', 'text', 'from', 'yt', 'comments', ':)']
* hello, world! this a text from yt comments :)


In [11]:
# save and loading your vocabulary.
vectors_file = "vecotors.pkl"
tokenizer.save_vectors(vectors_file)

# Reload your vocab without having to pass the dataset again!
# tokeniner = Tokenizer(vectors_file)

In [12]:
from david.models import GloVe
from david.text import largest_string_sequence

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [13]:
glove_embeddings = GloVe.fit_embeddings(tokenizer.vocab_index, vocab_dim="100d")

[38;5;2m✔ Loading vocab file from
/home/ego/david_models/glove/glove.6B/glove.6B.100d.txt[0m
[38;5;2m✔ num-dim:(100), vocab-size: 2552[0m
[38;5;2m✔ *** embedding vocabulary 🤗 ***[0m


In [14]:
vocab_size, dimensions = glove_embeddings.shape
seqmaxlen = largest_string_sequence(train_data, tokenizer.tokenize)
model = Sequential()
embedding_layer = Embedding(vocab_size, dimensions,
                            weights=[glove_embeddings],
                            input_length=seqmaxlen,
                            trainable=False)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 62, 100)           255200    
_________________________________________________________________
flatten_1 (Flatten)          (None, 6200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6201      
Total params: 261,401
Trainable params: 6,201
Non-trainable params: 255,200
_________________________________________________________________


In [17]:
from keras.preprocessing.sequence import pad_sequences

# Here we use the sequences from the tokenizer and we can now train our model
sequences = tokenizer.document_to_sequences(train_data)
padded_sequences = pad_sequences(list(sequences), padding="post")
model.fit(padded_sequences, train_labels, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x7f107b9f6590>

In [222]:
import numpy as np

def nearest_emoji(score):
    """Find the nearest emoji matching a sentiment value."""
    EMOJI_EMOTIONS = {
        99: '😍', 95: '🤗', 90: '😀', 80: '😁', 70: '😊',
        75: '😅', 55: '😑', 50: '😶', 45: '😒', 35: '😬',
        30: '😳', 25: '😤', 20: '😠', 10: '😡', 1: '🤬',
    }
    array = np.asarray(list(EMOJI_EMOTIONS.keys()))
    index = (np.abs(array - score)).argmin()
    emoji_index = array[index]
    if emoji_index:
        return EMOJI_EMOTIONS[emoji_index]
    return '❓'

def pad_input(string: str, maxlen: int) -> List[List[Sequence[int]]]:
    """New inputs need follow the same encoding steps as the dataset."""
    tokens = tokenizer.tokenize(string)
    embedd = tokenizer.convert_tokens_to_ids(tokens)
    return pad_sequences([embedd], maxlen=maxlen, padding="post")

def predict(string: str, k=.5, model=model, maxlen=seqmaxlen) -> str:
    """Print the prediction for new inputs from the trained model."""
    embedd_input = pad_input(string, maxlen)
    embedd_score = model.predict(embedd_input)[0]
    if embedd_score[0] >= k: return (1, round(embedd_score[0]*100, 4))
    else: return (0, round(embedd_score[0]*100, 4))

def print_predict(string: str, k=.6):
    label, score = predict(string, k=k)
    emoji = nearest_emoji(score)
    out = "input: {} : {} ({})%"
    if label == 1:
        out = out.format(string, f'<pos:({emoji})>', score)
    else:
        out = out.format(string, f'<neg:({emoji})>', score)
    print(out)

In [223]:
print_predict("hello there i am so glad this demo worked")

input: hello there i am so glad this demo worked : <pos:(🤗)> (94.6934)%


In [224]:
# the model is sensative to punctuation which makes sense (!!) displays exitment
print_predict("hello there! i am so glad this demo worked!")

input: hello there! i am so glad this demo worked! : <pos:(🤗)> (95.1013)%


> Here we see that our model learned to detect - happy face `:)` and sad face `:(`

- `hate` + `love` + `:)` => `(67.5018)%`

- `love` + `hate` + `:)` => `(66.5365)%` 

- `hate` + `love` + `:(` => `(58.949)%`

- `love` + `hate` + `:(` => `(57.8881)%`

In [230]:
emotion_face = {'pos': ":)", 'neg': ":("}
love_but_hate = "I love this, but hate it {}"
hate_but_love = "I hate this, but love it {}"

print_predict(love_but_hate.format(emotion_face["pos"]))
print_predict(love_but_hate.format(emotion_face["neg"]))
print()
print_predict(hate_but_love.format(emotion_face["pos"]))
print_predict(hate_but_love.format(emotion_face["neg"]))

input: I love this, but hate it :) : <pos:(😊)> (66.5365)%
input: I love this, but hate it :( : <neg:(😑)> (57.8881)%

input: I hate this, but love it :) : <pos:(😊)> (67.5018)%
input: I hate this, but love it :( : <neg:(😑)> (58.949)%


In [209]:
import random
# load the training data to test the model!
y_data, _ = zip(*test_data)

In [228]:
for comment in random.sample(y_data, k=20):
    old_score = get_sentiment_polarity(comment)
    _, new_score = predict(comment)
    label = nearest_emoji(new_score)
    text = normalize_whitespace(comment)
    print('💬 (old={}, new={})\n {} - {}\n'.format(
        old_score, new_score, label, text))

💬 (old=0.0, new=79.2524)
 😁 - I want to buy the A50 it is a same look to A70 can u make a video on the A50

💬 (old=0.0, new=91.3449)
 😀 - Can you please make a video on OLED BURNS?

💬 (old=0.0, new=70.945)
 😊 - Make a video of snapdragon 655 processor

💬 (old=0.0, new=90.9218)
 😀 - I've already had to replace 2 keys

💬 (old=0.0, new=92.7388)
 🤗 - Make a video about hp spectre folio

💬 (old=0.0, new=82.5488)
 😁 - +??

💬 (old=0.0, new=79.2417)
 😁 - And iPhone 11 is
still not out for sale

💬 (old=0.0, new=91.6293)
 😀 - you should make mare videos>>>>

💬 (old=0.0, new=48.7595)
 😶 - Have you seen the 'Tecno phantom 9'

💬 (old=0.0, new=82.5576)
 😁 - He’s only going back to Apple

💬 (old=0.0, new=98.586)
 😍 - I like seeing you guys coming together to make a video.

💬 (old=0.25, new=55.7028)
 😑 - ;)

💬 (old=0.0, new=99.1161)
 😍 - Are you going to make a video about the 38" LG Monitor?

💬 (old=0.0, new=58.3881)
 😑 - please leave a comment here to Unbox Therapy to make a video on this issue.

💬 

## The End - Save the Model and Tokenizer

In [231]:
import os
def save2path(dirname, filename):
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    return os.path.join(dirname, filename)

ROOT_DIR = 'ytc_sentiment'
MODEL_DIR = os.path.join(ROOT_DIR, 'model')
VOCAB_DIR = os.path.join(ROOT_DIR, 'vocab')
MODEL_FILE = save2path(MODEL_DIR, 'model.h5')
VECTORS_FILE = save2path(VOCAB_DIR, 'vectors.pkl')

In [232]:
# save the model's and tokenizer's sources of information!
model.save(MODEL_FILE) 
tokenizer.save_vectors(VECTORS_FILE)
del_existing_model = False
del_existing_tokenizer = False
if del_existing_model:
    del model
if del_existing_tokenizer:
    del tokenizer

In [234]:
from keras.models import load_model
# returns a compiled model identical to the previous one
sentiment_model = load_model(MODEL_FILE)
sentiment_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 62, 100)           255200    
_________________________________________________________________
flatten_1 (Flatten)          (None, 6200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6201      
Total params: 261,401
Trainable params: 6,201
Non-trainable params: 255,200
_________________________________________________________________


In [235]:
# returns tokenizer identical to the previous one
sentiment_tokenizer = Tokenizer(VECTORS_FILE)
print(sentiment_tokenizer)

<Tokenizer(vocab_size=2551)>


In [236]:
sentiment_tokenizer.bag_of_tokens(5)

[('a', 1), ('video', 2), ('make', 3), ('the', 4), ('.', 5)]

In [237]:
!pwd

/home/ego/notebooks/random
