# Data pipeline

Prototyping some code for my data processing pipeline

In [1]:
import pandas as pd
import numpy as np

# Step 1: Process comments

# We'll operate on a single csv file then extend to all of them.
filename = "label-app/data/filtered/filtered_comments_set_533.csv"
df = pd.read_csv(filename, index_col=0)
df.sort_values(by='clikes', ascending=False).head()

Unnamed: 0,cid,text,time,author,clikes,cdislikes,video_id,desc,category,lang
173,UgxLdrxL9M7QS9hcG-B4AaABAg,I watch these like I’m actually going to even ...,8 months ago,Umar Ahmad,1706,0,1i_hs2-VYOU,-,-,en
122,UgzXbPauCtgqh70dTIZ4AaABAg,"Food is so beautiful to me, no matter what rel...",8 months ago (edited),01emercado01,927,0,1i_hs2-VYOU,-,-,en
254,UgwWQziec5aPXNjheip4AaABAg,I I just love pasta in general. It's so soft y...,8 months ago,01emercado01,575,0,1i_hs2-VYOU,-,-,en
151,UgzZEtAOKC6cbiNENO14AaABAg,The kind of quality content I subscribed to Ta...,8 months ago,Jacelyn,508,0,1i_hs2-VYOU,-,-,en
276,UgwOmH8No4ofddRYigx4AaABAg,I love these types of videos!😂,8 months ago,liza galstyan,357,0,1i_hs2-VYOU,-,-,en


In [2]:
from keras.models import load_model
from deepmoji import attlayer

modell = load_model('../DeepMoji/mood_model_3_deepmoji.h5', 
                  custom_objects={'AttentionWeightedAverage': attlayer.AttentionWeightedAverage})

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [3]:
from __future__ import print_function, division
import examples.example_helper
import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
from deepmoji.finetuning import calculate_batchsize_maxlen

def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

# tokenize text
try:
    texts = [unicode(x) for x in df['text']]
except UnicodeDecodeError:
    texts = [x.decode('utf-8') for x in df['text']]

batch_size, maxlen = calculate_batchsize_maxlen(texts)

with open(VOCAB_PATH, 'r') as f:
    vocab = json.load(f)
st = SentenceTokenizer(vocab, 30)

tokenized, _, _ = st.tokenize_sentences(texts)

In [75]:
batch_size, maxlen

(250, 30)

In [5]:
prob = modell.predict(tokenized)

In [6]:
scores = []
for i, t in enumerate(texts):
    t_tokens = tokenized[i]
    t_score = [t]
    t_prob = prob[i]
    ind_top = top_elements(t_prob, 4)
    t_score.extend(ind_top)
    t_score.extend([t_prob[ind] for ind in ind_top])
    scores.append(t_score)
    print(t_score)

[u'\U0001f355', 1, 3, 2, 0, 0.35959065, 0.26776937, 0.22418712, 0.14845291]
[u"+Nyan cat I've sp\u0435nt 4 d\U0001d5bays, 13 hours and 36 m\U0001d5f6nut\u0435s On YOuTube s\U0001d5f6nc\u0435 26.6.2018! Ch\U0001d41eck y\U0001d7b8ur st\u0251ts: hujf.viewr.stream", 0, 3, 2, 1, 0.52203053, 0.21475583, 0.15005422, 0.11315944]
[u"\u200b@SmashyPlaysdude legit it's not I've spent 10 days and i can't remember the rest but yeah its not", 0, 3, 2, 1, 0.44934809, 0.29328978, 0.14869337, 0.10866882]
[u"Yo, I can't find a video on tasty of this category that is so cliche.", 0, 3, 2, 1, 0.82411247, 0.17150226, 0.0026078019, 0.0017773458]
[u'after playing Mogeko Castle when i see the word prosciutto all i think about are perverted fluffy yellow creatures', 3, 0, 2, 1, 0.62531501, 0.34937444, 0.016298283, 0.0090123583]
[u'watching this while making spaghetti and making myself impatient', 3, 0, 2, 1, 0.35909867, 0.26440665, 0.19513424, 0.18136045]
[u'Love me some noodles', 1, 2, 3, 0, 0.47340128, 0.3929

In [None]:
with open(OUTPUT_PATH, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', lineterminator='\n')
    writer.writerow(['Text', 'Top5%',
                     'Emoji_1', 'Emoji_2', 'Emoji_3', 'Emoji_4', 'Emoji_5',
                     'Pct_1', 'Pct_2', 'Pct_3', 'Pct_4', 'Pct_5'])
    for i, row in enumerate(scores):
        try:
            writer.writerow(row)
        except Exception:
            print("Exception at row {}!".format(i))


In [33]:
tokenized

array([[   13,   130,  1143, ...,     0,     0,     0],
       [ 2156,  1143,     0, ...,     0,     0,     0],
       [  131,    15, 19276, ...,     0,     0,     0],
       ..., 
       [  141,    13,  4637, ...,     0,     0,     0],
       [19276,  1143,  6468, ...,     0,     0,     0],
       [ 6694,    15,   282, ...,     0,     0,     0]], dtype=uint16)

In [34]:
t_prob

array([ 0.16226242,  0.08958624,  0.27161729,  0.47653398], dtype=float32)

In [35]:
top_elements(t_prob, 4)

array([3, 2, 0, 1])

In [40]:
def top_elements(array, k):
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]

In [43]:
ind = top_elements(t_prob, 4)

In [44]:
t_prob[ind]

array([ 0.47653398,  0.27161729,  0.16226242,  0.08958624], dtype=float32)

In [54]:
scores

[[u'You should cake yourself!!!',
  1,
  3,
  0,
  2,
  0.49445996,
  0.30475858,
  0.15653455,
  0.044246923],
 [u'Bird cake', 3, 1, 2, 0, 0.45423833, 0.25813305, 0.17498097, 0.11264759],
 [u'Make a ruler cake\U0001f4cf\u2764\ufe0f\U0001f370',
  3,
  1,
  2,
  0,
  0.55748606,
  0.23249693,
  0.12079125,
  0.089225776],
 [u'Can you do a suicide squad cake',
  3,
  0,
  1,
  2,
  0.65930659,
  0.19624397,
  0.07696496,
  0.067484505],
 [u'Electric mixer cake !!!!',
  1,
  2,
  3,
  0,
  0.98402333,
  0.0070231976,
  0.0057838699,
  0.0031697252],
 [u'I wonder how much merch Yolanda sells in one day\U0001f914\U0001f914',
  3,
  0,
  1,
  2,
  0.61781484,
  0.23446654,
  0.078345098,
  0.069373578],
 [u'So it is it his bday or smh? Lol',
  3,
  0,
  1,
  2,
  0.61016661,
  0.2542029,
  0.071941935,
  0.063688599],
 [u'I have a question have you ever slammed your fingers in the fridge',
  3,
  0,
  2,
  1,
  0.72912538,
  0.25755563,
  0.0074484558,
  0.0058705551],
 [u'can you make a rai

In [52]:
f = lambda x: top_elements(x, 4)
f2 = f(prob)
f2

array([[[[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [1, 0, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3]]],


       [[[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [1, 0, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3]]],


       [[[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [0, 1, 2, 3],
         [1, 0, 2, 3],
         [1, 0, 2, 3]],

        [[0, 1, 2, 3],
         [0, 1, 2, 3],
         [

In [51]:
g = lambda x: [t_prob[ind] for ind in x]
g(f2)

[array([[[ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398]],
 
        [[ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398]],
 
        [[ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398]],
 
        [[ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.27161729,  0.47653398],
         [ 0.16226242,  0.08958624,  0.27161729,  0.47653398],
         [ 0.08958624,  0.16226242,  0.2716172

In [7]:
dfprob = pd.DataFrame(prob, columns=["annoyed", "joke", "calm", "excited"])
df_final = pd.concat([df.reset_index(), dfprob], axis=1)
df_final.drop('index',1).head()

Unnamed: 0,cid,text,time,author,clikes,cdislikes,video_id,desc,category,lang,annoyed,joke,calm,excited
0,UgzA4ydCdjJIzAqX3t14AaABAg.8lDy87LafJH8lG8GP3YQqZ,🍕,8 months ago,Candy,1,0,1i_hs2-VYOU,-,-,en,0.148453,0.359591,0.224187,0.267769
1,UgxqNtSQw_FdrWODwqJ4AaABAg.8lD_GxlwGjZ8lGFGdWHfC6,"+Nyan cat I've spеnt 4 d𝖺ys, 13 hours and 36 ...",8 months ago,Thomas Tomassen,1,0,1i_hs2-VYOU,-,-,en,0.522031,0.113159,0.150054,0.214756
2,UgwWQziec5aPXNjheip4AaABAg.8lDjabLmDUi8qNe-_0WbSX,​@SmashyPlaysdude legit it's not I've spent 10...,4 months ago,Ainsley Harriott,1,0,1i_hs2-VYOU,-,-,en,0.449348,0.108669,0.148693,0.29329
3,Ugw91vlJkKu8EN95zah4AaABAg,"Yo, I can't find a video on tasty of this cate...",8 months ago,amirhmem. 180,1,0,1i_hs2-VYOU,-,-,en,0.824112,0.001777,0.002608,0.171502
4,UgymWA9Y35aUdcHBpgJ4AaABAg,after playing Mogeko Castle when i see the wor...,8 months ago,MissIrene,1,0,1i_hs2-VYOU,-,-,en,0.349374,0.009012,0.016298,0.625315


In [69]:
df.shape

(464, 10)