In [1]:
import csv
import json
import numpy as np
from bert_embedding import BertEmbedding


# Shared BERT embedder used by every cell below. Built once because construction
# is expensive: on first use it downloads the vocabulary and model archives
# (see this cell's output).
# NOTE(review): 'bert_24_1024_16' presumably yields 1024-dimensional token vectors — confirm.
bert_embedding = BertEmbedding(model='bert_24_1024_16', dataset_name='book_corpus_wiki_en_cased')


def embed_and_save(X, Y, outpath, embedder=None):
    """Embed every sentence in ``X`` and write the vectors plus labels to a CSV.

    Each sentence is represented by the mean of its token vectors. A sentence
    for which the embedder returns no token vectors (e.g. an empty string) is
    written as an all-zero row instead of crashing the export: ``np.mean`` of
    an empty list is a NaN scalar, which cannot be expanded into columns.

    Parameters
    ----------
    X : sequence of str
        Sentences to embed.
    Y : sequence
        One label per sentence; written to the ``emotion`` column.
    outpath : str
        Destination CSV path. Columns are ``d0 .. d{dim-1}`` then ``emotion``.
    embedder : callable, optional
        Called as ``embedder(X)``; must return one item per sentence whose
        element ``[1]`` is the sequence of token vectors. Defaults to the
        notebook-level ``bert_embedding``.

    Raises
    ------
    ValueError
        If no sentence produced any token vectors, so the embedding width
        cannot be determined.
    """
    if embedder is None:
        embedder = bert_embedding

    # None marks sentences without tokens; they are zero-filled below.
    pooled = [np.mean(t[1], axis=0) if len(t[1]) else None for t in embedder(X)]

    template = next((vec for vec in pooled if vec is not None), None)
    if template is None:
        raise ValueError("no sentence produced any token embeddings")
    # zeros_like keeps the dtype of the real vectors so all rows format alike.
    zero_vector = np.zeros_like(template)

    fieldnames = [f"d{i}" for i in range(len(template))] + ['emotion']
    with open(outpath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(fieldnames)
        for vec, label in zip(pooled, Y):
            row = list(zero_vector if vec is None else vec)
            row.append(label)
            writer.writerow(row)

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_cased-2d62af22.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_cased-2d62af22.zip...
Downloading /root/.mxnet/models/bert_24_1024_16_book_corpus_wiki_en_cased-4e685a96.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_24_1024_16_book_corpus_wiki_en_cased-4e685a96.zip...


# CrowdFlower

Read the data and labels

In [2]:
inpath = "../data/CrowdFlower/text_emotion.csv"
header = ["tweet_id", "sentiment", "author", "content"]

# Collect (text, label) pairs in one pass over the CSV, then split them into
# the two parallel arrays used by the cells below.
with open(inpath, 'r', newline='') as f:
    rows = [(row["content"], row["sentiment"]) for row in csv.DictReader(f)]

X = np.array([content for content, _ in rows])
Y = np.array([sentiment for _, sentiment in rows])

In [3]:
# One vector per tweet: the mean over axis 0 of element [1] of each result
# (presumably the per-token vectors — confirm against the bert_embedding docs).
# Slow: the recorded run below took over an hour of wall time.
%time E = np.array([np.mean(t[1], axis=0) for t in bert_embedding(X)])

CPU times: user 4h 1min 47s, sys: 16min 21s, total: 4h 18min 9s
Wall time: 1h 7min 43s


In [4]:
outpath = "../data/CrowdFlower/BERT/embeddings.csv"
# One CSV row per tweet: columns d0..d{dim-1} hold the embedding, 'emotion' the label.
with open(outpath, 'w', newline='') as f:
    fieldnames = [f"d{i}" for i in range(len(E[0]))]
    fieldnames.append('emotion')
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for vector, label in zip(E, Y):
        record = {f"d{i}": value for i, value in enumerate(vector)}
        record['emotion'] = label
        writer.writerow(record)

Embed and save in a single step (alternative to the two cells above)

In [10]:
# %time embed_and_save(X, Y, "../data/CrowdFlower/BERT/embeddings.csv")

CPU times: user 3h 57min 7s, sys: 13min 11s, total: 4h 10min 18s
Wall time: 1h 5min 33s


# EmotionPush

In [2]:
path1 = "../data/EmotionPush/emotionpush.dev.json"
path2 = "../data/EmotionPush/emotionpush.test.json"
path3 = "../data/EmotionPush/emotionpush.train.json"

# Pool the dev, test and train splits into one pair of parallel lists:
# X holds the utterance texts, Y the matching emotion labels.
Y = []
X = []
for path in [path1, path2, path3]:
    # JSON is UTF-8 by specification; do not rely on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each file is a list of documents, each document a list of sentence records.
    for doc in data:
        for sent in doc:
            Y.append(sent["emotion"])
            X.append(sent["utterance"])
assert len(X) == len(Y)

In [3]:
%time E = np.array([np.mean(t[1], axis=0) for t in bert_embedding(X)])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


CPU times: user 1h 28min 35s, sys: 5min 25s, total: 1h 34min 1s
Wall time: 24min 31s


In [6]:
# Inspect the shape of the pooled embeddings.
# NOTE(review): a 1-D result means the per-utterance means did not stack into
# an (n, dim) matrix — worth checking for utterances that produced no tokens.
E.shape

(14742,)

In [17]:
outpath = "../data/EmotionPush/BERT/embeddings.csv"

# Some entries of E may be scalars (NaN) rather than vectors, which happens
# when an utterance had no tokens to average. Take the embedding width from the
# first real vector instead of assuming E[0] is valid or hard-coding the size.
dim = next((len(e) for e in E if np.ndim(e) == 1), None)
if dim is None:
    raise ValueError("E contains no embedding vectors; nothing to save")

with open(outpath, 'w', newline='') as f:
    fieldnames = [f"d{i}" for i in range(dim)] + ['emotion']
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    writer.writeheader()
    for e, l in zip(E, Y):
        # Test explicitly for a missing vector rather than catching TypeError,
        # which would also hide unrelated bugs; such rows are written as zeros.
        values = e if np.ndim(e) == 1 else [0] * dim
        writer.writerow(dict({f"d{i}": ei for i, ei in enumerate(values)}, **{'emotion': l}))

In [None]:
# %time embed_and_save(X, Y, "../data/EmotionPush/BERT/embeddings.csv")

# Friends

In [None]:
path = "../data/Friends/Friends/friends.json"

# JSON is UTF-8 by specification; do not rely on the platform's locale default.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The file is a list of documents, each a list of sentence records.
# X holds the utterance texts, Y the matching emotion labels.
Y = []
X = []
for doc in data:
    for sent in doc:
        Y.append(sent["emotion"])
        X.append(sent["utterance"])
assert len(X) == len(Y)

In [None]:
# One vector per utterance: the mean over axis 0 of element [1] of each result
# (presumably the per-token vectors — confirm against the bert_embedding docs).
%time E = np.array([np.mean(t[1], axis=0) for t in bert_embedding(X)])

In [None]:
# Inspect the shape of the pooled embeddings before writing them out.
E.shape

In [None]:
outpath = "../data/Friends/BERT/embeddings.csv"
# One CSV row per utterance: columns d0..d{dim-1} hold the embedding, 'emotion' the label.
with open(outpath, 'w', newline='') as f:
    fieldnames = [f"d{i}" for i in range(len(E[0]))]
    fieldnames.append('emotion')
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for vector, label in zip(E, Y):
        record = {f"d{i}": value for i, value in enumerate(vector)}
        record['emotion'] = label
        writer.writerow(record)

In [29]:
# %time embed_and_save(X, Y, "../data/Friends/BERT/embeddings.csv")

CPU times: user 1h 26min 40s, sys: 5min 3s, total: 1h 31min 44s
Wall time: 24min 9s
