This notebook creates the data for the data-scaling experiments. We are interested in answering the question: how much data per author do we need? To find out, we create multiple versions of the training data, each capping the cumulative number of words per author at a different limit: 1,000, 5,000, 10,000, 20,000, and 35,000.

In [2]:
import json
from data_utils import Corpus
from collections import defaultdict

In [3]:
def read_data(path):
    """Load a JSON-lines file into a list of parsed records.

    Args:
        path: Path to a text file holding one JSON document per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path) as f:
        # Iterate the file lazily instead of materialising it with readlines(),
        # and skip whitespace-only lines (e.g. a trailing blank line), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

In [4]:
def group_data(data):
    """Bucket examples by their ``'author'`` field.

    Args:
        data: Iterable of dict-like examples, each with an ``'author'`` key.

    Returns:
        defaultdict(list): Maps each author to that author's examples, kept
        in the order they appear in ``data``.
    """
    grouped = defaultdict(list)
    for record in data:
        author_bucket = grouped[record['author']]
        author_bucket.append(record)
    return grouped

In [5]:
def _scaled_record(example, text):
    """Build the output record for `example`, carrying `text` as its body."""
    return {'author': example['author'],
            'file': example['title'],
            'text': text,
            'domain': example['doc_atts']['domain']}


def scale_data_per_author(examples, word_limit=15000):
    """Keep one author's documents, in order, up to a cumulative word budget.

    Documents are copied whole while they fit. The first document that would
    exceed the budget is cut at a sentence boundary: its leading sentences are
    kept for as long as they fit, and processing stops as soon as a sentence
    no longer fits.

    Args:
        examples: One author's documents; each needs the keys 'author',
            'title', 'text' and 'doc_atts' (with a 'domain' entry).
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        list[dict]: Records with the keys 'author', 'file', 'text', 'domain'.
        A truncated document is only emitted if at least one of its sentences
        fits, so no record ever has an empty 'text' because of truncation.
    """
    scaled_examples = []
    current_word_cnt = 0

    for example in examples:
        # Count the document's words once and reuse the value.
        doc_word_cnt = len(example['text'].split())
        if current_word_cnt + doc_word_cnt <= word_limit:
            scaled_examples.append(_scaled_record(example, example['text']))
            current_word_cnt += doc_word_cnt
            continue

        # The whole document does not fit: keep adding its sentences until
        # the word limit is reached.
        # NOTE(review): assumes Corpus(...).sents yields sentence strings,
        # since each one is split on whitespace below -- confirm in data_utils.
        annotated_example = Corpus(author=example['author'],
                                   title=example['title'],
                                   domain=example['doc_atts']['domain'],
                                   text=example['text'])

        kept_sents = []
        budget_exhausted = False
        for sent in annotated_example.sents:
            sent_word_cnt = len(sent.split())
            if current_word_cnt + sent_word_cnt > word_limit:
                budget_exhausted = True
                break
            kept_sents.append(sent)
            current_word_cnt += sent_word_cnt

        # Emit the partial document only when it has content; previously a
        # record with empty text was written when not even the first sentence
        # fit, and sentences were silently dropped when all of them fit.
        if kept_sents:
            scaled_examples.append(_scaled_record(example, " ".join(kept_sents)))

        if budget_exhausted:
            return scaled_examples

    return scaled_examples


def scale_data(data, limit):
    """Apply the same per-author word budget to every author.

    Args:
        data: Mapping of author -> list of that author's examples.
        limit: Word budget forwarded to ``scale_data_per_author`` as
            ``word_limit``.

    Returns:
        dict: author -> scaled examples, in the iteration order of ``data``.
    """
    return {
        author: scale_data_per_author(author_examples, word_limit=limit)
        for author, author_examples in data.items()
    }

In [6]:
def avg_num_words(data):
    """Return the mean number of whitespace-separated words per author.

    Args:
        data: Mapping of author -> list of documents with a 'text' field.

    Returns:
        float: Total words over all documents divided by the number of authors.

    Raises:
        ZeroDivisionError: If ``data`` has no authors.
    """
    total_words = sum(
        len(doc['text'].split())
        for author_docs in data.values()
        for doc in author_docs
    )
    return total_words / len(data)


def stats(data):
    """Print summary statistics for a mapping of author -> documents.

    Reports the number of authors, the number of documents, the average
    number of documents per author, and the average number of
    whitespace-separated words per document and per author.

    Args:
        data: Mapping of author -> list of documents with a 'text' field.

    Returns:
        None. The five statistics are printed, one per line.
    """
    num_authors = len(data)
    num_docs = sum(len(docs) for docs in data.values())
    total_words = sum(len(doc['text'].split())
                      for docs in data.values()
                      for doc in docs)

    # Guard every ratio so an empty group (no authors, or authors without
    # documents) reports 0.0 instead of raising ZeroDivisionError.
    avg_num_docs = num_docs / num_authors if num_authors else 0.0
    avg_words_per_doc = total_words / num_docs if num_docs else 0.0
    avg_words_per_author = total_words / num_authors if num_authors else 0.0

    print(f'Num Authors: {num_authors}')
    print(f'Num Docs: {num_docs}')
    print(f'Avg Num Docs: {avg_num_docs}')
    print(f'Avg Words Per Doc: {avg_words_per_doc}')
    print(f'Avg Words Per Author: {avg_words_per_author}')

In [10]:
def write(data, path):
    """Write every document in ``data`` to ``path`` as JSON lines.

    Args:
        data: Mapping of author -> list of JSON-serialisable documents.
        path: Destination file; it is opened in 'w' mode, so any existing
            content is replaced.

    Documents are written one per line, author by author, in the mapping's
    iteration order.
    """
    with open(path, mode='w') as out_file:
        for author_docs in data.values():
            for record in author_docs:
                out_file.write(json.dumps(record) + '\n')

In [11]:
# Parse ../data/train.json (one JSON document per line) and bucket the
# resulting examples by their 'author' field.
train = read_data('../data/train.json')
data_by_author = group_data(train)

In [12]:
# One author -> documents mapping per domain; all three start out empty.
blogs, imdb, amazon = {}, {}, {}

In [13]:
# Route every author into the mapping for their domain. The domain is read
# from the author's first document only; authors whose domain is none of
# 'blog', 'retail' or 'movie' are left out of all three mappings.
for author, author_docs in data_by_author.items():
    domain = author_docs[0]['doc_atts']['domain']

    for label, bucket in (('blog', blogs), ('retail', amazon), ('movie', imdb)):
        if domain == label:
            bucket[author] = author_docs
            break


In [14]:
# Print the summary statistics for each domain in turn (blogs, imdb, amazon),
# with an empty line between the three reports.
stats(blogs)
print()
stats(imdb)
print()
stats(amazon)

Num Authors: 140
Num Docs: 19879
Avg Num Docs: 141.99285714285713
Avg Words Per Doc: 214.94426278987876
Avg Words Per Author: 30520.55

Num Authors: 62
Num Docs: 30933
Avg Num Docs: 498.9193548387097
Avg Words Per Doc: 253.443054343258
Avg Words Per Author: 126447.64516129032

Num Authors: 49
Num Docs: 34012
Avg Num Docs: 694.1224489795918
Avg Words Per Doc: 183.11531224273784
Avg Words Per Author: 127104.44897959183


In [15]:
# Group the training examples by author for the scaling step. This repeats
# the group_data(train) call that produced data_by_author in an earlier cell.
train_data_by_author = group_data(train)

In [16]:
# Build one scaled copy of the training data per word budget; `limit` is the
# maximum cumulative number of words kept for each author.
scaled_data_1k = scale_data(train_data_by_author, limit=1000)
scaled_data_5k = scale_data(train_data_by_author, limit=5000)
scaled_data_10k = scale_data(train_data_by_author, limit=10000)
scaled_data_20k = scale_data(train_data_by_author, limit=20000)
scaled_data_35k = scale_data(train_data_by_author, limit=35000)

In [19]:
# Sanity check: average number of words kept per author for each budget,
# displayed as a tuple in the order 1k, 5k, 10k, 20k, 35k.
avg_num_words(scaled_data_1k), avg_num_words(scaled_data_5k), avg_num_words(scaled_data_10k), avg_num_words(scaled_data_20k), avg_num_words(scaled_data_35k)

(987.8286852589641,
 4987.673306772908,
 9962.940239043824,
 18871.211155378485,
 28895.37450199203)

In [20]:
# Save each scaled dataset as JSON lines (one document per line).
# NOTE: write() opens the path with mode='w', which does not create missing
# directories, so every ../data/scaling_exps/<size>/ folder must already exist.
write(scaled_data_1k, path='../data/scaling_exps/1k/train_1k.raw.json')
write(scaled_data_5k, path='../data/scaling_exps/5k/train_5k.raw.json')
write(scaled_data_10k, path='../data/scaling_exps/10k/train_10k.raw.json')
write(scaled_data_20k, path='../data/scaling_exps/20k/train_20k.raw.json')
write(scaled_data_35k, path='../data/scaling_exps/35k/train_35k.raw.json')
