# C4

In [None]:
__author__ = "Christopher Potts"

## Set-up

In [None]:
import glob
import json
import re
import gzip
from joblib import Parallel, delayed
import tqdm
import os
import pandas as pd
from nltk.tokenize import sent_tokenize
import random

import utils

In [None]:
# Directory containing the C4 files; the training shards are globbed from
# here with the pattern "c4-train*.gz".
C4_HOME = "c4/en"

# Directory holding the per-file candidate JSON output that `c4_reader`
# writes and `get_sample_with_counts` reads back.
OUTPUT_DIRNAME = "c4-pipp-candidates"

## Sentence count

In [None]:
def count_c4_sentences(filename):
    """Count the sentences in one gzipped JSON-lines file.

    Every line of the file is parsed as a JSON object, and the value of
    its 'text' key is split into sentences with `sent_tokenize`.

    Parameters
    ----------
    filename : str
        Path to a gzip-compressed file with one JSON object per line,
        each having a 'text' key.

    Returns
    -------
    int
        Total number of sentences summed over all lines of the file.

    """
    # Decode explicitly as UTF-8 (the encoding of JSON text) instead of
    # relying on the platform's default text encoding.
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        return sum(len(sent_tokenize(json.loads(line)['text'])) for line in f)

def count_c4_sentences_parallel(filenames, n_jobs=10):
    """Sum the sentence counts of many files using parallel workers.

    Parameters
    ----------
    filenames : iterable of str
        Paths passed one at a time to `count_c4_sentences`.
    n_jobs : int
        Number of joblib workers. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    int
        Sum of the per-file sentence counts.

    """
    # The progress bar wraps the input iterable, so it advances as files
    # are handed to the workers.
    pbar = tqdm.tqdm(filenames)
    counts = Parallel(n_jobs=n_jobs)(
        delayed(count_c4_sentences)(f) for f in pbar)
    return sum(counts)

In [None]:
# All gzipped training shards under C4_HOME; this list feeds both the
# sentence count and the candidate extraction.
filenames = glob.glob(os.path.join(C4_HOME, "c4-train*.gz"))

In [None]:
# Total sentence count over all training shards; the IPython `%time` magic
# reports how long the statement takes.
%time train_sent_count = count_c4_sentences_parallel(filenames)

In [None]:
# Display the total number of sentences counted above.
train_sent_count

## Candidates

In [None]:
def sentence_reader(text):
    """Return the sentences of `text` that contain " though " or " as ".

    Parameters
    ----------
    text : str
        Text to be split into sentences with `sent_tokenize`.

    Returns
    -------
    list of str
        The sentences, in their original order, that contain the
        substring " though " or " as " (with the surrounding spaces).

    """
    keepers = []
    for sentence in sent_tokenize(text):
        if " though " in sentence or " as " in sentence:
            keepers.append(sentence)
    return keepers

In [None]:
def c4_reader(filename, output_dirname=OUTPUT_DIRNAME):
    """Extract candidate sentences from one file and save them as JSON.

    Each line of the gzipped input is a JSON object. Lines that contain
    " though " or " as " are parsed, their 'text' is split by
    `sentence_reader`, and every sentence accepted by `utils.is_match`
    is recorded together with the document's 'timestamp' and 'url'.

    Parameters
    ----------
    filename : str
        Path to a gzip-compressed JSON-lines file whose objects have
        'text', 'timestamp' and 'url' keys.
    output_dirname : str
        Directory in which to write the output. It is created if it
        does not exist. The output file is named after the basename of
        `filename`, with ".json.gz" replaced by ".json".

    Returns
    -------
    None
        The result is the JSON file: a list of dicts with the keys
        "timestamp", "url" and "sentence".

    """
    # Function-level imports are kept from the original -- presumably so
    # the names are available inside joblib workers. `json` is imported
    # here as well so that all three modules are handled the same way.
    import gzip
    import json
    import os

    data = []
    # Decode explicitly as UTF-8 (the encoding of JSON text) instead of
    # relying on the platform's default text encoding.
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        for line in f:
            # Cheap substring pre-filter on the raw line before paying
            # for JSON decoding and sentence splitting.
            if " though " not in line and " as " not in line:
                continue
            ex = json.loads(line)
            for sent in sentence_reader(ex['text']):
                if utils.is_match(sent):
                    data.append({
                        "timestamp": ex['timestamp'],
                        "url": ex['url'],
                        "sentence": sent})
    basename = os.path.basename(filename).replace(".json.gz", ".json")
    # Honor the `output_dirname` argument (previously the module-level
    # constant was always used) and make sure the directory exists;
    # `exist_ok=True` keeps this safe when several workers race here.
    os.makedirs(output_dirname, exist_ok=True)
    output_filename = os.path.join(output_dirname, basename)
    with open(output_filename, "wt", encoding="utf-8") as f:
        json.dump(data, f)


def c4_reader_parallel(filenames, n_jobs=10):
    """Run `c4_reader` over many files using parallel workers.

    Parameters
    ----------
    filenames : iterable of str
        Paths passed one at a time to `c4_reader`.
    n_jobs : int
        Number of joblib workers. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    None
        Nothing is collected here; the results are the JSON files
        written by `c4_reader`.

    """
    # The progress bar wraps the input iterable, so it advances as files
    # are handed to the workers.
    pbar = tqdm.tqdm(filenames)
    Parallel(n_jobs=n_jobs)(delayed(c4_reader)(f) for f in pbar)

In [None]:
# Extract candidates from every training shard (timed with IPython's `%time`).
# NOTE(review): `c4_reader_parallel` has no return statement, so `train` is
# always None; the actual results are the JSON files that `c4_reader` writes.
%time train = c4_reader_parallel(filenames)

In [None]:
def get_sample_with_counts(output_dirname=None):
    """Sample one candidate per output file and count all candidates.

    Reads every "c4-train*.json" file in the output directory. Each file
    is expected to hold a JSON list of examples.

    Parameters
    ----------
    output_dirname : str or None
        Directory to read from. If None (the default), the module-level
        `OUTPUT_DIRNAME` is used.

    Returns
    -------
    data : list
        One randomly chosen example from each non-empty file.
    count : int
        Total number of examples across all files.

    """
    if output_dirname is None:
        output_dirname = OUTPUT_DIRNAME
    count = 0
    data = []
    # `glob.glob` takes a single pattern, so the directory and the file
    # pattern have to be joined into one path first.
    filenames = glob.glob(os.path.join(output_dirname, "c4-train*.json"))
    for filename in tqdm.tqdm(filenames):
        with open(filename, encoding="utf-8") as f:
            exs = json.load(f)
        count += len(exs)
        # A file with no candidates contributes nothing to the sample.
        if not exs:
            continue
        # To avoid a memory overload, sample just one example per file.
        # `random.choice` always picks a valid element, whereas
        # `random.randint(0, len(exs))` could return the out-of-range
        # index `len(exs)`.
        data.append(random.choice(exs))
    return data, count

In [None]:
# `samp` holds the per-file sampled candidates; `count` is the total number
# of candidates across all output files.
samp, count = get_sample_with_counts()

In [None]:
# Display the total number of candidates.
count

In [None]:
# Shuffle in place so that the examples are no longer in file order.
random.shuffle(samp)

In [None]:
# Keep at most the first 1000 examples of the shuffled sample.
sample_df = pd.DataFrame(samp[: 1000])

In [None]:
# Save the sample as CSV.
# NOTE(review): this assumes the "samples" directory already exists -- confirm.
sample_df.to_csv("samples/pipp-sample-c4.csv")