# Exploring Pretraining Datasets

## C4

C4 is a massive dataset used in pre-training T5. In our pre-training setting we only use it as a regularizer so as the model does not forget to understand natural language.

Download link: https://huggingface.co/datasets/allenai/c4/tree/main

In [38]:
import pandas as pd
import json
import random
import gzip
import glob
import numpy as np

from tqdm.notebook import tqdm
from sentence_splitter import split_text_into_sentences
from nltk.tokenize import word_tokenize

### Reading

In [2]:
with open('../storage/datasets/c4/c4_data_original.json') as f:
    c4_original = [json.loads(line)['text'] for line in f]

### Processing

* Removing newlines
* Sentence splitting
* Shuffling

In [3]:
# Remove newlines
c4 = [text.replace('\n', ' ') for text in c4_original]

# Sentence split
c4_sentences = []
for text in tqdm(c4):
    sents = split_text_into_sentences(text, language='en')
    c4_sentences.extend(sents)

random.shuffle(c4_sentences)

del c4

  0%|          | 0/356317 [00:00<?, ?it/s]

## WDC


WebTableCorpus is a collection of tables crawled from the web.

Download link: http://webdatacommons.org/webtables/2015/downloadInstructions.html

## Reading

In [69]:
table_paths = glob.glob('../storage/datasets/wdc/original/1438042981460.12/warc/*')
original_tables = []

def get_tables():
    for table_path in table_paths[:500]:
        with gzip.open(table_path, 'r') as f:
            for line in f:
                try:
                    yield json.loads(line)
                except UnicodeDecodeError:
                    continue
# print(len(original_tables))

## Filtering

In [71]:
def is_english(table):
    return any(domain in table['url'] for domain in ['.com', '.eu', '.uk', '.net', '.org'])

def has_header(table):
    return table['hasHeader']

def is_not_empty(table):
    return len(table['relation']) >= 2

def is_not_huge(table):
    return len(table['relation']) < 50 and len(table['relation'][0]) < 15

def has_title_or_page_title(table):
    return table['title'] != '' or table['pageTitle'] != ''

def title_is_not_huge(table):
    if table['title'] != '' and len(table['title'].split()) < 5:
        return True
    elif table['pageTitle'] != '' and len(table['pageTitle'].split()) < 5:
        return True
    else:
        return False
    

filtered_tables = [table for table in get_tables() 
                  if has_header(table) and 
                  is_not_empty(table) and 
                  is_english(table) and
                  is_not_huge(table) and
                  has_title_or_page_title(table) and
                  title_is_not_huge(table)]

print(len(filtered_tables))

122124


## Analysis of TextBefore and TextAfter

In [72]:
def calculate_token_overlap(text, row):
    text_tokens = word_tokenize(text)
    
    if len(row) == 0 or len(text_tokens) == 0:
        return 0
    
    token_set = set(text_tokens)
    row_set = set(row)
    
    return len(token_set.intersection(row_set)) / min(len(text_tokens), len(row_set))


def calculate_row_overlaps(table, text_position="textBeforeTable"):
    return [calculate_token_overlap(table[text_position], row) 
            for row in table['relation'][1:]]  # Skip the header


def calculate_dataset_overlaps(tables):
    before_table_max = []
    after_table_max = []
    
    for table in tqdm(tables):
        before_table_max.append(max(calculate_row_overlaps(table, text_position="textBeforeTable")))
        after_table_max.append(max(calculate_row_overlaps(table, text_position="textAfterTable")))

    return before_table_max, after_table_max

before_overlaps, after_overlaps = calculate_dataset_overlaps(filtered_tables)

  0%|          | 0/122124 [00:00<?, ?it/s]

In [73]:
before_overlaps = np.array(before_overlaps)
after_overlaps = np.array(after_overlaps)

## How many tables have at least one row with more than 50% overlap
thresh = 0.5
print(f"More than {thresh} | TextBefore: {np.sum(before_overlaps > thresh) / len(before_overlaps)}")
print(f"More than {thresh} | TextAfter: {np.sum(after_overlaps > thresh) / len(after_overlaps)}")

## What is the average overlap of the text before and the text after?
print(f"Average overlap | TextBefore: {np.mean(before_overlaps)}")
print(f"Average overlap | TextAfter: {np.mean(after_overlaps)}")

More than 0.5 | TextBefore: 0.05948871638662343
More than 0.5 | TextAfter: 0.22504176083325145
Average overlap | TextBefore: 0.11209920456137416
Average overlap | TextAfter: 0.3239794220614944


## Analysis of Title and PageTitle

In [74]:
has_title = 0
has_page_title = 0
has_both = 0
has_none = 0

for table in filtered_tables:
    if table['title'] != '':
        has_title += 1
    if table['pageTitle'] != '':
        has_page_title += 1
    if table['title'] != '' and table['pageTitle'] != '':
        has_both += 1
    if table['title'] == '' and table['pageTitle'] == '':
        has_none += 1
        
print(f"Has title: {has_title / len(filtered_tables)}")
print(f"Has page title: {has_page_title / len(filtered_tables)}")
print(f"Has both: {has_both / len(filtered_tables)}")
print(f"Has none: {has_none / len(filtered_tables)}")

Has title: 0.22697422292096558
Has page title: 0.9997707248370509
Has both: 0.22674494775801643
Has none: 0.0


In [75]:
# Explore the difference between title and pageTitle
counter = 0
for table in filtered_tables:
    if table['title'] != '':
        print(table)
        counter += 1
        print("-" * 120)
    if counter > 5:
        break
        
print("!" * 240)
        
counter = 0
for table in filtered_tables:
    if table['pageTitle'] != '':
        print(table)
        counter += 1
        print("-" * 120)
    if counter > 5:
        break

{'relation': [['SEASON', '2008', '2009', 'TOTAL'], ['GP', '16', '6', '23'], ['G', '0', '0', '0'], ['A', '0', '0', '0'], ['PTS', '0', '0', '0'], ['SHOTS', '1', '0', '1'], ['SHOT %', '.000', '.000', '.000'], ['SOG', '1', '0', '1'], ['SOG%', '1.000', '.000', '1.000'], ['GW', '0', '0', '0'], ['PK-ATT', '0-0', '0-0', '0-0']], 'pageTitle': 'Brown', 'title': 'Career Statistics', 'url': 'http://brownbears.com/sports/m-soccer/2010-11/bios/smith_ian00.html', 'hasHeader': True, 'headerPosition': 'FIRST_ROW', 'tableType': 'RELATION', 'tableNum': 1, 's3Link': 'common-crawl/crawl-data/CC-MAIN-2015-32/segments/1438042981460.12/warc/CC-MAIN-20150728002301-00160-ip-10-236-191-2.ec2.internal.warc.gz', 'recordEndOffset': 37217939, 'recordOffset': 37209737, 'tableOrientation': 'HORIZONTAL', 'TableContextTimeStampAfterTable': '{26037=Brown University Athletics | 235 Hope St. Box 1932 | Providence, RI 02912, 21585=Before Brown: Transferred from Western Kentucky University where he started as a freshman ... 