# Exploring Pretraining Datasets

## C4

C4 is a massive dataset used in pre-training T5. In our pre-training setting we only use it as a regularizer so as the model does not forget to understand natural language.

Download link: https://huggingface.co/datasets/allenai/c4/tree/main

In [1]:
import pandas as pd
import json
import random
import gzip
import glob
import numpy as np

from tqdm.notebook import tqdm
from sentence_splitter import split_text_into_sentences
from nltk.tokenize import word_tokenize

### Reading

In [2]:
with open('../storage/datasets/c4/c4_data_original.json') as f:
    c4_original = [json.loads(line)['text'] for line in f]

### Processing

* Removing newlines
* Sentence splitting
* Shuffling

In [3]:
# Remove newlines
c4 = [text.replace('\n', ' ') for text in c4_original]

# Sentence split
c4_sentences = []
for text in tqdm(c4):
    sents = split_text_into_sentences(text, language='en')
    c4_sentences.extend(sents)

random.shuffle(c4_sentences)

del c4

  0%|          | 0/356317 [00:00<?, ?it/s]

## WDC


WebTableCorpus is a collection of tables crawled from the web.

Download link: http://webdatacommons.org/webtables/2015/downloadInstructions.html

## Reading

In [2]:
table_paths = glob.glob('../storage/datasets/wdc/original/1438042981460.12/warc/*')
original_tables = []

def get_tables():
    for table_path in table_paths[:500]:
        with gzip.open(table_path, 'r') as f:
            for line in f:
                try:
                    yield json.loads(line)
                except UnicodeDecodeError:
                    continue
# print(len(original_tables))

## Filtering

In [3]:
def is_english(table):
    return any(domain in table['url'] for domain in ['.com', '.eu', '.uk', '.net', '.org'])

def has_header(table):
    return table['hasHeader']

def is_not_empty(table):
    return len(table['relation']) >= 2

def is_not_huge(table):
    return len(table['relation']) < 50 and len(table['relation'][0]) < 15

def has_title_or_page_title(table):
    return table['title'] != '' or table['pageTitle'] != ''

def title_is_not_huge(table):
    if table['title'] != '' and len(table['title'].split()) < 5:
        return True
    elif table['pageTitle'] != '' and len(table['pageTitle'].split()) < 5:
        return True
    else:
        return False
    

filtered_tables = [table for table in get_tables() 
                  if has_header(table) and 
                  is_not_empty(table) and 
                  is_english(table) and
                  is_not_huge(table) and
                  has_title_or_page_title(table) and
                  title_is_not_huge(table)]

print(len(filtered_tables))

122124


## Analysis of TextBefore and TextAfter

In [None]:
def calculate_token_overlap(text, row):
    text_tokens = word_tokenize(text)
    
    # We want the text to be at least 8 tokens so as it resembles real text
    if len(row) == 0 or len(text_tokens) < 8:
        return 0
    
    token_set = set(text_tokens)
    row_set = set(row)
    
    return len(token_set.intersection(row_set)) / min(len(text_tokens), len(row_set))


def calculate_row_overlaps(pairs):
    return [calculate_token_overlap(sent, row) for sent, row in pairs]


def create_possible_sent_row_pairs(table, text_position="textBeforeTable"):
    sent_row_pairs = []
    for sent in table[text_position]:
        for row in table['relation'][1:]:
            sent_row_pairs.append((sent, row))
    # print(sent_row_pairs)
    return sent_row_pairs
            
            
def calculate_all_overlaps(table, text_position="textBeforeTable"):
    if len(table[text_position]) == 0:
        return [0]
    
    pairs = create_possible_sent_row_pairs(table, text_position)
    overlaps = calculate_row_overlaps(pairs)
    return overlaps

    

def calculate_dataset_overlaps(tables):
    before_table_max = []
    after_table_max = []
    
    for table in tqdm(tables):
        
        before_table_max.append(max(calculate_all_overlaps(table, text_position="textBeforeTable")))
        after_table_max.append(max(calculate_all_overlaps(table, text_position="textAfterTable")))
        # break

    return before_table_max, after_table_max


with open('../storage/datasets/wdc/filtered/1438042981525.10.json', 'r') as inp:
    filtered_tables = json.load(inp)

before_overlaps, after_overlaps = calculate_dataset_overlaps(filtered_tables)
# for table in filtered_tables[:10]:
#     print(table)
#     print("-"*90)
    

  0%|          | 0/105119 [00:00<?, ?it/s]

In [13]:
before_overlaps = np.array(before_overlaps)
after_overlaps = np.array(after_overlaps)

## How many tables have at least one row with more than 50% overlap
thresh = 0.4
print(f"More than {thresh} | TextBefore: {np.sum(before_overlaps > thresh) / len(before_overlaps)}")
print(f"More than {thresh} | TextAfter: {np.sum(after_overlaps > thresh) / len(after_overlaps)}")

## What is the average overlap of the text before and the text after?
print(f"Average overlap | TextBefore: {np.mean(before_overlaps)}")
print(f"Average overlap | TextAfter: {np.mean(after_overlaps)}")

More than 0.4 | TextBefore: 0.006120527306967985
More than 0.4 | TextAfter: 0.05555555555555555
Average overlap | TextBefore: 0.011010223298358891
Average overlap | TextAfter: 0.047087705138552594


In [14]:
before_overlaps = np.array(before_overlaps)
after_overlaps = np.array(after_overlaps)

thresh = 0.4
after_inds = np.where(after_overlaps > thresh)[0]
before_inds = np.where(before_overlaps > thresh)[0]


for i, ind in enumerate(before_inds):
    table = pd.DataFrame(filtered_tables[ind]['relation'][1:], columns=filtered_tables[ind]['relation'][0])
    print(filtered_tables[ind]['textBeforeTable'])
    display(table)
    
    print("-" * 90)
    
    if i > 20:
        break
    

['Found 233 RPM for libxml_plugin.so RPM resource libxml_plugin.so Arch System The System and Arch are optional added filters, for example System could be "redhat", "redhat-7.2", "mandrake" or "gnome", Arch could be "i386" or "src", etc.', 'depending on your system.', 'The search service can find package by either name (apache), provides(webserver), absolute file names (/usr/bin/apache), binaries (gprof) or shared libraries (libXm.so.2) in standard path.', 'It does not support multiple arguments yet... Help Mirrors index by Name index by creation date index by Vendor index by Distribution index by Group Index']


Unnamed: 0,Package,videolan-client-0.8.4-3.2.el4.rf.i386.html,videolan-client-0.8.4a-4.el4.rf.i386.html
0,Summary,"The VideoLAN client, also a very good standalo...","The VideoLAN client, also a very good standalo..."
1,Distribution,DAG packages for Red Hat Linux el4 i386,DAG packages for Red Hat Linux el4 i386
2,Download,videolan-client-0.8.4-3.2.el4.rf.i386.rpm,videolan-client-0.8.4a-4.el4.rf.i386.rpm


------------------------------------------------------------------------------------------
['Botswana good excellent excellent excellent excellent excellent excellent good good good good good Matobo poor good excellent excellent excellent excellent good fair fair']


Unnamed: 0,Moremi,Okavango Delta,Linyanti,Savute,Chobe,Nxai Pan,Makgadikgadi
0,good,good,good,excellent,fair,excellent,excellent
1,good,good,good,excellent,fair,excellent,excellent
2,good,good,good,excellent,fair,excellent,excellent
3,good,good,good,excellent,good,excellent,excellent
4,excellent,excellent,excellent,good,good,good,good
5,excellent,excellent,excellent,good,excellent,fair,fair
6,excellent,excellent,excellent,good,excellent,fair,fair
7,excellent,excellent,excellent,good,excellent,poor,poor
8,excellent,excellent,excellent,good,excellent,poor,poor
9,excellent,excellent,excellent,good,excellent,poor,poor


------------------------------------------------------------------------------------------
['Show/hide PIC10F X=Not Working U = Untested v = Tested ½ = Partially Supported Devices The devices below are supported by the embedded software and the usbpicprog PC software It is very possible that a PIC device is supported by the latest snapshot version of usbpicprog (firmware + PC software).', 'In the table below, either the snapshot revision or the release version of usbpicprog is shown for every supported device.', 'The latest firmware can always be obtained from the github repository in binary and source form (.hex file).', 'The software is available in source format.', 'A ready-built snapshot version can be found here, but be careful – the snapshots are not stable and therefore not yet released.', 'If you have tested a device that is not listed, please use the form below Supported devices usbpicprog a free and open source usb pic programmer']


Unnamed: 0,Device,PIC10F200,PIC10F202,PIC10F204,PIC10F206,PIC10F220,PIC10F222
0,Implemented,v,v,v,v,v,v
1,Erase,v,v,v,v,v,v
2,Read code,v,v,v,v,v,v
3,Autodetect,,,,,,
4,Read Data,,,,,,
5,Read Config,v,v,v,v,v,v
6,Write Code,v,v,v,v,v,v
7,Write Data,,,,,,
8,Write Config,v,v,v,v,v,v
9,version,0.4.1,0.4.1,0.4.1,0.4.1,0.4.2,0.4.2


------------------------------------------------------------------------------------------
['Super Wash------------$6.00-----------Undercarrige wash, Double pass presoak, Blue Corel Treatment, step 1: Foaming polish, step 2: Poly Sealant, High pressure wash, Spot free rinse, & Dry.', 'Deluxe Wash-----------$5.00-----------Undercarrige wash, High pressure wash, Pre-soak ,Blue Corel Poly Sealant, Spot free rinse, & Dry.', 'Touch Free Car Wash Self Serve\\Vacuum Touch Free Automatic Wash Hours:7:00 a.m.- 9:00p.m.']


Unnamed: 0,Price,$4.00,$5.00,$6.00
0,Presoak,*,*,
1,Undercarrige Wash,*,*,*
2,High Pressure Wash,,,*
3,Double Pass Presoak,,,*
4,Foaming Polish,,,*
5,Poly Sealant,,*,*
6,Spot Free Rinse,*,*,*
7,Dry,*,*,*


------------------------------------------------------------------------------------------
['Found 3 RPM for libwidget_gtk.so RPM resource libwidget_gtk.so Arch System The System and Arch are optional added filters, for example System could be "redhat", "redhat-7.2", "mandrake" or "gnome", Arch could be "i386" or "src", etc.', 'depending on your system.', 'The search service can find package by either name (apache), provides(webserver), absolute file names (/usr/bin/apache), binaries (gprof) or shared libraries (libXm.so.2) in standard path.', 'It does not support multiple arguments yet... Help Mirrors index by Name index by creation date index by Vendor index by Distribution index by Group Index']


Unnamed: 0,Package,firefox-0.10.1-0.1.0.el2.dag.i386.html,firefox-0.8-3.0.el2.dag.i386.html
0,Summary,Mozilla Firefox web browser,Mozilla Firefox web browser
1,Distribution,DAG Testing packages for Red Hat Linux el2.1 i386,DAG Testing packages for Red Hat Linux el2.1 i386
2,Download,firefox-0.10.1-0.1.0.el2.dag.i386.rpm,firefox-0.8-3.0.el2.dag.i386.rpm


------------------------------------------------------------------------------------------
['matrix inputs:1, matrix outputs:4 Matrix Operator A word on Jitter strings: In Jitter, any 1 plane char matrix may be manipulated as a string.', 'String matrices may have one or two dimensions.', 'If 1-dimensional, the string is considered terminated at the first 0 (like in C).', 'If 2-dimensional, the string may be considered a multi-line string, with the first 0 encountered on each row treated as a carriage return.', 'The jit.textfile and jit.str.op objects will accept and output 2D matrices.', 'jit.str.fromsymbol only outputs 1D matrices and jit.str.tosymbol only accepts 1D matrices (or the first row of a 2D matrix).', 'jit.str.regexp gets its regular expression know-how from the PCRE library package, which is open source software, written by Philip Hazel, and copyright by the University of Cambridge, England.', 'The jit.str.regexp object performs regular expression analysis to Jitter matric

Unnamed: 0,Name,out,out2,out3,out4
0,IOProc,,,,
1,Planelink,1,1,1,1
2,Typelink,1,1,1,1
3,Dimlink,1,1,1,1
4,Plane,1,1,1,1
5,Dim,1,1,1,1
6,Type,char,char,char,char


------------------------------------------------------------------------------------------
['See all My Contest Entries entered Sunday, July 21, 2002 entered Tuesday, July 23, 2002 My Contest Stats Latest Entries... I have 2 entries in the photo contest 2 point member Ronnie Cardwell Member since: Wednesday, October 6, 2004 Ronnie Cardwell Profile']


Unnamed: 0,Seas I have dived,unspecified,totals
0,Number of Entries,2,2
1,Total votes,9,9
2,Average votes,4,4


------------------------------------------------------------------------------------------
['See all My Contest Entries entered Sunday, July 21, 2002 entered Tuesday, July 23, 2002 My Contest Stats Latest Entries... I have 2 entries in the photo contest 2 point member Ronnie Cardwell Member since: Wednesday, October 6, 2004 Ronnie Cardwell Profile']


Unnamed: 0,Countries I have visited,unspecified,totals
0,Number of Entries,2,2
1,Total votes,9,9
2,Average votes,4,4


------------------------------------------------------------------------------------------
['table of potential recruits go for quality (1.0) Up to 5 units are to be recruited AI has 120 gold Example input a list of units to be recruited.', "Output (integer) maximum quantity (integer) current quantity (double) how good each unit is (integer) cost to recruit (string) recruit name table of potential recruits (double) quantity vs quality - 0 means 'go for mass of low-quality units', 1 means 'go for few high-quality units), and other values are in between (integer) maximum number of units to be recruited (integer) gold Input Wesnoth AI has to select what units to recruit.", 'AI leader stands on keep and can recruit a number of units, up to the number of free hexes in his castle.', 'AI can choose between a number of different unit types.', 'Each unit type costs a certain amount of gold to recruit and the AI knows how good a specific unit is, and should try to recruit better unit types.', 'A

Unnamed: 0,Recruit name,Spearman,Royal Guard
0,Cost to recruit,13gp,30gp
1,How good this unit is,12.6,40.0
2,Current Qty,1,0
3,Maximum Qty,10,2


------------------------------------------------------------------------------------------
['Found 3 RPM for libguac.so.0 RPM resource libguac.so.0 Arch System The System and Arch are optional added filters, for example System could be "redhat", "redhat-7.2", "mandrake" or "gnome", Arch could be "i386" or "src", etc.', 'depending on your system.', 'The search service can find package by either name (apache), provides(webserver), absolute file names (/usr/bin/apache), binaries (gprof) or shared libraries (libXm.so.2) in standard path.', 'It does not support multiple arguments yet... Help Mirrors index by Name index by creation date index by Vendor index by Distribution index by Group Index']


Unnamed: 0,Package,libguac0-0.3.0-7.1.i386.html,libguac0-0.3.0-7.1.i386.html.1,libguac0-0.3.0-7.1.i386.html.2
0,Summary,The core guacamole library,The core guacamole library,The core guacamole library
1,Distribution,SourceForge,SourceForge,SourceForge
2,Download,libguac0-0.3.0-7.1.i386.rpm,libguac0-0.3.0-7.1.i386.rpm,libguac0-0.3.0-7.1.i386.rpm


------------------------------------------------------------------------------------------
['UPDATE 6.2005: The Bundeswehr has given the MG43 the new designation MG4.', 'These are some of the only photographs emerging in reference to the new HK MG43 belt fed 5.56mm x 45 machine gun.', 'Heckler & Koch unveiled the new machine gun at the Defense Systems and Equipment International Convention in London, September 11-14, 2001.', 'Photos two, three and four are from the June, 2002 edition of Small Arms Review, reprinted here courtesy of Dan Shea, author and photographer of their article on the MG43.', 'The new gun is reportedly still in the early testing stages and is gas operated with a rotary bolt.', 'Also among the features are safe/full auto only, reminiscent of the M249 SAW and other adopted designs that do not feature select fire capability.', 'The gun has quick change barrels and a "very strong" belt feeding system.', 'The use of gas operation and rotary bolt seems to indicate that t

Unnamed: 0,Caliber,5.56 x 45,5.56 x45
0,Cyclic Rate,750,750
1,Mag Capacity,Belt,Belt
2,Modes of Fire,Safe/Full,Safe/Full
3,Width (in/mm),3.54,90
4,Height (in/mm),9.84,250
5,Weight (lb/kg),18.85,8.55
6,bbl. length (in/mm),18.9,480
7,Overall Length (in.mm),41.34,1050


------------------------------------------------------------------------------------------
['Found 3 RPM for techhounds_frc http://www.techhounds.com/ Found 1 sites for techhounds_frc The techhounds-frc package includes the NetBeans IDE, as well as the five plugins necessary to get it to deploy code onto an FIRST Robotics Competion (FRC) robot.', 'It also includes an frc-ping script, which can be used to test network functionality on an FRC robotics network.', 'RPM resource techhounds_frc Arch System The System and Arch are optional added filters, for example System could be "redhat", "redhat-7.2", "mandrake" or "gnome", Arch could be "i386" or "src", etc.', 'depending on your system.', 'The search service can find package by either name (apache), provides(webserver), absolute file names (/usr/bin/apache), binaries (gprof) or shared libraries (libXm.so.2) in standard path.', 'It does not support multiple arguments yet... Help Mirrors index by Name index by creation date index by Vendor

Unnamed: 0,Package,techhounds_frc-20130207-429.noarch.html,techhounds_frc-20130106-2.noarch.html,techhounds_frc-20130106-2.noarch.html.1
0,Summary,Software used to work on FRC robotics.,Software used to work on FRC robotics.,Software used to work on FRC robotics.
1,Distribution,SourceForge,SourceForge,SourceForge
2,Download,techhounds_frc-20130207-429.noarch.rpm,techhounds_frc-20130106-2.noarch.rpm,techhounds_frc-20130106-2.noarch.rpm


------------------------------------------------------------------------------------------
['Arkansas Sarah Angeline Burnham Callaway County, Missouri Roda Burnham Callaway County, Missouri Death Certificates Maine Death Records New Hampshire Death Records Kentucky Death Records Illinois Death Records Link to Roots Web message board Death of Adelaide Burnham Of Missouri Death records Search billions of records on Ancestry.com']


Unnamed: 0,Name,"Burnham, Lydia Reed","Burnham, Rufus"
0,Death Date,"Jan. 19, 1926","July 29, 1936"
1,County,Bradford-O'Keefe Funeral Home,Bradford-O'Keefe Funeral Home


------------------------------------------------------------------------------------------


### Check specific cases

In [31]:
# How many tables contain the word Player in first column
player_tables = 0

for table in filtered_tables:
    if table['relation'][0][0] == 'Cart Icon':
        player_tables += 1
        
player_tables

1238

## Analysis of Title and PageTitle

In [74]:
has_title = 0
has_page_title = 0
has_both = 0
has_none = 0

for table in filtered_tables:
    if table['title'] != '':
        has_title += 1
    if table['pageTitle'] != '':
        has_page_title += 1
    if table['title'] != '' and table['pageTitle'] != '':
        has_both += 1
    if table['title'] == '' and table['pageTitle'] == '':
        has_none += 1
        
print(f"Has title: {has_title / len(filtered_tables)}")
print(f"Has page title: {has_page_title / len(filtered_tables)}")
print(f"Has both: {has_both / len(filtered_tables)}")
print(f"Has none: {has_none / len(filtered_tables)}")

Has title: 0.22697422292096558
Has page title: 0.9997707248370509
Has both: 0.22674494775801643
Has none: 0.0


In [75]:
# Explore the difference between title and pageTitle
counter = 0
for table in filtered_tables:
    if table['title'] != '':
        print(table)
        counter += 1
        print("-" * 120)
    if counter > 5:
        break
        
print("!" * 240)
        
counter = 0
for table in filtered_tables:
    if table['pageTitle'] != '':
        print(table)
        counter += 1
        print("-" * 120)
    if counter > 5:
        break

{'relation': [['SEASON', '2008', '2009', 'TOTAL'], ['GP', '16', '6', '23'], ['G', '0', '0', '0'], ['A', '0', '0', '0'], ['PTS', '0', '0', '0'], ['SHOTS', '1', '0', '1'], ['SHOT %', '.000', '.000', '.000'], ['SOG', '1', '0', '1'], ['SOG%', '1.000', '.000', '1.000'], ['GW', '0', '0', '0'], ['PK-ATT', '0-0', '0-0', '0-0']], 'pageTitle': 'Brown', 'title': 'Career Statistics', 'url': 'http://brownbears.com/sports/m-soccer/2010-11/bios/smith_ian00.html', 'hasHeader': True, 'headerPosition': 'FIRST_ROW', 'tableType': 'RELATION', 'tableNum': 1, 's3Link': 'common-crawl/crawl-data/CC-MAIN-2015-32/segments/1438042981460.12/warc/CC-MAIN-20150728002301-00160-ip-10-236-191-2.ec2.internal.warc.gz', 'recordEndOffset': 37217939, 'recordOffset': 37209737, 'tableOrientation': 'HORIZONTAL', 'TableContextTimeStampAfterTable': '{26037=Brown University Athletics | 235 Hope St. Box 1932 | Providence, RI 02912, 21585=Before Brown: Transferred from Western Kentucky University where he started as a freshman ... 