In [None]:
# Switch the kernel's working directory so that the relative path used later in
# this notebook ('preprocessing/languages.json') resolves against it.
# NOTE(review): presumably this is also what makes `from utils import ensure_dir`
# importable — confirm.
%cd "/gscratch/xlab/alisaliu/hack-tokenizers"

In [None]:
import pandas as pd
import seaborn as sns
import json
import os
from pathlib import Path
from tqdm import tqdm
from utils import ensure_dir
from collections import defaultdict
import random
import numpy as np

# Wikipedia

In [None]:
# Wikipedia working directory: 'wiki.jsonl' is read from here and the shuffled
# English shards ('<i>.txt') are written back here.
data_dir = Path('/gscratch/scrubbed/alisaliu/redpajama/wikipedia')

In [None]:
# Load the Wikipedia dump (one JSON record per line).
df = pd.read_json(data_dir / 'wiki.jsonl', lines=True)

# Lift the 'language' entry of every record's `meta` dict into its own column.
languages = [record_meta['language'] for record_meta in tqdm(df['meta'])]
df['language'] = languages

df

In [None]:
# Split Wikipedia into different languages for the shift experiments: one
# '<language>/wiki.txt' per language, documents separated by a blank line.
wiki_by_lang_dir = Path('/gscratch/xlab/alisaliu/redpajama/wikipedia')
for language, sub_df in df.groupby('language'):
    lang_dir = wiki_by_lang_dir / language
    ensure_dir(lang_dir)  # presumably creates the directory if missing — helper lives in utils
    # Explicit UTF-8: the text spans many languages, so the bytes written must
    # not depend on the platform's default encoding. The handle is an output
    # file, hence `fo` rather than `fin`.
    with open(lang_dir / 'wiki.txt', 'w', encoding='utf-8') as fo:
        fo.write('\n\n'.join(sub_df['text']))

In [None]:
# Split the English Wikipedia data into smaller documents: shuffle the rows and
# write them as N_WIKI_SHARDS text files ('<i>.txt' in data_dir), with
# documents separated by a blank line.
N_WIKI_SHARDS = 30
WIKI_SHUFFLE_SEED = 0  # fixed so that a rerun reproduces the same shards

english_df = df.loc[df['language'] == 'en'].sample(frac=1, random_state=WIKI_SHUFFLE_SEED)

# Split row *positions* instead of the DataFrame itself: np.array_split on a
# DataFrame is routed through DataFrame.swapaxes, which pandas has deprecated.
# The resulting shard sizes are the same as array_split would give.
shard_positions = np.array_split(np.arange(len(english_df)), N_WIKI_SHARDS)
sub_dfs = [english_df.iloc[positions] for positions in shard_positions]

for i, sub_df in tqdm(enumerate(sub_dfs), total=len(sub_dfs)):
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(data_dir / f'{i}.txt', 'w', encoding='utf-8') as fo:
        fo.write('\n\n'.join(sub_df['text']))

# ArXiv

In [None]:
# arXiv working directory: raw JSONL files are read from its 'arxiv'
# subdirectory and the converted '<identifier>.txt' files are written here.
data_dir = Path('/gscratch/scrubbed/alisaliu/redpajama/arxiv')

In [None]:
# Convert every raw arXiv JSONL file under data_dir/'arxiv' into a plain-text
# file in data_dir, with documents separated by a blank line.
raw_arxiv_dir = data_dir / 'arxiv'
for f in tqdm(os.listdir(raw_arxiv_dir)):
    if os.path.isdir(raw_arxiv_dir / f):
        continue  # subdirectories are skipped; only files are converted
    # identifier = second '_'-separated field of the file name, minus its
    # last extension
    identifier = f.split('_')[1].rsplit('.', 1)[0]
    sub_df = pd.read_json(raw_arxiv_dir / f, lines=True)
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(data_dir / f'{identifier}.txt', 'w', encoding='utf-8') as fo:
        fo.write('\n\n'.join(sub_df['text']))

# Common Crawl

In [None]:
# Common Crawl working directory: raw dumps are read from its 'common_crawl'
# subdirectory; the sampled URL list and the converted text files are written
# here. The full URL list ('urls.txt') is read from the parent directory.
data_dir = Path('/gscratch/scrubbed/alisaliu/redpajama/web')

In [None]:
# There is far more data than we need, so the downsampling is done on the URL
# list itself. Read every URL line, then keep the Common Crawl ones.
with open(data_dir / '../urls.txt') as fin:
    urls = fin.readlines()

cc_urls = [line for line in urls if 'common_crawl' in line]

In [None]:
# Bucket the Common Crawl URLs by dump: the dump name is the text between
# 'common_crawl/' and the first following '/en'.
dumps = defaultdict(list)
for cc_url in cc_urls:
    after_prefix = cc_url.split('common_crawl/')[1]
    dump_name = after_prefix.split('/en')[0]
    dumps[dump_name].append(cc_url)

In [None]:
# Downsample: keep a random 1/CC_KEEP_ONE_IN of the URLs from every dump
# (rounded down, so a dump with fewer than CC_KEEP_ONE_IN URLs contributes none).
CC_KEEP_ONE_IN = 12
CC_SAMPLE_SEED = 0  # fixed so that the sampled URL list is reproducible across reruns

# Private generator: seeding it does not disturb the global `random` state.
rng = random.Random(CC_SAMPLE_SEED)

sampled_urls = []
for dump_urls in dumps.values():
    sampled_urls.extend(rng.sample(dump_urls, len(dump_urls) // CC_KEEP_ONE_IN))

In [None]:
# Persist the sampled URLs. The entries are written back-to-back with no
# separator added, so each one supplies its own line ending.
with open(data_dir / 'cc_urls.txt', 'w') as url_out:
    url_out.write(''.join(sampled_urls))

In [None]:
# Convert every downloaded Common Crawl JSONL file into a plain-text file
# '<dump>-<file stem>.txt' in data_dir, documents separated by a blank line.
cc_root = data_dir / 'common_crawl'
for dump in os.listdir(cc_root):
    dump_dir = cc_root / dump
    for f in tqdm(os.listdir(dump_dir), desc=dump):
        sub_df = pd.read_json(dump_dir / f, lines=True)
        # File stem = everything before the first '.' in the file name.
        identifier = f'{dump}-{f.split(".")[0]}'
        # Explicit UTF-8 so the output does not depend on the platform default.
        with open(data_dir / f'{identifier}.txt', 'w', encoding='utf-8') as fo:
            fo.write('\n\n'.join(sub_df['text']))

# Books

In [None]:
# Books output directory: the book text shards ('<i>.txt') are written here.
data_dir = Path('/gscratch/scrubbed/alisaliu/redpajama/books')

In [None]:
# Load the books dump (one JSON record per line). Note that it is read from a
# fixed path, not from data_dir.
book_jsonl = '/gscratch/xlab/alisaliu/redpajama/book.jsonl'
df = pd.read_json(book_jsonl, lines=True)
df

In [None]:
# Write the books as N_BOOK_SHARDS text files ('<i>.txt' in data_dir), in
# their original row order, with documents separated by a blank line.
N_BOOK_SHARDS = 50

# Split row *positions* instead of the DataFrame itself: np.array_split on a
# DataFrame is routed through DataFrame.swapaxes, which pandas has deprecated.
# The resulting shard sizes are the same as array_split would give.
shard_positions = np.array_split(np.arange(len(df)), N_BOOK_SHARDS)
sub_dfs = [df.iloc[positions] for positions in shard_positions]

# total= lets tqdm show a real progress bar; enumerate() alone has no length.
for i, sub_df in tqdm(enumerate(sub_dfs), total=len(sub_dfs)):
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(data_dir / f'{i}.txt', 'w', encoding='utf-8') as fo:
        fo.write('\n\n'.join(sub_df['text']))

# Code

In [None]:
# GitHub working directory: raw JSONL files are read from its 'github'
# subdirectory and the converted '<random_str>.txt' files are written here.
data_dir = Path('/gscratch/scrubbed/alisaliu/redpajama/github')

In [None]:
# Convert every raw GitHub JSONL file under data_dir/'github' into a plain-text
# file in data_dir, with documents separated by a blank line.
raw_github_dir = data_dir / 'github'
for fname in tqdm(os.listdir(raw_github_dir)):
    if not os.path.isfile(raw_github_dir / fname):
        continue  # only regular files are converted
    # random_str = second '_'-separated field of the file name, up to its
    # first '.'
    random_str = fname.split('_')[1].split('.')[0]
    sub_df = pd.read_json(raw_github_dir / fname, lines=True)
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(data_dir / f'{random_str}.txt', 'w', encoding='utf-8') as fo:
        fo.write('\n\n'.join(sub_df['text']))

### Group by language to create language data

In [None]:
# Build an extension -> language lookup from the language metadata file.
# The file is opened in a `with` block so the handle is closed deterministically.
with open('preprocessing/languages.json', encoding='utf-8') as languages_file:
    languages_json = json.load(languages_file)

extensions_to_language = {}
for language, data in languages_json.items():
    # Entries without an 'extensions' key contribute nothing to the lookup.
    if 'extensions' in data:
        for ext in data['extensions']:
            # The first character of each listed extension is dropped
            # (presumably the leading '.' — confirm against the JSON file).
            # If several languages list the same extension, the one iterated
            # last overwrites the earlier ones.
            extensions_to_language[ext[1:]] = language

In [None]:
# Input: directory holding the raw GitHub JSONL files.
scrubbed_data_dir = Path('/gscratch/scrubbed/alisaliu/redpajama/github/github')
# Output root: one subdirectory per programming language is created under it.
xlab_data_dir = Path('/gscratch/xlab/alisaliu/redpajama/github')

In [None]:
# Regroup the raw GitHub data by programming language: for each input file,
# write one '<Language>/<random_str>.txt' per language found in it, with
# documents separated by a blank line.
for fname in os.listdir(scrubbed_data_dir):
    if not os.path.isfile(scrubbed_data_dir / fname):
        continue  # only regular files are processed
    random_str = fname.split('_')[1].split('.')[0]
    df = pd.read_json(scrubbed_data_dir / fname, lines=True)

    # Lookup key per record: the text after the last '.' of meta['path'], then
    # after the last '/', lower-cased. Keys missing from the map yield None.
    languages = [
        extensions_to_language.get(
            meta['path'].rsplit('.', 1)[-1].rsplit('/', 1)[-1].lower()
        )
        for meta in df.meta
    ]
    df['language'] = languages

    # groupby drops rows whose key is missing (its default dropna=True), so
    # records with an unmapped extension are not written anywhere.
    for language, sub_df in tqdm(df.groupby('language'), desc=random_str):
        language = language.replace(' ', '')  # directory names carry no spaces
        lang_dir = xlab_data_dir / language
        ensure_dir(lang_dir)  # presumably creates the directory if missing — helper lives in utils
        # The output handle gets its own name (`fo`) instead of rebinding the
        # file-name loop variable; explicit UTF-8 keeps the bytes independent
        # of the platform default.
        with open(lang_dir / f'{random_str}.txt', 'w', encoding='utf-8') as fo:
            fo.write('\n\n'.join(sub_df['text']))