# Steps

- Load a year
- Read data
- Apply reddit-filter (r1m_preprocess)
- split into json files, each of 512 MB max
- Remove the max_conv limiter
- only extract in-domain validation data

In [1]:
import os, glob, re, json, random
from tqdm.notebook import tqdm
from r1m_preprocess import filter_dialogs

In [2]:
def extract_text(conv):
    text = " __eou__ ".join(conv['turns']) + " __eou__"
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    return text

In [3]:
year = "2011"
training_files = glob.glob(f'./data/{year}/dialogues/training/*.txt')

In [4]:
len(training_files)

541

In [5]:
MAX_CONV_PER_DOMAIN = float('inf')
out_dir = './data/reddit_xtreme/' # around 900 train files

In [6]:
train_data = []
for file in tqdm(training_files):
    with open(file) as f:
        for i, l in enumerate(f):
            if i >= MAX_CONV_PER_DOMAIN:
                break
            # conv = extract_text(json.loads(l))
            conv = json.loads(l)["turns"]
            train_data.append(conv)
        print(f"Loaded {i} samples from {file}")

HBox(children=(FloatProgress(value=0.0, max=541.0), HTML(value='')))

Loaded 241 samples from ./data/2011/dialogues/training/rainmeter.txt
Loaded 1467 samples from ./data/2011/dialogues/training/tea.txt
Loaded 461 samples from ./data/2011/dialogues/training/dragonage.txt
Loaded 4293 samples from ./data/2011/dialogues/training/ubuntu.txt
Loaded 4391 samples from ./data/2011/dialogues/training/london.txt
Loaded 751 samples from ./data/2011/dialogues/training/fancyfollicles.txt
Loaded 2239 samples from ./data/2011/dialogues/training/adventuretime.txt
Loaded 2533 samples from ./data/2011/dialogues/training/whatsthisbug.txt
Loaded 3417 samples from ./data/2011/dialogues/training/zombies.txt
Loaded 899 samples from ./data/2011/dialogues/training/alienblue.txt
Loaded 47871 samples from ./data/2011/dialogues/training/leagueoflegends.txt
Loaded 9059 samples from ./data/2011/dialogues/training/okcupid.txt
Loaded 333 samples from ./data/2011/dialogues/training/seinfeld.txt
Loaded 47 samples from ./data/2011/dialogues/training/grilledcheese.txt
Loaded 17359 samples 

In [7]:
train_data[0]

["How would I install a calendar or something of the sort to be able to use?  <selfbr> I downloaded a calendar and I have tried to just put it into themes. Can't find it anywhere when I am trying to config to add it independantly. Any help? ",
 'What calendar did you download?',
 'I Downloaded the corner calendar. I figured out how after another hour of fidgeting with it. I appreciate the comment though. ',
 'Well, if you need to remember, it goes in the Skins folder.']

In [8]:
len(train_data)

2960212

In [9]:
_, train_data = filter_dialogs(train_data)

Filtering dialogs: 100%|██████████| 2960212/2960212 [23:43<00:00, 2079.52it/s]


salvaged 1360186 dialogs out of a total 2960212 samples.
               urls   avg_utt_len  very_long_utt         turns  first_utt_len
count  2.960212e+06  2.960212e+06   2.960212e+06  2.960212e+06   2.960212e+06
mean   6.945921e-01  4.825744e+01   7.353642e-01  4.819741e+00   8.372496e+01
std    2.218434e+00  4.957103e+01   1.026226e+00  1.398186e+00   1.455393e+02
min    0.000000e+00  1.000000e+00   0.000000e+00  4.000000e+00   1.000000e+00
25%    0.000000e+00  1.860000e+01   0.000000e+00  4.000000e+00   1.100000e+01
50%    0.000000e+00  3.390000e+01   0.000000e+00  4.000000e+00   3.700000e+01
75%    1.000000e+00  6.016667e+01   1.000000e+00  5.000000e+00   9.900000e+01
max    3.020000e+02  2.548500e+03   3.300000e+01  1.060000e+02   6.594000e+03


In [10]:
val_files = glob.glob(f'./data/{year}/dialogues/validation_date_in_domain_in/*.txt')

In [11]:
len(val_files)

497

In [12]:
val_data = []
for file in tqdm(val_files):
    with open(file) as f:        
        for j, l in enumerate(f):
            if j >= MAX_CONV_PER_DOMAIN:
                break
            # conv = extract_text(json.loads(l))  
            conv = json.loads(l)["turns"]
            val_data.append(conv)
        print(f"Loaded {j} samples from {file}")

HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))

Loaded 11 samples from ./data/2011/dialogues/validation_date_in_domain_in/rainmeter.txt
Loaded 83 samples from ./data/2011/dialogues/validation_date_in_domain_in/tea.txt
Loaded 31 samples from ./data/2011/dialogues/validation_date_in_domain_in/dragonage.txt
Loaded 221 samples from ./data/2011/dialogues/validation_date_in_domain_in/ubuntu.txt
Loaded 227 samples from ./data/2011/dialogues/validation_date_in_domain_in/london.txt
Loaded 45 samples from ./data/2011/dialogues/validation_date_in_domain_in/fancyfollicles.txt
Loaded 103 samples from ./data/2011/dialogues/validation_date_in_domain_in/adventuretime.txt
Loaded 117 samples from ./data/2011/dialogues/validation_date_in_domain_in/whatsthisbug.txt
Loaded 177 samples from ./data/2011/dialogues/validation_date_in_domain_in/zombies.txt
Loaded 45 samples from ./data/2011/dialogues/validation_date_in_domain_in/alienblue.txt
Loaded 2535 samples from ./data/2011/dialogues/validation_date_in_domain_in/leagueoflegends.txt
Loaded 405 samples fr

In [13]:
val_data[0]

["Just found out about Rainmeter. Here's my Omnimo desktop!",
 "How'd you turn the weather/recycle bin/mail icon transparent?",
 'Hover over the "Right Arrow" icon, click the pallet that pops up below it, and you get a bunch of color choices, the last one being transparent.',
 'Awesome, thank you!']

In [14]:
len(val_data)

150168

In [15]:
_, val_data = filter_dialogs(val_data)

Filtering dialogs: 100%|██████████| 150168/150168 [01:12<00:00, 2080.13it/s]


salvaged 68504 dialogs out of a total 150168 samples.
                urls    avg_utt_len  very_long_utt          turns  \
count  150168.000000  150168.000000  150168.000000  150168.000000   
mean        0.696313      48.432179       0.740784       4.816685   
std         2.076700      49.453850       1.023773       1.372674   
min         0.000000       1.000000       0.000000       4.000000   
25%         0.000000      18.750000       0.000000       4.000000   
50%         0.000000      34.000000       0.000000       4.000000   
75%         1.000000      60.285714       1.000000       5.000000   
max       111.000000    1514.750000      17.000000      49.000000   

       first_utt_len  
count  150168.000000  
mean       84.334958  
std       146.406324  
min         1.000000  
25%        11.000000  
50%        37.000000  
75%       100.000000  
max      5682.000000  


In [16]:
random.seed(42)

In [17]:
random.shuffle(train_data)
random.shuffle(val_data)

In [18]:
test_data = val_data[1000:]
val_data = val_data[:1000]

In [19]:
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 1360186, Val: 1000, Test: 67504


In [20]:
test_data[0]

['"Fairy tales do not tell children the dragons exist. Children already know that dragons exist. Fairy tales tell children the dragons can be killed." - G. K. Chesterton',
 '[relevant awesome piece of art]([URL])\n\n[artist]([URL])',
 "That's awesome. But what the hell is that kid riding? Looks like a cross between a pterodactyl and a cliff racer. ",
 "I don't care what it *is* I just know I want one."]

In [21]:
exp_path = out_dir
try:
    os.makedirs(exp_path)
    print(f"Path {exp_path} created.")
except FileExistsError:
    print(f"Path {exp_path} exists.")
    pass

Path ./data/reddit_xtreme/ exists.


In [22]:
class SplitWriter:
    def __init__(self, base_path_prefix, max_split_mb=512):
        """
        @param base_path_prefix: We will add an incremental id and .json at the end.
        """
        self.current_file = 0
        self.base_path_prefix = base_path_prefix
        self.f = None
        self.max_split_mb = max_split_mb
        
        self._open_next()
    
    def _open_next(self):
        if self.f is not None:
            self.f.close()
        
        next_file = f"{self.base_path_prefix}_{self.current_file:02d}.json"
        print("Opening next file:", next_file)
        self.f = open(next_file, "w")
        self.current_file += 1
        
    def write(self, l):        
        self.f.write(l)
        
        if self.f.tell() > self.max_split_mb*1024*1024:
            print("Reached Max Size")
            self._open_next()
        
    def close(self):
        if self.f is not None:
            self.f.close()

In [23]:
# Train
train_writer = SplitWriter(os.path.join(exp_path, f'train_{year}'), max_split_mb=512)

for l in train_data:
    train_writer.write(json.dumps({"turns": l})+"\n")
    
train_writer.close()

# Valid
valid_writer = SplitWriter(os.path.join(exp_path, f'valid_{year}'), max_split_mb=64)

for l in val_data:
    valid_writer.write(json.dumps({"turns": l})+"\n")

valid_writer.close()

# Test
test_writer = SplitWriter(os.path.join(exp_path, f'test_{year}'), max_split_mb=512)

for l in test_data:
    test_writer.write(json.dumps({"turns": l})+"\n")
    
test_writer.close()

Opening next file: ./data/reddit_xtreme/train_2011_00.json
Reached Max Size
Opening next file: ./data/reddit_xtreme/train_2011_01.json
Opening next file: ./data/reddit_xtreme/valid_2011_00.json
Opening next file: ./data/reddit_xtreme/test_2011_00.json
