# Steps

- Load a year
- Read data
- Apply reddit-filter (r1m_preprocess)
- Split into JSON files, each of 512 MB max
- Remove the max_conv limiter
- Only extract in-domain validation data

In [1]:
import os, glob, re, json, random
from tqdm.notebook import tqdm
from r1m_preprocess import filter_dialogs

In [2]:
def extract_text(conv):
    """Flatten one conversation into a single-line string.

    The turns in ``conv['turns']`` are joined with `` __eou__ `` and a final
    `` __eou__`` is appended, so every turn is followed by the end-of-utterance
    marker. Line-feed and carriage-return characters are then deleted (not
    replaced by a space), which guarantees the result contains no line breaks.

    @param conv: Mapping with a ``'turns'`` entry holding a sequence of strings.
    @return: The flattened conversation text.
    """
    pieces = [" __eou__ ".join(conv['turns']), " __eou__"]
    flat = "".join(pieces)
    for line_break in ("\n", "\r"):
        flat = flat.replace(line_break, "")
    return flat

In [3]:
# Year of data to load; selects the ./data/<year>/ input folder.
year = "2011"
# Sort the matches: glob returns paths in arbitrary, filesystem-dependent order,
# and the seeded random.shuffle applied to the loaded data is only reproducible
# when the data is always loaded in the same order.
training_files = sorted(glob.glob(f'./data/{year}/dialogues/training/*.txt'))

In [4]:
# Sanity check: number of training files matched by the glob pattern.
len(training_files)

541

In [5]:
# Upper bound on the number of conversations (lines) read from each input file.
MAX_CONV_PER_DOMAIN = 500
# Directory that the train/val/test JSON files are written to.
out_dir = './data/reddit_xtreme/' # around 900 train files

In [6]:
# Read at most MAX_CONV_PER_DOMAIN conversations from each training file
# (one JSON object per line) and keep only the value of its "turns" key.
train_data = []
for file in tqdm(training_files):
    # Explicit encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode or reject non-ASCII text in the JSON input.
    with open(file, encoding="utf-8") as f:
        for i, l in enumerate(f):
            if i >= MAX_CONV_PER_DOMAIN:
                break
            conv = json.loads(l)["turns"]
            train_data.append(conv)

HBox(children=(FloatProgress(value=0.0, max=541.0), HTML(value='')))




In [7]:
# Peek at the first loaded conversation (the "turns" value of its JSON record).
train_data[0]

["How would I install a calendar or something of the sort to be able to use?  <selfbr> I downloaded a calendar and I have tried to just put it into themes. Can't find it anywhere when I am trying to config to add it independantly. Any help? ",
 'What calendar did you download?',
 'I Downloaded the corner calendar. I figured out how after another hour of fidgeting with it. I appreciate the comment though. ',
 'Well, if you need to remember, it goes in the Skins folder.']

In [8]:
# Number of training conversations loaded before filtering.
len(train_data)

207028

In [9]:
# Run the r1m_preprocess filter and keep its second return value as the new
# training set (presumably the dialogs that passed -- confirm in r1m_preprocess);
# the first return value is discarded.
# NOTE(review): train_data is rebound here, so re-running this cell filters the
# already-filtered list instead of the raw data.
_, train_data = filter_dialogs(train_data)

Filtering dialogs: 100%|██████████| 207028/207028 [01:46<00:00, 1944.54it/s]


salvaged 88111 dialogs out of a total 207028 samples.
                urls    avg_utt_len  very_long_utt          turns  \
count  207028.000000  207028.000000  207028.000000  207028.000000   
mean        0.854537      51.994496       0.831337       4.800727   
std         2.371895      50.730940       1.103665       1.322925   
min         0.000000       1.000000       0.000000       4.000000   
25%         0.000000      20.750000       0.000000       4.000000   
50%         0.000000      37.285714       0.000000       4.000000   
75%         1.000000      65.333333       1.000000       5.000000   
max       287.000000    1023.750000      24.000000      61.000000   

       first_utt_len  
count  207028.000000  
mean       82.561977  
std       132.036542  
min         1.000000  
25%        11.000000  
50%        40.000000  
75%       102.000000  
max      3531.000000  


In [10]:
# In-domain validation files for the selected year. Sorted because glob returns
# paths in arbitrary, filesystem-dependent order, and the seeded random.shuffle
# applied to the loaded data is only reproducible for a fixed loading order.
val_files = sorted(glob.glob(f'./data/{year}/dialogues/validation_date_in_domain_in/*.txt'))

In [11]:
# Sanity check: number of validation files matched by the glob pattern.
len(val_files)

497

In [12]:
# Read at most MAX_CONV_PER_DOMAIN conversations from each validation file
# (one JSON object per line) and keep only the value of its "turns" key.
val_data = []
for file in tqdm(val_files):
    # Explicit encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode or reject non-ASCII text in the JSON input.
    with open(file, encoding="utf-8") as f:
        for j, l in enumerate(f):
            if j >= MAX_CONV_PER_DOMAIN:
                break
            conv = json.loads(l)["turns"]
            val_data.append(conv)

HBox(children=(FloatProgress(value=0.0, max=497.0), HTML(value='')))




In [13]:
# Peek at the first loaded validation conversation.
val_data[0]

["Just found out about Rainmeter. Here's my Omnimo desktop!",
 "How'd you turn the weather/recycle bin/mail icon transparent?",
 'Hover over the "Right Arrow" icon, click the pallet that pops up below it, and you get a bunch of color choices, the last one being transparent.',
 'Awesome, thank you!']

In [14]:
# Number of validation conversations loaded before filtering.
len(val_data)

74634

In [15]:
# Run the r1m_preprocess filter and keep its second return value as the new
# validation set (presumably the dialogs that passed -- confirm in
# r1m_preprocess); the first return value is discarded.
# NOTE(review): val_data is rebound here, so re-running this cell filters the
# already-filtered list instead of the raw data.
_, val_data = filter_dialogs(val_data)

Filtering dialogs: 100%|██████████| 74634/74634 [00:38<00:00, 1962.29it/s]


salvaged 32514 dialogs out of a total 74634 samples.
               urls   avg_utt_len  very_long_utt         turns  first_utt_len
count  74634.000000  74634.000000   74634.000000  74634.000000   74634.000000
mean       0.778667     51.460641       0.798657      4.814361      86.861618
std        2.097565     52.491119       1.064477      1.364278     159.734347
min        0.000000      1.000000       0.000000      4.000000       1.000000
25%        0.000000     20.250000       0.000000      4.000000      11.000000
50%        0.000000     36.500000       0.000000      4.000000      37.000000
75%        1.000000     63.777778       1.000000      5.000000     103.000000
max       98.000000   1162.800000      17.000000     49.000000    5682.000000


In [16]:
# Seed Python's global RNG so the shuffles that follow are repeatable when the
# notebook is run top to bottom.
random.seed(42)

In [17]:
# Shuffle both lists in place using the global RNG.
# NOTE(review): the resulting order depends on the RNG state, so re-running this
# cell without re-running the seed cell first produces a different order.
random.shuffle(train_data)
random.shuffle(val_data)

In [18]:
# Carve the shuffled validation data into two parts: the first 1000 dialogs
# stay as the validation set, everything after them becomes the test set.
# Both slices are taken from the current val_data before either name is rebound.
test_data, val_data = val_data[1000:], val_data[:1000]

In [19]:
# Report the final number of dialogs in each split.
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 88111, Val: 1000, Test: 31514


In [20]:
# Peek at the first test conversation to sanity-check the split.
test_data[0]

['What is the creepiest thing that has ever happened in your house at night while you were alone? (X-post from /r/AskReddit) : I love "This happened to me one night in my house" ghost stories, and these are some of the creepiest:\n\n[URL]',
 'I had an apartment where I would here something whisper my name when i was alone.',
 '*mcdeeaagghlessaaaaaadwiiiichh.......happpyy birrrthdayy*',
 'OMG I HAD NO IDEA!!! TY.  Making obligitory rage comic now.',
 'I found the rage comic stating it was your reddit birthday just earlier.']

In [21]:
# Make sure the output directory exists and report whether it had to be created.
exp_path = out_dir
try:
    os.makedirs(exp_path)
except FileExistsError:
    print(f"Path {exp_path} exists.")
else:
    print(f"Path {exp_path} created.")

Path ./data/reddit_xtreme/ exists.


In [25]:
class SplitWriter:
    """Write text to a numbered sequence of files, rolling over at a size limit.

    Output files are named ``{base_path_prefix}_{NN}.json`` with ``NN`` starting
    at ``00``. The first file is opened on construction. Once the current file
    has grown past ``max_split_mb`` MiB, the next call to ``write`` switches to
    a new file first, so a file can exceed the limit by at most one write and
    no empty trailing file is created.

    Can be used as a context manager; the current file is closed on exit.
    """

    def __init__(self, base_path_prefix, max_split_mb=512):
        """
        @param base_path_prefix: We will add an incremental id and .json at the end.
        @param max_split_mb: Size limit per file in MiB (1024 * 1024 bytes).
        """
        self.current_file = 0
        self.base_path_prefix = base_path_prefix
        self.f = None
        self.max_split_mb = max_split_mb

        self._open_next()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _open_next(self):
        """Close the current split (if any) and open the next numbered one."""
        if self.f is not None:
            self.f.close()

        # Mode "w" is essential: open() defaults to read mode, which fails for a
        # file that does not exist yet and rejects writes for one that does.
        self.f = open(
            f"{self.base_path_prefix}_{self.current_file:02d}.json",
            "w",
            encoding="utf-8",
        )
        self.current_file += 1

    def write(self, l):
        """Write the string ``l`` to the current split, rolling over first if needed.

        @raise ValueError: if the writer has already been closed.
        """
        if self.f is None:
            raise ValueError("write to closed SplitWriter")

        # Roll over lazily, right before the next write, so that reaching the
        # limit on the very last write does not leave an empty file behind.
        if self.f.tell() > self.max_split_mb*1024*1024:
            print("Reached Max Size")
            self._open_next()

        self.f.write(l)

    def close(self):
        """Close the current split; calling this more than once is harmless."""
        if self.f is not None:
            self.f.close()
            self.f = None

In [24]:
# Write the training split as JSON Lines: one {"turns": [...]} object per line.
with open(os.path.join(exp_path, 'train_dialogues.json'), "w") as f:
    for turns in train_data:
        f.write(json.dumps({"turns": turns})+"\n")
        if f.tell() <= 512*1024*1024:
            continue
        # NOTE(review): once the file passes 512 MiB the loop stops and the
        # remaining training dialogs are not written anywhere.
        print("Reached Max Size")
        break

In [None]:
# Write the validation split as JSON Lines: one {"turns": [...]} object per line.
with open(os.path.join(exp_path, 'val_dialogues.json'), "w") as f:
    f.writelines(json.dumps({"turns": turns})+"\n" for turns in val_data)

In [None]:
# Write the test split as JSON Lines: one {"turns": [...]} object per line.
with open(os.path.join(exp_path, 'test_dialogues.json'), "w") as f:
    f.writelines(json.dumps({"turns": turns})+"\n" for turns in test_data)