In [71]:
import pickle
import json
import jsonlines
from collections import defaultdict
import random
from tqdm import tqdm
import numpy as np

# Statistics

In [2]:
# Per-fold record stores: fold index -> list of parsed JSONL records.
train_data = defaultdict(list)
test_data = defaultdict(list)

for fold in range(5):
    # The train split is read first, then the test split, for the current fold.
    fold_sources = (
        (f'train_{fold}.jsonl', train_data),
        (f'test_{fold}.jsonl', test_data),
    )
    for path, store in fold_sources:
        with jsonlines.open(path) as reader:
            for record in reader.iter():
                store[fold].append(record)

In [10]:
# Every value observed for each condition field in fold 0 of the training data
# (duplicates are kept, so the lists reflect how often each value occurs).
conditions = defaultdict(list)

for record in train_data[0]:
    # The content of the third message is a JSON string with the condition fields.
    answer = json.loads(record['messages'][2]['content'])

    for field, observed in answer.items():
        if field != 'precursors':
            # Lists are stored as sorted tuples: hashable and independent of element order.
            normalized = tuple(sorted(observed)) if isinstance(observed, list) else observed
            conditions[field].append(normalized)

In [11]:
# Distinct values per condition field. Iteration order of a set of strings (or
# tuples of strings) changes between interpreter sessions because of hash
# randomization, so the values are sorted to keep these lists -- and everything
# derived from them -- identical after a kernel restart. `key=repr` allows
# values of different types within one field to be ordered together.
condition_set = {key: sorted(set(value), key=repr) for key, value in conditions.items()}

In [12]:
from collections import Counter

# Count how often each solvent occurs across the distinct solvent entries.
sl_counter = Counter(
    solvent
    for combination in condition_set['solvent']
    for solvent in combination
)

In [13]:
# Solvent names, most frequent first.
print([solvent for solvent, _ in sl_counter.most_common()])

['water', 'DMF', 'MeOH', 'EtOH', 'acetonitrile', 'DMSO', 'DMA', 'CH2Cl2', 'CHCl3', 'THF', 'toluene', 'Et3N', 'acetone', 'DEF', 'benzene', 'pyridine', 'isopropanol', 'NMP', 'ethylene glycol', 'diethyl ether', 'AcOH', 'dioxane', 'DMAC', 'hexane', 'formic acid', 'cyclohexane', 'BuOH', 'TEA', 'Ethanol', 'nitrobenzene', 'DMI', 'ethyl acetate', 'DMAc', 'DME', 'isobutanol', 'TBA', 'TPA', 'NMA', 'DMAE', 'dichlorobenzene', 'DEA', 'formamide']


In [15]:
def make_random_prediction(true, condition=None):
    """Build a baseline prediction by sampling every condition field at random.

    Args:
        true: One dataset record; ``true['messages'][2]['content']`` must be a
            JSON string encoding the ground-truth ``field -> value`` mapping.
        condition: Mapping from field name to a sequence of candidate values.
            Defaults to the notebook-level ``condition_set``, looked up at call
            time so that re-running the cell that builds it takes effect.

    Returns:
        ``(true, prediction)``: the decoded ground-truth dict and a dict with
        the same keys, where ``'precursors'`` is copied unchanged and every
        other field holds ``random.choice(condition[field])``.

    Raises:
        KeyError: if a ground-truth field is missing from a plain-dict ``condition``.
        IndexError: if the candidate sequence for a field is empty.
    """
    if condition is None:
        condition = condition_set

    # Decode the record that was actually passed in. The previous version read
    # the notebook-global ``data`` here, so it only worked when the caller's
    # loop variable happened to be named ``data``.
    record = true
    true = json.loads(record['messages'][2]['content'])

    prediction = {}
    for key, value in true.items():
        if key == 'precursors':
            prediction[key] = value
        else:
            prediction[key] = random.choice(condition[key])

    return true, prediction

In [16]:
# Frequency-weighted baseline: `conditions` keeps duplicate values, so values
# that occur more often in training are drawn more often.
true_label = []
prediction_label = []

for data in tqdm(test_data[0]):
    try:
        truth, guess = make_random_prediction(data, conditions)
    except Exception as err:
        # Report the failure and skip this record instead of aborting the run.
        print (err)
        continue

    true_label.append(truth)
    prediction_label.append(guess)
    

100%|██████████| 2094/2094 [00:00<00:00, 156023.46it/s]


In [17]:
# Persist the ground truth and the frequency-weighted random predictions.
for path, payload in (
    ('true_label_random_feq_save.pickle', true_label),
    ('prediction_label_random_feq_save.pickle', prediction_label),
):
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)

# Full Random

In [214]:
def make_random_solvent(washing=False, solvents=None, max_solvents=5):
    """Draw a random combination of distinct solvents.

    Args:
        washing: When true, an empty draw is allowed and is returned as
            ``False`` instead of an empty list.
        solvents: Candidate solvent names. Defaults to the keys of the
            notebook-level ``sl_counter``.
        max_solvents: Largest number of solvents to draw (default 5). The
            effective limit is capped at the number of candidates.

    Returns:
        A non-empty list of distinct solvents, or ``False`` when ``washing`` is
        true and zero solvents were drawn.

    Raises:
        ValueError: if ``washing`` is false and not even one solvent can be
            drawn (no candidates, or ``max_solvents`` below 1).
    """
    pool = list(sl_counter.keys()) if solvents is None else list(solvents)

    fewest = 0 if washing else 1
    # random.sample raises ValueError when asked for more items than the pool
    # holds, so the upper bound never exceeds the pool size.
    most = max(0, min(max_solvents, len(pool)))
    if most < fewest:
        raise ValueError('cannot draw a solvent: no candidates available or max_solvents < 1')

    n_solvent = random.randint(fewest, most)
    sol = random.sample(pool, n_solvent)
    if washing and not sol:
        return False
    return sol


In [114]:
# Numeric temperatures parsed from every truthy training value that contains '°C'.
temp_list = []
for raw_temp in conditions['temperature']:
    if raw_temp and '°C' in raw_temp:
        temp_list.append(float(raw_temp.replace('°C', '').strip()))

# Observed range, used as the bounds for random temperature sampling.
temp_max = np.max(temp_list)
temp_min = np.min(temp_list)

def make_random_temp():
    """Return a temperature string drawn uniformly between ``temp_min`` and ``temp_max``.

    The value is formatted with one decimal place and followed by `` °C``.
    """
    return f'{random.uniform(temp_min, temp_max):.1f} °C'
    

In [204]:
# Distinct pressure values seen in training. They are sorted because the
# iteration order of a set of strings changes between interpreter sessions
# (hash randomization); a fixed order keeps seeded sampling reproducible.
list_pressure = sorted(set(conditions['pressure']), key=repr)

def make_random_pressure():
    """Pick one entry of ``list_pressure`` uniformly at random."""
    chosen = random.choice(list_pressure)
    return chosen

In [285]:
# Display the candidate pressure values.
list_pressure

['0 atm', '1 atm', 'autogenous']

In [255]:
# NOTE(review): the result of this expression is not stored, and it is not the
# last expression of the cell, so it is not displayed either -- presumably
# leftover exploration of the observed time values.
list(set(conditions['time']))

def make_random_time():
    """Return a random duration string of the form ``'<value> <unit>'``.

    A unit is chosen uniformly from min / h / days / week; the value is then
    drawn uniformly between 0 and that unit's upper bound (60, 24, 7, 3) and
    rendered with Python's default float formatting.
    """
    upper_bounds = {'min': 60, 'h': 24, 'days': 7, 'week': 3}

    unit = random.choice(['min', 'h', 'days', 'week'])
    value = random.uniform(0, upper_bounds[unit])
    return f'{value} {unit}'


In [256]:
def make_random_syn_method():
    """Pick one of the four hard-coded synthesis method names uniformly at random."""
    methods = [
        'chemical synthesis',
        'solvothermal synthesis',
        'sonochemical synthesis',
        'hydrothermal synthesis',
    ]
    return random.choice(methods)

In [282]:
def make_random_dataset(data):
    """Pair a record's ground truth with a fully random prediction.

    Args:
        data: One dataset record; ``data['messages'][2]['content']`` is decoded
            as JSON to obtain the ground-truth dict.

    Returns:
        ``(true, prediction)`` where ``prediction`` copies ``'precursors'`` from
        the ground truth and fills every other field from the random generators
        or a random boolean.
    """
    true = json.loads(data['messages'][2]['content'])

    def coin_flip():
        # Random boolean for the yes/no processing steps.
        return random.choice([True, False])

    # Fields are filled in a fixed order; this fixes both the key order of the
    # result and the order in which random numbers are consumed.
    prediction = {'precursors': true['precursors']}
    prediction['synthesis_method'] = make_random_syn_method()
    prediction['solvent'] = make_random_solvent()
    prediction['temperature'] = make_random_temp()
    prediction['time'] = make_random_time()
    prediction['pressure'] = make_random_pressure()
    prediction['cooling'] = coin_flip()
    prediction['pH_adjustment'] = coin_flip()
    prediction['washing'] = make_random_solvent(washing=True)
    prediction['filtration'] = coin_flip()
    prediction['drying'] = coin_flip()

    return true, prediction

In [283]:
# Fully random baseline over fold 0 of the test data.
true_label = []
prediction_label = []

for sample in tqdm(test_data[0]):
    try:
        truth, guess = make_random_dataset(sample)
    except Exception as err:
        # Report the failure and skip this record instead of aborting the run.
        print (err)
        continue

    true_label.append(truth)
    prediction_label.append(guess)
    

100%|██████████| 2094/2094 [00:00<00:00, 95507.53it/s]


In [284]:
# Persist the ground truth and the fully random predictions.
for path, payload in (
    ('true_label_random_save.pickle', true_label),
    ('prediction_label_random_save.pickle', prediction_label),
):
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)