In [18]:
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

In [16]:
model_checkpoint = "facebook/galactica-125m"

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def process_str(str):
    str['text'] = str['text'].replace('\\','')
    return str 

dataset = load_dataset("text", data_files='./523129_start.jsonl', split='train', streaming=True)
dataset = dataset.map(process_str)

def tokenize_function(examples):
    return tokenizer(examples["text"])


tokenized_datasets = dataset.map(tokenize_function, batched=True)

model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-pubchem",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    max_steps=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # eval_dataset=lm_datasets["validation"],
)

trainer.train()

In [69]:
import pyarrow as pa
from pyarrow import json
    
raw_schema = pa.schema([
    pa.field('SomeDecimal', pa.string())
])

In [82]:
schema = {'synonyms': [{'name': 'p-Phenylazo carbanilic acid, n-hexyl ester'}],
 'related': [{'SMILES': 'CCCCOC(=O)NC1=CC=C(C=C1)N', 'similarity': 0.74}],
 'experimental': [],
 'CID': 523129,
 'SMILES': 'CCCCCCOC(=O)NC1=CC=C(C=C1)N=NC2=CC=CC=C2',
 'SAS': 2.09,
 'WEIGHT': 325.18,
 'TPSA': 63.05,
 'CLOGP': 6.23,
 'QED': 0.46,
 'NUMHDONORS': 1,
 'NUMHACCEPTORS': 4,
 'NUMHETEROATOMS': 5,
 'NUMROTATABLEBONDS': 8,
 'NOCOUNT': 5,
 'NHOHCOUNT': 1,
 'RINGCOUNT': 2,
 'HEAVYATOMCOUNT': 24,
 'FRACTIONCSP3': 0.32,
 'NUMAROMATICRINGS': 2,
 'NUMSATURATEDRINGS': 0,
 'NUMAROMATICHETEROCYCLES': 0,
 'NUMAROMATICCARBOCYCLES': 2,
 'NUMSATURATEDHETEROCYCLES': 0,
 'NUMSATURATEDCARBOCYCLES': 0,
 'NUMALIPHATICRINGS': 0,
 'NUMALIPHATICHETEROCYCLES': 0,
 'NUMALIPHATICCARBOCYCLES': 0,
 'IUPAC': 'hexyl N-(4-phenyldiazenylphenyl)carbamate'}

In [89]:
s = pa.array(schema)

TypeError: 'str' object cannot be interpreted as an integer

In [95]:
l=[]
for k,v in schema.items():
    f = k
    if isinstance(v,list):
        s = pa.list_(pa.dictionary(pa.int16(),pa.string()))
    elif isinstance(v, str):
        s = pa.string()
    else:
        s = pa.float16()

    l.append(pa.field(f,s))

In [84]:
raw_schema = pa.schema(l)

In [85]:
raw_schema

synonyms: list<item: dictionary<values=string, indices=int16, ordered=0>>
  child 0, item: dictionary<values=string, indices=int16, ordered=0>
related: list<item: dictionary<values=string, indices=int16, ordered=0>>
  child 0, item: dictionary<values=string, indices=int16, ordered=0>
experimental: list<item: dictionary<values=string, indices=int16, ordered=0>>
  child 0, item: dictionary<values=string, indices=int16, ordered=0>
CID: halffloat
SMILES: string
SAS: halffloat
WEIGHT: halffloat
TPSA: halffloat
CLOGP: halffloat
QED: halffloat
NUMHDONORS: halffloat
NUMHACCEPTORS: halffloat
NUMHETEROATOMS: halffloat
NUMROTATABLEBONDS: halffloat
NOCOUNT: halffloat
NHOHCOUNT: halffloat
RINGCOUNT: halffloat
HEAVYATOMCOUNT: halffloat
FRACTIONCSP3: halffloat
NUMAROMATICRINGS: halffloat
NUMSATURATEDRINGS: halffloat
NUMAROMATICHETEROCYCLES: halffloat
NUMAROMATICCARBOCYCLES: halffloat
NUMSATURATEDHETEROCYCLES: halffloat
NUMSATURATEDCARBOCYCLES: halffloat
NUMALIPHATICRINGS: halffloat
NUMALIPHATICHETERO

In [81]:
json.read_json('../dataloader/523129_start.jsonl', parse_options=json.ParseOptions(explicit_schema=raw_schema))

ArrowNotImplementedError: JSON conversion to halffloat is not supported