In [None]:
# Annotated walkthrough of the molecular_generation.py file.
import json
from argparse import ArgumentParser
from transformers import RobertaTokenizer, RobertaForCausalLM
from pathlib import Path


def main(args):
    """Generate sequences with a pretrained RoBERTa causal LM and save them as JSON.

    Args:
        args: Parsed command-line namespace. ``args.model`` is the checkpoint
            handed to ``from_pretrained``; every other attribute except
            ``model`` and ``output_file`` is forwarded to ``model.generate``
            as a keyword argument.

    The decoded predictions, the checkpoint path and the generation settings
    are written to ``predictions/<args.model>.json``.
    """
    ckpt = Path(args.model)
    # Load the tokenizer and the causal language model from the same checkpoint.
    tokenizer = RobertaTokenizer.from_pretrained(str(ckpt))
    model = RobertaForCausalLM.from_pretrained(str(ckpt))

    # Encode an empty prompt as PyTorch tensors ('pt'); presumably this yields
    # only the tokenizer's special tokens, so generation starts from scratch.
    input_ids = tokenizer.encode('', return_tensors='pt')

    # Everything on the namespace that is not a script-level option is treated
    # as a generation setting.
    generation_params = {
        k: v for k, v in vars(args).items() if k not in ('model', 'output_file')
    }
    output = model.generate(input_ids, **generation_params)

    # Turn the generated token ids back into text, dropping special tokens.
    output_decoded = [tokenizer.decode(item, skip_special_tokens=True) for item in output]

    # Create the destination directory first: args.model may contain path
    # separators, and a missing directory would otherwise raise only after the
    # (expensive) generation step has already run.
    output_path = Path(f'predictions/{args.model}.json')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump({'predictions': output_decoded, 'model': str(ckpt), **generation_params}, f)
    

if __name__ == "__main__":
    # Command-line interface. Every option except --model is forwarded to
    # model.generate() by main().
    parser = ArgumentParser()
    for flag, options in (
        # Checkpoint to load; also used to name the output JSON file.
        ('--model', dict(type=str, required=True)),
        # Number of independently computed returned sequences for each
        # element in the batch.
        ('--num_return_sequences', dict(type=int, required=True)),
        # Use sampling when given; greedy decoding otherwise.
        ('--do_sample', dict(action='store_true')),
        # Number of highest-probability vocabulary tokens kept for
        # top-k filtering.
        ('--top_k', dict(type=int, default=0)),
        # Maximum length the generated tokens can have.
        ('--max_length', dict(type=int, default=None)),
        # If set to a float < 1, only the most probable tokens whose
        # probabilities add up to top_p or higher are kept for generation.
        ('--top_p', dict(type=float)),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)


    #-----------------------------------------------------------------------------------------------#



In [None]:
import moses
import json
import pandas as pd
import logging
from pathlib import Path
from argparse import ArgumentParser


def read_json(filename):
    """Load and return the JSON document stored at *filename*.

    Args:
        filename: Path (``str`` or path-like) of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, 'r') as f:
        return json.load(f)


def dump_json(data, filename):
    """Serialize *data* as JSON into the file at *filename*.

    Args:
        data: JSON-serializable value to write.
        filename: Path (``str`` or path-like) of the file to create or
            overwrite.
    """
    # Use a context manager so the data is flushed and the handle closed
    # before returning, instead of relying on garbage collection.
    with open(filename, 'w') as f:
        json.dump(data, f)


def evaluate_model_generations(folder, train, targeted_interactions):
    """Score generated molecules with MOSES metrics, overall and per target.

    Args:
        folder: Directory containing ``ChemBERTaLM.json``. Its ``predictions``
            entry is read as a mapping from target id to a list of generated
            SMILES strings.
        train: List of training SMILES; it is concatenated with the MOSES
            train split and passed to the metrics as ``train``.
        targeted_interactions: Mapping from target id to the SMILES used as
            the ``test`` set for that target.

    The scores are written to ``ChemBERTaLM_scores.json`` inside *folder*.
    Targets whose metric computation fails are logged and listed under the
    ``error`` key instead of aborting the whole evaluation.
    """
    output_folder = Path(folder)
    outputs = read_json(output_folder / 'ChemBERTaLM.json')
    # MOSES train split as a plain list so it can be concatenated with `train`.
    moses_train = moses.get_dataset('train').tolist()
    reference_train = train + moses_train

    # This directory is created here but not written to by this function;
    # the scores themselves are saved next to the predictions (see below).
    scores_dir = Path('results') / output_folder.parent.name / output_folder.name
    scores_dir.mkdir(parents=True, exist_ok=True)

    # Flatten the per-target lists in a single pass (sum(lists, []) is quadratic).
    all_smiles = [s for group in outputs['predictions'].values() for s in group]

    results = {}
    results['metrics_moses'] = moses.get_all_metrics(all_smiles, train=reference_train)
    results['targets'] = {}
    # Must exist before the loop: the failure handler appends to it.
    results['error'] = []

    for uniprot_id, smiles in outputs['predictions'].items():
        logging.info('Targeting %s', uniprot_id)
        try:
            results['targets'][uniprot_id] = {
                'bdb': moses.get_all_metrics(
                    smiles, k=1, train=reference_train, test=targeted_interactions[uniprot_id]
                ),
                'moses': moses.get_all_metrics(smiles, k=1),
            }
        except Exception:
            # Best-effort per target: record the failure and keep going, but
            # let KeyboardInterrupt/SystemExit propagate and keep the traceback.
            logging.exception('Failed to compute metrics for %s', uniprot_id)
            results['error'].append(uniprot_id)

    dump_json(results, output_folder / 'ChemBERTaLM_scores.json')


# Command-line entry point for the evaluation: parse the options, load the
# train/test interaction tables and score the generations of the chosen model.
parser = ArgumentParser()
parser.add_argument('--model', required=True, type=str)
parser.add_argument('--train', default='data/splits/train_interactions.csv', type=str)
parser.add_argument('--test', default='data/splits/test_interactions.csv', type=str)
args = parser.parse_args()

# Training SMILES as a plain Python list.
train = pd.read_csv(args.train)['canonical_SMILES'].values.tolist()

# Group the test SMILES by target id: {UniProt_S_ID: [canonical_SMILES, ...]}.
test_interactions = pd.read_csv(args.test, index_col=None)
targeted_interactions = (
    test_interactions
    .groupby('UniProt_S_ID')['canonical_SMILES']
    .apply(list)
    .to_dict()
)

evaluate_model_generations(f'predictions/{args.model}', train, targeted_interactions)

In [None]:
# The molecular_training.py file was already sufficiently commented, so reading it was enough.