<a href="https://colab.research.google.com/github/alex-tianhuang/idrfeatlib/blob/main/notebooks/DesignToFeatureVector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Run this cell to install the IDR design library
#
# Step 1: delete any `idrfeatlib` directory left over from a previous run so
# that the clone below starts from a clean location.
# NOTE(review): `file` usually exits 0 even when the path is missing, so the
# `&&` guard probably always lets `rm -rf` run (harmless) — confirm.
!file idrfeatlib/ >/dev/null && rm -rf idrfeatlib
# Step 2: clone the library sources next to the notebook (also provides the
# `idrfeatlib/notebooks/data/` example files referenced by later cells).
!git clone https://github.com/alex-tianhuang/idrfeatlib --quiet
# Step 3: install the cloned checkout into the notebook's Python environment.
%pip install idrfeatlib/

Processing ./idrfeatlib
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: idrfeatlib
  Building wheel for idrfeatlib (setup.py) ... done
  Created wheel for idrfeatlib: filename=idrfeatlib-0.0.0-py3-none-any.whl size=31311 sha256=dc263fd5f7adf86522d61bec64202a19b45e984eb79e29c3cc5499fde4981574
  Stored in directory: /tmp/pip-ephem-wheel-cache-7k47mdlp/wheels/d5/cf/10/5dfdc8ed2b6a9afbf9abd3de203724f1fd35094f1eb11f5312
Successfully built idrfeatlib
Installing collected packages: idrfeatlib
Successfully installed idrfeatlib-0.0.0


In [2]:
# Run this cell to define the design function

def main(args):
    """Designs a number of sequences whose features are optimized towards a target feature vector.

    Builds the metric, featurizer and designer, then writes one CSV row per
    design step to `args.output_file`.

    Args:
        args: Namespace-like object. Always read: `target_features`,
            `weights_feature_vector`, `greedy`, `query_sequence` and
            `output_file`. When `query_sequence` is None, `sequence_length`,
            `design_id` and `num_designs` are read as well.

    Raises:
        RuntimeError: Propagated from the loading/checking helpers when the
            inputs cannot be loaded or their feature names do not match the
            featurizer.
    """
    import csv
    metric = load_metric(args.target_features, args.weights_feature_vector)
    featurizer = compile_default_featurizer()
    check_features_match(featurizer, metric.origin)
    check_features_match(featurizer, metric.weights)
    designer = load_designer(featurizer, metric, args.greedy)

    feat_names = featurizer.keys()
    column_names = ["Iteration", "Sequence", "Time", "Distance"] + list(feat_names)
    if args.query_sequence is None:
        # Only the from-scratch loop tags its rows with a "DesignID"; the
        # query-sequence loop never emits that key. `DictWriter` raises on
        # unknown keys, so the column must exist exactly in this mode.
        column_names.insert(0, "DesignID")

    # `newline=""` is what the csv module requires of files it writes to;
    # without it, row terminators can be translated a second time.
    with open(args.output_file, "w", newline="") as file:
        writer = csv.DictWriter(file, column_names)
        writer.writeheader()
        if args.query_sequence is not None:
            design_loop_with_query_sequence(
                designer=designer,
                featurizer=featurizer,
                metric=metric,
                writer=writer,
                query=args.query_sequence
            )
        else:
            design_loop_with_generated_sequences(
                designer=designer,
                featurizer=featurizer,
                metric=metric,
                sequence_length=args.sequence_length,
                design_id_template=args.design_id,
                writer=writer,
                num_designs=args.num_designs
            )

# Exception types tolerated while designing: handed to `designer.design_loop`
# as `acceptable_errors`, and caught in `generate_initial_sequence` to retry
# when a random starting sequence cannot be featurized.
ACCEPTABLE_ERRORS=(ArithmeticError, ValueError, KeyError)

def load_metric(target_features_file: str, weights_file: str):
    """
    Build a `Metric` from the first feature vector found in each file.

    The target file is read first; the weights file is only opened once a
    target vector has been found. A `RuntimeError` is raised for whichever
    file yields no feature vector.
    """
    from idrfeatlib import FeatureVector
    from idrfeatlib.metric import Metric

    nothing = object()  # sentinel meaning "the loader yielded no entries"

    first_target = next(iter(FeatureVector.load(target_features_file)), nothing)
    if first_target is nothing:
        raise RuntimeError(f"could not find feature vector from target feature file: `{target_features_file}`")
    _, target_features = first_target

    first_weights = next(iter(FeatureVector.load(weights_file)), nothing)
    if first_weights is nothing:
        raise RuntimeError(f"could not find feature vector from weights file: `{weights_file}`")
    _, feature_weights = first_weights

    return Metric(target_features, feature_weights)

def compile_default_featurizer():
    """
    Compile and return the library's native featurizer.

    Raises `RuntimeError` if the compilation step reports any errors.
    """
    from idrfeatlib.native import compile_native_featurizer

    compiled, problems = compile_native_featurizer()
    if len(problems) == 0:
        return compiled
    raise RuntimeError("could not compile native featurizer")

def check_features_match(featurizer, feature_vector):
    """
    Verify the featurizer and the feature vector cover the same feature names.

    Compares `featurizer.keys()` against `feature_vector.as_dict.keys()` and
    raises `RuntimeError` when they differ; returns `None` otherwise.
    """
    expected_names = featurizer.keys()
    actual_names = feature_vector.as_dict.keys()
    if expected_names != actual_names:
        raise RuntimeError("featurizer and feature vector have different features")

def load_designer(featurizer, metric, greedy: bool):
    """
    Construct the designer for the requested optimization mode.

    Returns a `GreedyFeatureDesigner` when `greedy` is true, otherwise a
    `FeatureDesigner`. Either way the designer ends up with a fresh
    `random.Random()` generator.
    """
    from idrfeatlib.designer import FeatureDesigner, GreedyFeatureDesigner
    import random

    threshold = 1e-4

    if not greedy:
        # NOTE(review): this keyword is spelled `covergence_threshold`, while
        # the greedy branch uses `convergence_threshold`. Kept as-is because
        # the constructor signature is not visible here — confirm.
        return FeatureDesigner(
            featurizer,
            metric,
            covergence_threshold=threshold,
            good_moves_threshold=3,
            decent_moves_threshold=5,
            rng=random.Random(),
        )

    greedy_designer = GreedyFeatureDesigner(featurizer, metric, convergence_threshold=threshold)
    # The greedy designer gets its generator assigned after construction.
    greedy_designer.rng = random.Random()
    return greedy_designer

def design_loop_with_query_sequence(*, designer, featurizer, metric, writer, query):
    """
    Design a single sequence from query that matches a target feature vector.

    The target and weighting are already supposed to be set up in the
    `designer.metric` object.

    Each item yielded by `designer.design_loop` is written through
    `writer.writerow` with an added "Distance" entry computed from the
    features named by `featurizer.keys()`.

    Args:
        designer: Object providing `design_loop(query, acceptable_errors=...)`.
        featurizer: Mapping-like object whose keys are the feature names.
        metric: Object providing `euclidean_norm_of(feature_vector)`.
        writer: Object providing `writerow(row_dict)`.
        query: Starting sequence handed to the designer.
    """
    from math import sqrt
    from idrfeatlib import FeatureVector
    feat_names = featurizer.keys()
    for progress in designer.design_loop(query, acceptable_errors=ACCEPTABLE_ERRORS):
        # Pick out just the feature values from the progress record.
        fvec = FeatureVector({feat_name: progress[feat_name] for feat_name in feat_names})
        # NOTE(review): the result is treated as a *squared* distance even
        # though the method is named `euclidean_norm_of` — confirm.
        sqr_distance = metric.euclidean_norm_of(fvec)
        writer.writerow({
            **progress,
            "Distance": sqrt(sqr_distance)
        })

def design_loop_with_generated_sequences(*, designer, featurizer, metric, sequence_length, design_id_template, writer, num_designs):
    """
    Design a number of sequences that match a target feature vector.

    The target and weighting are already supposed to be set up in the
    `designer.metric` object.

    For each of `num_designs` rounds a random starting sequence is generated
    and optimized; every row written for that round is tagged with a
    "DesignID" built from `design_id_template.format(counter=<round index>)`.

    Args:
        designer: Designer passed through to the per-sequence design loop;
            its `rng` is used to generate the starting sequences.
        featurizer: Mapping-like featurizer; wrapped in `Featurizer` to
            validate candidate starting sequences.
        metric: Metric passed through to the per-sequence design loop.
        sequence_length: Length of each generated starting sequence.
        design_id_template: Format string accepting a `counter` field.
        writer: Object providing `writerow(row_dict)`.
        num_designs: Number of sequences to design.
    """
    from types import SimpleNamespace
    from idrfeatlib.featurizer import Featurizer
    for counter in range(num_designs):
        design_id = design_id_template.format(counter=counter)
        query = generate_initial_sequence(designer=designer, sequence_length=sequence_length, featurizer=Featurizer(featurizer))

        # `design_id` is bound as a default so the callback keeps this round's
        # ID regardless of when it is invoked.
        def write_tagged_row(row, design_id=design_id):
            return writer.writerow({**row, "DesignID": design_id})

        # A bare `object()` cannot take new attributes (assigning `.writerow`
        # raises AttributeError), so use a namespace as the writer stand-in.
        patched_writer = SimpleNamespace(writerow=write_tagged_row)
        design_loop_with_query_sequence(
            designer=designer,
            featurizer=featurizer,
            metric=metric,
            writer=patched_writer,
            query=query
        )

def generate_initial_sequence(*, designer, sequence_length, featurizer):
    """
    Draw a random amino-acid sequence that the featurizer accepts.

    Letters are picked with `designer.rng.choice`. A candidate is returned as
    soon as `featurizer.featurize` succeeds on it; candidates that raise one
    of `ACCEPTABLE_ERRORS` are discarded. After 15 failed candidates a
    `RuntimeError` is raised.
    """
    alphabet = list("ACDEFGHIKLMNPQRSTVWY")
    attempts_left = 15

    while attempts_left > 0:
        attempts_left -= 1
        letters = [designer.rng.choice(alphabet) for _ in range(sequence_length)]
        candidate = "".join(letters)
        try:
            # Empty `acceptable_errors`: any featurization failure must surface
            # here so the candidate can be rejected.
            featurizer.featurize(candidate, acceptable_errors=())
        except ACCEPTABLE_ERRORS:
            continue
        else:
            return candidate

    raise RuntimeError("cannot generate query with all features")

def display_csv(output_name: str):
    """
    Render the CSV at `output_name` as a table in the notebook.

    Relies on pandas being importable (expected to be preinstalled on colab).
    """
    from IPython.display import display
    import pandas as pd

    table = pd.read_csv(output_name)

    # Banner: blank line, title, underline, blank line.
    print("", "Showing output below", "--------------------", "", sep="\n")
    display(table)
    print()

def run_colab_wrapper(output_name: str):
    """Design a sequence to match an arbitrary feature vector given in a csv.

    Interactively collects every setting `main` needs (target features,
    feature weights, from-scratch vs. query-sequence mode, optional design ID
    template and greedy flag), runs the design task, displays the resulting
    table and triggers a download of the output file.

    Args:
        output_name: Path of the CSV file the designs are written to.

    Raises:
        RuntimeError: If the output file already exists and the user declines
            to overwrite it.
        ValueError: If the sequence length or number of designs entered is
            not an integer.
    """
    import argparse
    import os
    from google.colab import files

    args = argparse.Namespace()

    # --- Target feature values -------------------------------------------
    args.target_features = 'target_features.csv'
    goto_upload = True
    if os.path.exists(args.target_features):
        choice = input(f"The file {args.target_features} already exists. Would you like to overwrite it? (y/n)")
        if choice.lower() != 'y':
            print("Using existing CSV of target feature values.")
            goto_upload = False
    if goto_upload:
        print("You may choose to upload a CSV of target feature values,")
        print("or use an example file located at `idrfeatlib/notebooks/data/example_target_features.csv`.")
        choice = input("Would you like to upload a file? (y/n): ")
        if choice.lower() == 'y':
            print("Please upload a CSV of target feature values.")
            files.upload_file(args.target_features)
        else:
            args.target_features = 'idrfeatlib/notebooks/data/example_target_features.csv'
            print(f"Using example target feature file at `{args.target_features}`.")

    # --- Feature weights --------------------------------------------------
    print("How would you like to weight the features?")
    print("1. Use inv-std from all human IDRs.")
    print("2. Use inv-std from Disprot.")
    print("3. Provide a custom CSV of feature weights (provide file).")
    choice = input("Enter choice (1, 2, or 3): ")
    if choice not in ['1', '2', '3']:
        print("Invalid choice. Defaulting to inv-std from all human IDRs.")
        choice = '1'
    if choice == '1':
        args.weights_feature_vector = 'idrfeatlib/notebooks/data/inv_std_human.csv'
    elif choice == '2':
        args.weights_feature_vector = 'idrfeatlib/notebooks/data/inv_std_disprot.csv'
    elif choice == '3':
        args.weights_feature_vector = 'feature_weights.csv'
        print("Please upload a CSV of feature weights.")
        files.upload_file(args.weights_feature_vector)

    # --- Design mode ------------------------------------------------------
    # Normalize before validating so that "S"/"Q" are honoured rather than
    # being reported as invalid and silently replaced by from-scratch mode.
    choice = input("Would you like to generate sequences from scratch or use a query sequence? (s/q):").strip().lower()
    if choice not in ['s', 'q']:
        print("Invalid choice. Defaulting to generating sequences from scratch.")
        choice = 's'
    if choice == 's':
        args.query_sequence = None
        args.sequence_length = int(input("How long should the designed sequences be?"))
        args.num_designs = int(input("How many sequences do you want to generate?"))
    elif choice == 'q':
        # Drop surrounding whitespace that tends to sneak in when pasting.
        args.query_sequence = input("Enter a query sequence:").strip()

    # --- Optional parameters ----------------------------------------------
    choice = input("Would you like to adjust other parameters? Press `n` for defaults (y/n)")
    if choice.lower() == 'y':
        if args.query_sequence is None:
            args.design_id = input("Enter design ID format string (default: '{counter}'): ") or "{counter}"
            try:
                # Dry run of the template as it will be used by the design loop.
                args.design_id.format(counter=0)
            except (KeyError, IndexError, ValueError, AttributeError, TypeError):
                # `str.format` signals bad templates in several ways: unknown
                # field name, positional field, malformed braces/specifier,
                # or attribute/index access on the integer counter.
                print("Invalid design ID format string. Defaulting to '{counter}'.")
                args.design_id = "{counter}"
        args.greedy = input("Use greedy optimization? (y/n): ").lower() == 'y'
    else:
        if args.query_sequence is None:
            args.design_id = "{counter}"
        args.greedy = False


    # --- Output file --------------------------------------------------------
    args.output_file = output_name
    if os.path.exists(args.output_file):
        overwrite = input(f"Output file '{args.output_file}' already exists. Overwrite? (y/n)").lower()
        if overwrite != 'y':
            raise RuntimeError(f"Output file '{args.output_file}' already exists. Please rename or delete it.")
        print(f"Output file '{args.output_file}' overwritten.")

    # --- Run ------------------------------------------------------------------
    print("Starting design task...")
    main(args)
    print("Design task finished.")
    display_csv(args.output_file)
    print(f"Downloading output file to {args.output_file}")
    files.download(args.output_file)

In [None]:
# This cell will:
#
# 1. Ask for the input parameters of the run:
#    - (required) a CSV file of target feature values
#                 (uploaded, or the example file shipped with the library)
#    - (required) the feature weights: one of the two bundled inv-std files,
#                 or an uploaded CSV of custom weights
#    - (required) either a query sequence to start from, or, when generating
#                 from scratch, the length of each generated sequence and
#                 the number of sequences to generate
#    - (optional) a design ID format string (from-scratch mode only) and
#                 whether to use greedy optimization
# 2. Design sequences and output them iteratively to `output_features.csv`
# 3. Display the resulting table and download the output file,
#    called `output_features.csv`
#
# Run the cells above first; this cell can then be re-run as many times as
# you would like.

run_colab_wrapper("output_features.csv")
print("Done!")