<a href="https://colab.research.google.com/github/b0risfosso/practice-with-ga/blob/main/practice_with_ga.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import string



In [2]:
import requests

def fetch_uniprot_sequences(query, format='fasta', max_results=100):
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        'query': query,
        'format': format,
        'size': max_results
    }
    headers = {
        'Accept': 'text/plain' if format == 'fasta' else 'application/json'
    }
    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print("Error fetching data from UniProt:", response.status_code, response.text)
        return None

# Example query: proteins with KDEL signal
query = "KDEL"
sequences = fetch_uniprot_sequences(query)
if sequences:
    lines = sequences.strip().split('\n')
    print(len(lines))
else:
    print("No sequences found.")


749


In [3]:
def build_groups(in_group_animal, out_group_animal):
    # Initialize dictionaries
    sequences_dict = {
        "in_group": [],
        "out_group": []
    }

    # Temporary variables to hold sequence data
    current_header = None
    current_sequence = []

    # Parse lines to populate dictionary
    for line in lines:
      if line.startswith('>'):
            # Save the previous sequence if it exists
            if current_header and current_sequence:
                sequence_str = ''.join(current_sequence)
                if in_group_animal in current_header:
                    sequences_dict["in_group"].append(sequence_str)
                elif out_group_animal in current_header:
                    sequences_dict["out_group"].append(sequence_str)

            # Reset for the new sequence
            current_header = line
            current_sequence = []
      else:
            current_sequence.append(line)

    # Don't forget to add the last sequence
    if current_header and current_sequence:
        sequence_str = ''.join(current_sequence)
        if in_group_animal in current_header:
            sequences_dict["in_group"].append(sequence_str)
        elif out_group_animal in current_header:
            sequences_dict["out_group"].append(sequence_str)

    # Separate the dictionary into two lists
    return sequences_dict["in_group"], sequences_dict["out_group"]

in_group, out_group = build_groups("HUMAN", "MOUSE")


In [4]:
max_seq = max(in_group, key=len)
print(len(max_seq))

825


In [32]:
# Define parameters
population_size = 800
mutation_rate = 0.5
max_generations = 100

In [48]:
# Function to calculate fitness
def fitness(schema, in_group, out_group):
    match_in = sum([schema in protein for protein in in_group])
    match_out = sum([schema in protein for protein in out_group])
    return match_in - 0.1 * match_out

# Generate initial population
def generate_population(size):
    population = []
    for _ in range(size):
        length = random.randint(4, 10)
        schema = ''.join(random.choices(string.ascii_uppercase, k=length))
        population.append(schema)
    return population

# Mutation function
def mutate(schema):
    if random.random() < mutation_rate:
        index = random.randint(0, len(schema) - 1)
        schema = schema[:index] + random.choice(string.ascii_uppercase) + schema[index + 1:]
    return schema

# Genetic Algorithm
def genetic_algorithm(in_group, out_group):
    population = generate_population(population_size)
    for generation in range(max_generations):
        population = sorted(population, key=lambda x: fitness(x, in_group, out_group), reverse=True)


        fitness_dict = {x: fitness(x, in_group, out_group) for x in population}
        # Sort the dictionary by value in descending order
        sorted_fitness_dict = dict(sorted(fitness_dict.items(), key=lambda item: item[1], reverse=True))
        top_5 = list(sorted_fitness_dict.items())[:5]
        print(top_5)


        if fitness(population[0], in_group, out_group) > 25:  # Example threshold
            print(f"Solution found in generation {generation}: {population[0]}")
            return population[0]
        next_population = population[:int(0.3 * population_size)]  # Retain top 10%
        while len(next_population) < population_size:
            parent = random.choice(population[:int(0.5 * population_size)])  # Select from top 50%
            child = mutate(parent)
            next_population.append(child)
        population = next_population
    print("No solution found.")
    return None

In [21]:
in_group, out_group = build_groups("HUMAN", "MOUSE")
print(len(in_group))
print(len(out_group))

11
10


In [37]:
in_group = []
# Temporary variables to hold sequence data
current_header = None
current_sequence = []

# Parse lines to populate dictionary
for line in lines:
  if line.startswith('>'):
        # Save the previous sequence if it exists
        if current_header and current_sequence:
            sequence_str = ''.join(current_sequence)
            in_group.append(sequence_str)

        # Reset for the new sequence
        current_header = line
        current_sequence = []
  else:
        current_sequence.append(line)

# Don't forget to add the last sequence
if current_header and current_sequence:
    sequence_str = ''.join(current_sequence)
    in_group.append(sequence_str)

In [22]:
genetic_algorithm(in_group, out_group)

7
8
7
8
10
8
9
8
3
3
7
3
9
10
5
7
5
5
3
5
4
5
7
9
9
3
9
9
5
4
7
5
7
3
3
8
9
3
10
6
3
9
8
3
9
6
9
9
9
4
7
5
9
7
7
3
4
5
9
7
4
10
9
10
6
5
4
5
3
4
10
7
3
9
7
8
7
6
5
3
5
4
5
4
9
10
10
8
5
5
3
5
3
10
4
3
5
9
8
3
4
10
7
7
9
9
7
9
4
4
6
6
3
6
3
8
4
5
10
4
9
7
3
10
10
10
6
10
3
4
3
9
5
10
3
4
10
5
9
8
10
9
3
10
6
6
5
8
10
8
8
6
4
9
9
9
9
7
4
3
6
6
10
5
4
4
8
4
7
4
8
3
4
6
3
7
9
7
8
4
7
4
4
10
9
4
9
9
8
4
7
5
4
3
3
6
9
7
4
6
10
8
9
9
3
10
4
4
6
7
8
3
5
3
7
5
7
4
3
4
3
3
5
10
3
8
5
3
9
6
8
9
7
6
7
5
4
10
9
9
9
3
5
4
10
6
8
9
5
8
3
8
10
9
7
10
3
8
7
4
10
4
8
3
10
7
4
10
9
4
4
9
8
9
7
7
9
8
7
5
7
8
10
5
7
8
8
8
10
7
5
7
10
9
5
4
9
3
6
4
5
5
4
6
10
9
8
7
5
9
5
9
9
7
10
4
3
7
7
7
7
9
6
4
7
10
8
4
9
9
9
3
3
3
5
9
9
9
9
8
5
5
4
4
3
4
10
10
6
3
3
8
6
10
6
3
4
10
8
10
5
3
8
7
6
7
9
8
9
6
3
5
7
6
6
3
5
9
6
7
6
6
9
4
5
10
10
3
3
9
8
6
5
7
7
9
4
8
8
3
7
4
8
8
5
4
7
7
7
7
10
5
9
9
5
7
5
9
8
4
9
3
8
9
4
4
7
8
9
9
3
8
6
9
7
5
3
5
3
9
9
9
5
9
3
5
8
10
8
4
5
6
9
3
5
7
3
3
10
6
3
3
5
3
6
5
5
3
8
7
6
8
5
5
9
6


'VLK'

In [38]:
import requests

def fetch_uniprot_sequences_by_length(min_length, max_length, n=10):
    base_url = "https://rest.uniprot.org/uniprotkb/stream"
    query = f"reviewed:true length:[{min_length} TO {max_length}]"
    params = {
        'format': 'fasta',
        'query': query,
        'size': n  # Number of sequences to fetch
    }
    headers = {
        'Accept': 'text/plain'
    }

    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print("Error fetching data from UniProt:", response.status_code)
        return None

# Fetch sequences with length between 100 and 200
min_length = 100
max_length = 800
random_sequences = fetch_uniprot_sequences_by_length(min_length, max_length, 700)
if sequences:
    random_sequences_list = random_sequences.split('\n>')
    for i, seq in enumerate(random_sequences_list):
        if i == 0:
            print(f">{seq}\n")  # Print the first sequence without '>'
        else:
            print(f">{seq}\n")  # Print the subsequent sequences with '>'
        if i >= 4:
            break  # Print only the first 5 sequences


>>sp|A0A009IHW8|ABTIR_ACIB9 2' cyclic ADP-D-ribose synthase AbTIR OS=Acinetobacter baumannii (strain 1295743) OX=1310613 GN=J512_3302 PE=1 SV=1
MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKI
DIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVS
ISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQK
IDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSP
NLADKVALNTSVNSIEEIAHQLADVILNR

>sp|A0A023I7E1|ENG1_RHIMI Glucan endo-1,3-beta-D-glucosidase 1 OS=Rhizomucor miehei OX=4839 GN=ENG1 PE=1 SV=1
MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIFPEIKHPFEPMYANT
ENGKIVPTNSWISNLFYPSADNLAPTTPDPYTLRLLDGYGGNPGLTIRQPSAKVLGSYPP
TNDVPYTDAGYMINSVVVDLRLTSSEWSDVVPDRQVTDWDHLSANLRLSTPQDSNSYIDF
PIVRGMAYITANYNNLTPQFLSQHAIISVEADEKKSDDNTSTFSGRKFKITMNDDPTSTF
IIYSLGDKPLELRKQDNSNLVASKPYTGVIRVAKLPAPEFETLLDASRAVWPTGGDISAR
SDDNNGASYTIKWKTNSNEAPLLTYAYAHHLTSIDDSNVKRTDMTLQSATKGPMTALVGN
EWTLRETELSPVEWLPLQAAPNPTTINEIMTEINKDIASNYTQETAKEDNYFSGKGLQKF
AMLALILNKSDQTQLRNPELAQIALDKLKAAFLPYLQNEQADPF

In [39]:
random_sequences[:100]

">sp|A0A009IHW8|ABTIR_ACIB9 2' cyclic ADP-D-ribose synthase AbTIR OS=Acinetobacter baumannii (strain "

In [44]:
random_out_group = []
# Temporary variables to hold sequence data
current_header = None
current_sequence = []

# Parse lines to populate dictionary
for line in lines:
  if line.startswith('>'):
        # Save the previous sequence if it exists
        if current_header and current_sequence:
            sequence_str = ''.join(current_sequence)
            random_out_group.append(sequence_str)

        # Reset for the new sequence
        current_header = line
        current_sequence = []
  else:
        current_sequence.append(line)

# Don't forget to add the last sequence
if current_header and current_sequence:
    sequence_str = ''.join(current_sequence)
    random_out_group.append(sequence_str)

In [50]:
genetic_algorithm(in_group, random_out_group)

[('VEEG', 2.7), ('DSTA', 1.8), ('VWIL', 0.9), ('QAWR', 0.9), ('LSNH', 0.9)]
[('VEEG', 2.7), ('DSTA', 1.8), ('VWIL', 0.9), ('QAWR', 0.9), ('LSNH', 0.9)]
[('QAWK', 5.4), ('VEEG', 2.7), ('DSTA', 1.8), ('VWIL', 0.9), ('QAWR', 0.9)]
[('PARY', 9.9), ('QAWK', 5.4), ('VEEG', 2.7), ('AEEG', 2.7), ('DSTA', 1.8)]
[('PARY', 9.9), ('NSTA', 8.1), ('QAWK', 5.4), ('VEEG', 2.7), ('AEEG', 2.7)]
[('PARY', 9.9), ('NSTA', 8.1), ('QAWK', 5.4), ('VEEG', 2.7), ('AEEG', 2.7)]
[('PARY', 9.9), ('NSTA', 8.1), ('QAWK', 5.4), ('LNNH', 3.6), ('VEEG', 2.7)]
[('VAIL', 21.6), ('LEIL', 18.0), ('PARY', 9.9), ('NSTA', 8.1), ('QAWK', 5.4)]
[('VAIL', 21.6), ('LEIL', 18.0), ('LSHL', 15.3), ('AVEG', 10.8), ('PARY', 9.9)]
[('VAIL', 21.6), ('LEIL', 18.0), ('LSHL', 15.3), ('AVEG', 10.8), ('PARY', 9.9)]
[('VAIL', 21.6), ('LEIL', 18.0), ('LSHL', 15.3), ('AVEG', 10.8), ('PARY', 9.9)]
[('VAIL', 21.6), ('GDWP', 20.7), ('LEIL', 18.0), ('LSDL', 18.0), ('LSHL', 15.3)]
[('VAIL', 21.6), ('GDWP', 20.7), ('LEIL', 18.0), ('LSDL', 18.0), ('LS