# Quimeras de humano com HIV
* **VERSÃO:** 1.0
* **DESCRIÇÃO:** Gera quimeras de DNA humano e HIV
* **DATA DA CRIAÇÃO:** 2022-01-10
* **ESCRITO POR:** João Paulo C. Ferreira
* **E-MAIL:** jpferreira.jota@gmail.com
* **Linguagem:** Python 3.8.8
* **PROJETO:** LINK DO GIT

### Parâmetros e arquivos de input

In [344]:
# Run parameters, read as module-level globals by the cells below.
input_genome_human   = 'genome_human.fa'   # Human FASTA file
input_genome_virus   = 'genome_hiv.fa'   # Viral FASTA file
path_output_files    = './output'   # Output directory of the script
# Random mutation rate, expressed as a percentage: apply_mutation divides this
# value by 100, so 10 means 10% (not 0.1).
random_mutation_rate = 10
# True applies mutations to the human and viral fragments separately;
# False applies them to the assembled chimera.
mutate_separately    = False
# Intended meaning: True makes chimeras start with human DNA, False with viral DNA.
# NOTE(review): no code visible in this section reads this flag - confirm it is honored.
start_human_genome   = True
# Percentage range of human DNA per read, e.g. [40,60] = between 40 and 60 percent human
range_human_percent  = [40,60]
size_reads           = 150 # Length of the final reads, in base pairs

#### Imports

In [345]:
from Bio import SeqIO
import random
import os

#### Main

In [346]:
# Abort early when either input FASTA is missing. exit() is a helper injected
# by the site module and ends with status 0; raising SystemExit with a message
# prints it to stderr and ends with a non-zero status instead.
if not (os.path.isfile(input_genome_human) and os.path.isfile(input_genome_virus)):
    raise SystemExit('Arquivos de input não encontrados')

# The writer helpers open files inside this directory in append mode, and
# open() does not create missing directories.
os.makedirs(path_output_files, exist_ok=True)

In [347]:
def read_fasta(file):
    """Return the sequence of the first record in a FASTA file.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        The ``seq`` attribute of the first record yielded by ``SeqIO.parse``.

    Raises:
        ValueError: If the file contains no FASTA record.
    """
    # The context manager closes the handle even if parsing fails, and next()
    # stops after the first record instead of parsing every record in the file.
    with open(file) as handle:
        first_record = next(SeqIO.parse(handle, 'fasta'), None)
    if first_record is None:
        raise ValueError(f'No FASTA record found in {file!r}')
    return first_record.seq


def get_lenght(file):
    """Return the length of the first sequence stored in FASTA file ``file``."""
    return len(read_fasta(file))


def draw_start(lenght):
    """Return a random integer in ``[0, lenght)``.

    Raises:
        ValueError: If ``lenght`` is not positive (raised by ``random.randrange``).
    """
    return random.randrange(0, lenght)

def get_values(human,viral):
    """Draw one random start offset for each of the two sequences.

    Args:
        human: Length passed to ``draw_start`` for the human offset.
        viral: Length passed to ``draw_start`` for the viral offset.

    Returns:
        Tuple ``(human_start, viral_start)``.
    """
    # Tuple elements are evaluated left to right, so the human offset is
    # still drawn before the viral one.
    return draw_start(human), draw_start(viral)

def calc_values(human_start,viral_start,range_percent = range_human_percent):
    """Split one read between human and viral bases and compute the slice ends.

    Args:
        human_start: Start offset of the human fragment.
        viral_start: Start offset of the viral fragment.
        range_percent: Two-item sequence ``[low, high]`` giving the allowed
            percentage of human bases in the read. Both bounds are inclusive.

    Returns:
        Tuple ``(proportion_human, proportion_viral, human_end, viral_end)``.
        The two proportions are base counts that add up to ``size_reads``;
        each end is the matching start plus its base count.

    Raises:
        ValueError: If ``low`` is greater than ``high``.
    """
    low_percent, high_percent = range_percent[0], range_percent[1]
    # randint includes the upper bound, so the documented "between low and
    # high" range is honored and a fixed split such as [50, 50] no longer
    # fails the way randrange(50, 50) did.
    human_percent = random.randint(low_percent, high_percent)
    proportion_human = int(size_reads * (human_percent / 100))
    proportion_viral = size_reads - proportion_human
    human_end = human_start + proportion_human
    viral_end = viral_start + proportion_viral
    return proportion_human, proportion_viral, human_end, viral_end

def format_string(read,human_start,human_end,viral_start,viral_end,proportion_human):
    """Build the identifier used as FASTA header and mutation-log key.

    The result is the underscore-joined sequence of these fields:
    ``RN`` (read number, zero-padded to 7), ``HS`` and ``VS`` (human and viral
    spans inside the read), ``HR`` and ``VR`` (start:end in the human and viral
    references, zero-padded to 9 and 5) and ``M`` (``random_mutation_rate``,
    zero-padded to 3).
    """
    def pad(value, width):
        # str() + zfill also works for values that are not integers.
        return str(value).zfill(width)

    fields = (
        f'RN-{pad(read, 7)}',
        f'HS-0:{proportion_human}',
        f'VS-{proportion_human + 1}:{size_reads}',
        f'HR-{pad(human_start, 9)}:{pad(human_end, 9)}',
        f'VR-{pad(viral_start, 5)}:{pad(viral_end, 5)}',
        f'M-{pad(random_mutation_rate, 3)}',
    )
    return '_'.join(fields)

def apply_mutation(name,read,range_posix=None):
    """Apply random point substitutions to ``read`` and log each one.

    The number of substitutions is ``int(len(read) * random_mutation_rate / 100)``,
    capped at the number of positions available in ``range_posix``. Every
    substitution is printed and appended to the mutation log through
    ``write_mutations``.

    Args:
        name: Identifier written with every logged mutation.
        read: Sequence to mutate; it must support indexing, slicing and
            concatenation with a one-character string.
        range_posix: ``[start, stop)`` positions of ``read`` that may be
            mutated, relative to ``read`` itself. ``None`` means the whole read.

    Returns:
        The mutated sequence.

    Raises:
        ValueError: If ``range_posix`` does not lie inside ``read``.
    """
    # None replaces the former mutable default [0,0], an empty range from
    # which no position could ever be drawn.
    if range_posix is None:
        range_posix = (0, len(read))
    first, stop = range_posix[0], range_posix[1]
    if first < 0 or stop > len(read):
        raise ValueError(
            f'range_posix {list(range_posix)} is outside the read (length {len(read)})'
        )

    # The former `count <= total` loop applied one substitution too many.
    total = int(len(read) * (random_mutation_rate / 100))
    # Distinct positions: drawing with replacement could hit the same base
    # twice and even revert it, so fewer bases than requested would differ.
    positions = random.sample(range(first, stop), min(total, max(stop - first, 0)))

    for index in positions:
        # Capture the reference base before the read is modified; reading it
        # afterwards logged the substitute as its own reference.
        ref = read[index]
        # Case-insensitive comparison so a lower-case base is never replaced
        # by its own upper-case form.
        candidates = [base for base in nucleotides if base != str(ref).upper()]
        alt = random.choice(candidates)
        print(f'{name}\t{index}\t{ref}/{alt}')
        read = read[:index] + alt + read[index+1:]
        write_mutations(name,ref,alt,index)
    return read

def write_mutations(name,ref,alt,posix):
    """Append one mutation record to ``mutations.csv`` in the output directory.

    The record is a single tab-separated line: ``name``, ``posix`` and
    ``ref/alt``.

    Args:
        name: Identifier of the read the mutation belongs to.
        ref: Base before the substitution.
        alt: Base after the substitution.
        posix: Position of the substitution.
    """
    record = f'{name}\t{posix}\t{ref}/{alt}\n'
    # `with` closes the handle even when the write raises.
    with open(f'{path_output_files}/mutations.csv','a') as handle:
        handle.write(record)
    
def write_fasta(name,seq):
    """Append one FASTA record to ``quimeras_human_hiv.fa`` in the output directory.

    Args:
        name: Text placed after ``>`` on the header line.
        seq: Sequence written on the line that follows the header.
    """
    record = f'>{name}\n{seq}\n'
    # `with` closes the handle even when the write raises.
    with open(f'{path_output_files}/quimeras_human_hiv.fa','a') as handle:
        handle.write(record)

In [348]:
# Alphabet from which substitute bases are drawn.
nucleotides = list('ATCG')
# Length of the first sequence of each input FASTA.
lenght_human_genome = len(read_fasta(input_genome_human))
lenght_viral_genome = len(read_fasta(input_genome_virus))

In [349]:
# Load each genome once. Calling read_fasta inside the loop re-opened and
# re-parsed both FASTA files on every iteration.
human_genome = read_fasta(input_genome_human)
viral_genome = read_fasta(input_genome_virus)

# NOTE(review): start_human_genome is not consulted here; every chimera is
# assembled human-first.
for count in range(1, 1001):
    human_start,viral_start = get_values(lenght_human_genome,lenght_viral_genome)
    proportion_human,proportion_viral,human_end,viral_end = calc_values(human_start,viral_start)

    # Skip draws whose slice would run past the end of either genome. The read
    # number is still consumed, so fewer than 1000 reads may be written.
    if human_end > lenght_human_genome or viral_end > lenght_viral_genome:
        continue

    human_bases = human_genome[human_start:human_end]
    viral_bases = viral_genome[viral_start:viral_end]
    text = format_string(count,human_start,human_end,viral_start,viral_end,proportion_human)

    if mutate_separately:
        # Positions must index the fragment being mutated. Passing genome
        # coordinates made apply_mutation index far beyond the short fragment.
        # Logged positions are therefore relative to each fragment.
        mutated_human = apply_mutation(text,human_bases,[0,len(human_bases)])
        mutated_viral = apply_mutation(text,viral_bases,[0,len(viral_bases)])
        read = f'{mutated_human}{mutated_viral}'
    else:
        read = apply_mutation(text,f'{human_bases}{viral_bases}',[0,size_reads])

    write_fasta(text,read)
    print(f'{text}\n{read}')


RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	85	T/A
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	146	G/T
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	119	T/G
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	148	G/A
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	53	T/G
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	99	C/A
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	72	G/C
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	113	T/A
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	121	A/G
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	94	G/C
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	109	G/T
RN-0000001_HS-0:72_VS-73:150_HR-045747269:045747341_VR-05799:05877_M-010	78	A/G
RN-0000001_HS-0:72_VS-73:150_HR-04