In [2]:
from Algorithms.Z_algorithm import get_matches as Z
from Algorithms.manacher import manacher as M

import os

In [36]:
# Clase principal
class Virus:
  """Nucleotide sequence loaded from a FASTA-style text file.

  The first line of the file is stored (stripped) as ``metadata``; every
  following line is stripped and concatenated into ``sequence``.

  Attributes set by ``__init__``:
    metadata: first line of the file, stripped of surrounding whitespace.
    sequence: concatenation of all remaining lines, each stripped.
    file: base name of ``file_path``.
    palindromo: value returned by the imported Manacher routine ``M`` for
      ``sequence``.
    amino_sequence: codon-by-codon translation of ``sequence`` starting at
      index 0 (see ``_translate``).
  """

  # Codon -> one-letter amino-acid code. Stop codons map to '*'.
  codon_to_amino_acid = {
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',  # Alanine
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',  # Arginine
    'AAT': 'N', 'AAC': 'N',  # Asparagine
    'GAT': 'D', 'GAC': 'D',  # Aspartic Acid
    'TGT': 'C', 'TGC': 'C',  # Cysteine
    'CAA': 'Q', 'CAG': 'Q',  # Glutamine
    'GAA': 'E', 'GAG': 'E',  # Glutamic Acid
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',  # Glycine
    'CAT': 'H', 'CAC': 'H',  # Histidine
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I',  # Isoleucine
    'TTA': 'L', 'TTG': 'L', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',  # Leucine
    'AAA': 'K', 'AAG': 'K',  # Lysine
    'ATG': 'M',  # Methionine (Start)
    'TTT': 'F', 'TTC': 'F',  # Phenylalanine
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',  # Proline
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'AGT': 'S', 'AGC': 'S',  # Serine
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',  # Threonine
    'TGG': 'W',  # Tryptophan
    'TAT': 'Y', 'TAC': 'Y',  # Tyrosine
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',  # Valine
    'TAA': '*', 'TAG': '*', 'TGA': '*'  # Stop codons
  }

  def __init__(self, file_path):
    """Read ``file_path`` and derive the sequence attributes.

    Args:
      file_path: path to a text file whose first line is the header and
        whose remaining lines hold the sequence.

    Raises:
      FileNotFoundError: if ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
      raise FileNotFoundError(f"El archivo {file_path} no existe")
    with open(file_path, 'r') as file:
      lines = file.readlines()
    self.metadata = lines[0].strip()
    self.sequence = ''.join(line.strip() for line in lines[1:])
    self.file = os.path.basename(file_path)
    self.palindromo = M(self.sequence)  # imported Manacher algorithm
    self.amino_sequence = self._translate(self.sequence)

  def _translate(self, sequence):
    """Translate ``sequence`` into one-letter amino-acid codes.

    The sequence is read in consecutive 3-character codons starting at
    index 0. Any chunk missing from ``codon_to_amino_acid`` becomes '?';
    this includes a trailing partial codon of 1 or 2 characters, which is
    NOT skipped.

    Args:
      sequence: nucleotide string to translate.

    Returns:
      The translated string, one character per (possibly partial) codon.
    """
    table = self.codon_to_amino_acid
    # join() avoids the quadratic cost of repeated string concatenation.
    return ''.join(
      table.get(sequence[start:start + 3], '?')
      for start in range(0, len(sequence), 3)
    )

  def getFirstChars(self, n=12):
    """Return the first ``n`` characters of ``sequence``.

    Args:
      n: number of leading characters to return (default 12).

    Raises:
      ValueError: if ``n`` is greater than the sequence length.
    """
    if n > len(self.sequence):
      # Build ONE message string; passing several arguments to ValueError
      # makes str(exc) render as a tuple instead of a sentence.
      raise ValueError(
        f"n no puede ser mayor a la longitud de la secuencia (length = {len(self.sequence)})"
      )
    return self.sequence[:n]

  def printData(self):
    """Print the file name, header, leading characters and palindrome."""
    print("File: ", self.file)
    print("Metadata: ", self.metadata)
    print("Primeros 12 chars: ", self.getFirstChars())
    print("Palindromo: ", self.palindromo)



In [37]:
class Gen(Virus):
  """Gene sequence that can be searched for inside virus sequences.

  Adds ``VirusIndexes``: a list of ``[virus_file, matches]`` pairs filled in
  by ``check_viruses``.
  """

  def __init__(self, file_path):
    """Load the gene from ``file_path`` and start with no recorded matches."""
    super().__init__(file_path)
    self.VirusIndexes = []

  def check_viruses(self, viruses):
    """Record, for each virus, the result of matching this gene against it.

    Each call appends to ``VirusIndexes``; earlier entries are kept.

    Args:
      viruses: iterable of objects exposing ``file`` and ``sequence``.
    """
    # Z is the imported Z-algorithm matcher; presumably it returns the
    # start positions of this gene inside the virus sequence (TODO confirm).
    self.VirusIndexes.extend(
      [candidate.file, Z(self.sequence, candidate.sequence)]
      for candidate in viruses
    )

  def printData(self):
    """Print the base-class summary followed by every recorded virus match."""
    super().printData()
    for record in self.VirusIndexes:
      source_file = record[0]
      positions = record[1]
      print("Virus: ", source_file)
      print("Indexes: ", positions)

In [51]:
class Proteina(Virus):
  """Protein sequence searched for inside translated virus and gene sequences.

  Unlike ``Virus``, instances are built from already-parsed header and
  sequence text, so ``Virus.__init__`` (which reads a file) is not called.
  """

  def __init__(self, metadata, sequence, file_path):
    """Store the parsed protein and start with no recorded matches.

    Args:
      metadata: header text of the protein entry.
      sequence: protein sequence text.
      file_path: path of the file the entry came from; only its base name
        is kept.
    """
    self.file = os.path.basename(file_path)
    self.metadata = metadata
    self.sequence = sequence
    self.palindromo = M(self.sequence)  # imported Manacher routine
    self.VirusIndexes = []
    self.GeneIndexes = []

  @classmethod
  def leer_de_mismo_archivo(cls, file_path):
    """Build one instance per '>'-separated entry found in ``file_path``.

    The file content is split on every '>' character. For each non-blank
    chunk, the first line becomes the metadata and the remaining lines are
    stripped and concatenated into the sequence.

    Args:
      file_path: path of the multi-entry file to read.

    Returns:
      List of instances of ``cls``, in file order.

    Raises:
      FileNotFoundError: if ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
      raise FileNotFoundError(f"El archivo {file_path} no existe")

    with open(file_path, 'r') as handle:
      chunks = handle.read().split('>')

    parsed = []
    for chunk in chunks:
      if not chunk.strip():
        continue  # nothing between two separators (e.g. text before the first '>')
      header, *body = chunk.splitlines()
      joined = ''.join(piece.strip() for piece in body)
      parsed.append(cls(header.strip(), joined, file_path))
    return parsed

  def check_viruses(self, viruses):
    """Record matches of this protein against each virus's ``amino_sequence``.

    Overrides the nucleotide-based search: the comparison uses the
    translated sequence instead of the raw one. Each call appends to
    ``VirusIndexes``.
    """
    self.VirusIndexes.extend(
      [candidate.file, Z(self.sequence, candidate.amino_sequence)]
      for candidate in viruses
    )

  def check_genes(self, genes):
    """Record matches of this protein against each gene's ``amino_sequence``.

    Each call appends to ``GeneIndexes``.
    """
    self.GeneIndexes.extend(
      [candidate.file, Z(self.sequence, candidate.amino_sequence)]
      for candidate in genes
    )

  def printData(self):
    """Print the header, leading characters, palindrome and all matches."""
    print("Metadata: ", self.metadata)
    print("Primeros 12 chars: ", self.getFirstChars())
    print("Palindromo: ", self.palindromo)
    for record in self.VirusIndexes:
      print("Virus: ", record[0])
      print("Indexes: ", record[1])
    for record in self.GeneIndexes:
      print("Gene: ", record[0])
      print("Indexes: ", record[1])


In [39]:
def leer_archivos_de_directorio(directory, tipo):
    """Load every ``.txt`` file directly inside ``directory`` as sequence objects.

    Args:
        directory: path of the directory to scan (not recursive).
        tipo: kind of object to build from each file:
            1 -> one ``Virus`` per file,
            2 -> one ``Gen`` per file,
            3 -> the ``Proteina`` objects returned by
                 ``Proteina.leer_de_mismo_archivo`` (a file may yield several).

    Returns:
        A flat list of the created objects, ordered by file name.

    Raises:
        ValueError: if ``tipo`` is not 1, 2 or 3. This is checked before the
            directory is scanned, so an invalid ``tipo`` is reported even when
            the directory holds no ``.txt`` file.
    """
    if tipo not in (1, 2, 3):
        raise ValueError("Tipo no valido")

    sequences = []
    # os.listdir returns names in arbitrary order; sort so that the object
    # order (and therefore the printed report) is reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory, filename)
        if tipo == 1:
            sequences.append(Virus(file_path))
        elif tipo == 2:
            sequences.append(Gen(file_path))
        else:
            # extend, not append: the reader returns a list of objects and
            # the result must stay a flat list.
            sequences.extend(Proteina.leer_de_mismo_archivo(file_path))
    return sequences

In [52]:
# Input directories, relative to the notebook's working directory.
path_viruses = 'viruses'
path_genes = 'genes'
path_proteinas = 'proteinas'

# Load order matters only for the report below: viruses, then genes, then proteins.
viruses = leer_archivos_de_directorio(path_viruses, 1)
genes = leer_archivos_de_directorio(path_genes, 2)
proteinas = leer_archivos_de_directorio(path_proteinas, 3)

rule = "-----------------------"

# --- Viruses: summary only ---
print("VIRUS DATA\n")
for virus in viruses:
    virus.printData()
    print()

# --- Genes: search each gene inside every virus, then report ---
print(rule, "\nGENE DATA\n", rule + "\n", sep="\n")
for gen in genes:
    gen.check_viruses(viruses)
    gen.printData()
    print()

# --- Proteins: search each protein in the translated viruses and genes ---
print(rule, "\nPROTEIN DATA\n", rule + "\n", sep="\n")
for proteina in proteinas:
    proteina.check_viruses(viruses)
    proteina.check_genes(genes)
    proteina.printData()
    print()
    

VIRUS DATA

File:  SARS-COV-2-MN908947.3.txt
Metadata:  >MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Primeros 12 chars:  ATTAAAGGTTTA
Palindromo:  AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

File:  SARS-COV-2-MT106054.1.txt
Metadata:  >MT106054.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/TX-CDC-03050000-001/2020, complete genome
Primeros 12 chars:  ATTAAAGGTTTA
Palindromo:  CTCAATGACTTCAGTAACTC

-----------------------

GENE DATA

-----------------------

File:  gen-M.txt
Metadata:  >MN908947.3:26523-27191 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Primeros 12 chars:  ATGGCAGATTCC
Palindromo:  CTAAAGAAATC
Virus:  SARS-COV-2-MN908947.3.txt
Indexes:  [26522]
Virus:  SARS-COV-2-MT106054.1.txt
Indexes:  [26522]

File:  gen-ORF1AB.txt
Metadata:  >MN908947.3:266-21555 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Primeros 12 chars:  ATGGA