# SARS-CoV-2 Variants Analysis: Omicron and Delta

## Data Preprocessing

In [1]:
def read_fasta(file_path) -> list:
    """Read every sequence from a FASTA file.

    Lines starting with ``>`` only delimit records; their text is
    discarded. All other lines of a record are stripped of surrounding
    whitespace and concatenated into a single string.

    Records without any sequence data are skipped, wherever they occur,
    so an empty file (or one containing only headers) yields ``[]``.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list of sequence strings, in the order they appear in the file.
    """
    sequences = []
    parts = []  # non-empty line fragments of the record being read
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # A header closes the previous record, if it had any data.
                if parts:
                    sequences.append(''.join(parts))
                    parts = []
            else:
                chunk = line.strip()
                if chunk:
                    parts.append(chunk)
    # Flush the last record under the same "non-empty only" rule used
    # between headers, so no spurious '' entry is ever returned.
    if parts:
        sequences.append(''.join(parts))
    return sequences

def clean_fasta(sequences) -> list:
    """Normalize sequences in place to upper-case A/C/G/T only.

    Each entry is upper-cased and every character other than ``A``,
    ``C``, ``G`` or ``T`` is removed. The input list itself is modified
    and also returned.

    Args:
        sequences: Mutable list of sequence strings.

    Returns:
        The same list object, with each entry cleaned.
    """
    nucleotides = frozenset('ACGT')
    for idx, raw in enumerate(sequences):
        sequences[idx] = ''.join(filter(nucleotides.__contains__, raw.upper()))
    return sequences

def write_fastas(sequences, variant, directory, header=''):
    """Write each sequence to its own FASTA file.

    Files are named ``<variant>_<n>.fasta`` (``n`` starts at 1) and are
    placed in ``<directory>/<variant>``, which is created when missing.
    Every file starts with the same ``>header`` line, followed by the
    sequence wrapped at 60 characters per line.

    Args:
        sequences: Iterable of sequence strings to write.
        variant: Variant name, used for both the sub-directory and the
            file-name prefix.
        directory: Root output directory; may be relative or absolute.
        header: Text placed after ``>`` on the first line of every file.
    """
    import os
    # os.path.join keeps an absolute `directory` absolute; a hard-coded
    # './' prefix would silently redirect it below the working directory.
    path_to_dir = os.path.join(directory, variant)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path_to_dir, exist_ok=True)
    for number, sequence in enumerate(sequences, start=1):
        file_name = os.path.join(path_to_dir, f'{variant}_{number}.fasta')
        with open(file_name, 'w') as f:
            f.write(f'>{header}\n')
            f.writelines(
                f'{sequence[start:start + 60]}\n'
                for start in range(0, len(sequence), 60)
            )
              
def read_sequence(file_path) -> str:
    """Return the first sequence that ``read_fasta`` yields for *file_path*."""
    records = read_fasta(file_path)
    return records[0]

In [2]:
# Output root and the two multi-FASTA input files (paths are relative to
# the current working directory).
directory = "./SARS_CoV_2_Variants"
omicron_file = "omicron.fasta"
delta_file = "delta.fasta"

# Load each variant's sequences and clean them with clean_fasta.
sequences_delta = clean_fasta(read_fasta(file_path=delta_file))
sequences_omicron = clean_fasta(read_fasta(file_path=omicron_file))

# Split every variant into one FASTA file per sequence under
# <directory>/<variant>/.
write_fastas(
    sequences=sequences_omicron,
    variant='Omicron',
    directory=directory,
)
write_fastas(
    sequences=sequences_delta,
    variant='Delta',
    directory=directory,
)