In [None]:
import utilities
import fasta
import alignment
import alignment_curation
import glob

## Load the files in

In [None]:
# Collect every FASTA file in the input directory. glob returns matches in a
# filesystem-dependent order, so sort them to keep the load order the same
# on every machine and every run.
fasta_files = sorted(glob.glob("Files/*.fasta"))

# Read in and collate all of the FASTA files found above
records = utilities.load_sequences(*fasta_files)

In [None]:
# Report how many records were loaded, before any filtering is applied
loaded_count = len(records)
print("The total length of the records before cleaning them up is %d " % loaded_count)


## Remove unwanted sequences from the loaded records

In [None]:
# Filtering criteria applied to the loaded records:
#   - a length threshold of 400 amino acids (intent: keep only longer sequences)
#   - no X characters
#   - presence of the Cytochrome P450 motif
# The motif can be replaced with any pattern we wish to ensure sequences have / don't have.
# NOTE(review): confirm that mode='exclude' with length=400 drops the shorter
# sequences rather than the longer ones.
cytochrome_p450_motif = "F..G[HRK]..C.G"
length_threshold = 400
unwanted_character = "X"

records = fasta.subset_records(records=records, length=length_threshold, mode='exclude')
records = fasta.exclude_character(records, unwanted_character)
records = fasta.subset_on_motif(records, cytochrome_p450_motif)

# Presumably converts the filtered dictionary back into sequence records -- verify
records = fasta.map_dict_to_records(records)


In [None]:
# Report how many records remain once the filters have been applied
cleaned_count = len(records)
print("The total length of CYP2U1 hits after cleaning them up is %d " % cleaned_count)

In [None]:
# Save the cleaned sequences to disk as a FASTA file
cleaned_fasta_path = "Files/Output/output.fasta"
fasta.write_fasta(records, cleaned_fasta_path)

## Create a new alignment

In [None]:
# Align the cleaned sequences with MAFFT and save the result.
# localpair=True selects the local pairwise (L-INS-i) method; the number of
# iterations to perform can also be specified.
cleaned_fasta_path = "Files/Output/output.fasta"
alignment_path = "Files/Output/output.aln"

aln = alignment.align_with_mafft(cleaned_fasta_path, localpair=True)
utilities.write_alignment(aln, alignment_path)

## Reduce the alignment on the basis of internal deletions

In [None]:
# Curation settings, in the order they are passed to automated_curation:
#   alignment_path   - location of the alignment on disk
#   accepted_percent - accepted percent of other columns that have a gap at the same position
#   min_gap_length   - minimum length of a gap that meets the accepted percent condition
# The curated output location is given via outpath.
# NOTE(review): confirm whether accepted_percent = 1 means 1% or a fraction (100%).
alignment_path = "Files/Output/output.aln"
accepted_percent = 1
min_gap_length = 20

alignment_curation.automated_curation(
    alignment_path,
    accepted_percent,
    min_gap_length,
    delete_all_candidates=False,
    outpath="Files/Output/curated",
)