In [None]:
import glob
import os
import sys

# Extend the import search path before importing the local modules below.
sys.path.append('./src/curation')

import utilities
import fasta
import alignment
import alignment_curation

## Load the files in

In [None]:
# Read in and collate all of the FASTA files in a given directory.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the order passed to load_sequences reproducible.
fasta_files = sorted(glob.glob("Files/*.fasta"))

# Fail early with a clear message instead of passing no paths downstream.
if not fasta_files:
    raise FileNotFoundError("No FASTA files found matching 'Files/*.fasta'")

records = utilities.load_sequences(*fasta_files)

In [None]:
# Report how many records were loaded, before the clean-up steps run.
print("The total length of the records before cleaning them up is %d " % (len(records),))


## Remove unwanted sequences from the records

In [None]:
# Pattern for the Cytochrome P450 motif, handed to fasta.subset_on_motif below
# (presumably matched as a regular expression -- confirm in that helper).
cytochrome_p450_motif = "F..G[HRK]..C.G"

# Stage 1: length filter with a threshold of 400 in 'exclude' mode.
# Per the original note the intent is to keep only records > 400 amino acids;
# the exact comparison lives in fasta.subset_records -- TODO confirm.
length_filtered = fasta.subset_records(mode='exclude', length=400, records=records)

# Stage 2: ensure no "X" characters remain (presumably drops any record whose
# sequence contains one -- confirm in fasta.exclude_character).
x_free = fasta.exclude_character(length_filtered, "X")

# Stage 3: exclude sequences which don't have the motif.
motif_hits = fasta.subset_on_motif(x_free, cytochrome_p450_motif)

# The motif subset is converted back with map_dict_to_records (the name
# suggests subset_on_motif returns a dict -- confirm) and becomes `records`.
records = fasta.map_dict_to_records(motif_hits)


In [None]:
# Report how many records remain after the clean-up steps.
# NOTE(review): the message names CYP2U1 specifically -- confirm that this
# matches the input data being processed.
print("The total length of CYP2U1 hits after cleaning them up is %d " % (len(records),))

In [None]:
# Write the cleaned sequence file to disk.
# Create the output directory first: nothing visible in this notebook creates
# it, so the write could otherwise fail on a fresh checkout. With
# exist_ok=True this is a no-op when the directory is already present.
os.makedirs("Files/Output", exist_ok=True)

fasta.write_fasta(records, "Files/Output/output.fasta")

## Create a new alignment

In [None]:
# Align the cleaned sequences with MAFFT.
# localpair=True asks the wrapper for MAFFT's local-pair mode (the L-INS-i
# method, per the original note). The wrapper may also accept an iteration
# count -- confirm against alignment.align_with_mafft.
aln = alignment.align_with_mafft(
    "Files/Output/output.fasta",
    localpair=True,
)

# Write the alignment out to disk.
utilities.write_alignment(
    aln,
    "Files/Output/output.aln",
)

## Reduce the alignment on the basis of internal deletions

In [None]:
# Run the automated curation on the alignment saved to disk.
# Positional arguments, as described by the original annotations (their exact
# semantics are defined in alignment_curation.automated_curation -- confirm):
#   1. alignment location on disk
#   2. accepted percent of other columns that have a gap at the same position
#   3. minimum length of the gap that meets the accepted percent condition
alignment_curation.automated_curation(
    "Files/Output/output.aln",
    1,
    20,
    delete_all_candidates=False,
    outpath="Files/Output/curated",  # output location
)