# Family 3 saccharide BGCs - Genome Discovery Using BLAST

## Overview:
- Import the 'family_3_saccharide_BGCs' FASTA file.
- Use the BLAST API to find whether the encapsulin proteins have a sequenced genome available.
- Output the information into a DataFrame for each encapsulin BLAST search.
    - 29 DataFrames total


## Using NCBI BLAST API search with the encapsulin proteins:

In [None]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO

import os

# API key, per the original author's note intended to raise the request limit
# to 10 requests/second.
# SECURITY NOTE(review): the key is committed here in plain text. It is kept
# only as a fallback so existing runs behave the same; prefer exporting
# NCBI_API_KEY in the environment and rotating the committed key.
api_key = os.environ.get("NCBI_API_KEY", "c9b038e154b263098b1022d633d445c76707")

# Path to FASTA file
family_3_fasta_file_path = r'C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\family_3_saccharide_BGCs_(fasta)\annotated_family_3.fasta'

# Read every record up front so the whole FASTA file is validated before any
# network request is made
protein_sequences = list(SeqIO.parse(family_3_fasta_file_path, "fasta"))

# Run one BLASTp search per sequence and save each raw XML result to the
# current working directory
for seq_record in protein_sequences:
    sequence = str(seq_record.seq)

    # NOTE(review): qblast's url_base normally names the bare Blast.cgi
    # endpoint; here a full query string (including the sequence and the key)
    # is passed instead -- confirm this is intended.
    blastp_url = f"https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&DATABASE=nr&BLAST_PROGRAMS=blastp&QUERY={sequence}&api_key={api_key}"

    # Reset on every iteration: at notebook top level the name would otherwise
    # still refer to the handle from a previous sequence when qblast fails.
    results_process = None
    try:
        # Perform BLAST request
        results_process = NCBIWWW.qblast(program="blastp", database="nr", sequence=sequence, url_base=blastp_url, alignments=50)

        # Save the BLAST results
        blastp_results_file_name = f"{seq_record.id}_family_3_sacc._BGC_blastp_result.xml"
        with open(blastp_results_file_name, "w") as save_to:
            save_to.write(results_process.read())

        print(f"BLAST search for {seq_record.id} completed. Results saved to {blastp_results_file_name}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next sequence
        print(f"Error processing sequence {seq_record.id}: {str(e)}")
    finally:
        # Only close a handle that this iteration actually opened
        if results_process is not None:
            results_process.close()


**Identifying and BLASTp-searching the remaining 7 MGYP accession numbers**

## Formatting the BLASTp outputs into DataFrames

In [17]:
from Bio.Blast import NCBIXML
import pandas as pd
import os

def parse_blast_xml(xml_file, top_hits=50):
    """Summarise the leading hits of a BLAST XML result file as a DataFrame.

    Args:
        xml_file: Path to a BLAST XML output file.
        top_hits: Maximum number of alignments kept per BLAST record.

    Returns:
        A pandas DataFrame with one row per retained alignment and the columns
        'Query', 'Subject', 'Identity' and 'E-value'.

    Raises:
        IndexError: If a retained alignment has no HSPs.
    """
    data = {'Query': [], 'Subject': [], 'Identity': [], 'E-value': []}

    # The context manager closes the file even if parsing fails part-way
    with open(xml_file) as result_handle:
        for record in NCBIXML.parse(result_handle):
            query_id = record.query_id
            for alignment in record.alignments[:top_hits]:
                # Only the first HSP of each alignment is reported.
                # NOTE(review): Biopython documents `identities` as a residue
                # count rather than a percentage -- confirm this is the
                # intended metric for the 'Identity' column.
                first_hsp = alignment.hsps[0]

                data['Query'].append(query_id)
                data['Subject'].append(alignment.title)
                data['Identity'].append(first_hsp.identities)
                data['E-value'].append(first_hsp.expect)

    return pd.DataFrame(data)

# Folder holding the raw BLASTp XML outputs
folder_path = r'C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files'

# Get all XML files in the folder
blast_file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.xml')]

# Convert each XML file into a CSV written next to it
for blast_file_path in blast_file_paths:
    # Keep at most the first 50 alignments of each BLAST record
    blast_df = parse_blast_xml(blast_file_path, top_hits=50)

    # splitext swaps only the trailing extension; str.replace would rewrite
    # every '.xml' occurrence anywhere in the path
    output_file = os.path.splitext(blast_file_path)[0] + '_results.csv'
    blast_df.to_csv(output_file, sep=',', index=False)


**Loading all formatted BLASTp result CSV files**

In [20]:
import os
import pandas as pd

# Directory holding the formatted BLASTp result tables
folder_path = r'C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\csv_files'

# Keep only the directory entries whose name ends in '.csv'
csv_files = [
    entry_name
    for entry_name in os.listdir(folder_path)
    if entry_name.endswith('.csv')
]

# Show the file names that were found
print(csv_files)


['family_3_MGYP001178754852_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP001216717877_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP001238560740_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP001437231829_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP001581572508_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP001595624303_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003109322860_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003110546203_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003110882604_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003111233400_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003131024615_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003131404975_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003131556693_family_3_sacc._BGC_blastp_result_results.csv', 'family_3_MGYP003134444350_family_3_s

In [22]:
import pandas as pd
import os

# Directory holding the formatted BLASTp result tables
folder_path = r'C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\csv_files'

# List of CSV files; the order fixes which file ends up in which dfN below
csv_files = [
    'family_3_MGYP001178754852_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP001216717877_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP001238560740_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP001437231829_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP001581572508_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP001595624303_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003109322860_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003110546203_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003110882604_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003111233400_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003131024615_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003131404975_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003131556693_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003134444350_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003144635947_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003341041167_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003626144734_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003626701920_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003636931262_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003638549746_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003662477660_family_3_sacc._BGC_blastp_result_results.csv',
    'family_3_MGYP003662771788_family_3_sacc._BGC_blastp_result_results.csv',
]

# Read every table once, in list order, so the frames can also be iterated
blast_result_frames = [
    pd.read_csv(os.path.join(folder_path, csv_name)) for csv_name in csv_files
]

# Keep the existing df1..df22 names for any later cells that use them.
# Unpacking raises ValueError if the number of listed files ever changes,
# instead of silently leaving some names unassigned or stale.
(
    df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11,
    df12, df13, df14, df15, df16, df17, df18, df19, df20, df21, df22,
) = blast_result_frames
