In [3]:
from Bio.Blast import NCBIXML
import pandas as pd

def parse_blast_xml(xml_file, top_hits=50):
    """Summarise the leading hits of a BLAST XML report as a DataFrame.

    For every query record yielded by ``NCBIXML.parse``, the first
    ``top_hits`` alignments are kept (in the order the parser returns them)
    and one row is produced per alignment. Only the first HSP of each
    alignment is consulted.

    Args:
        xml_file: Path to a BLAST XML output file.
        top_hits: Maximum number of alignments to keep per query record.

    Returns:
        pandas.DataFrame with one row per retained alignment and the columns
        ``Query`` (``record.query_id``), ``Subject`` (``alignment.title``),
        ``Identity`` (``hsps[0].identities``) and ``E-value``
        (``hsps[0].expect``). The columns are present even when no
        alignments are found.

    Raises:
        IndexError: If an alignment carries no HSPs.

    NOTE(review): ``Identity`` is whatever Biopython stores in
    ``HSP.identities`` -- per its documentation a count of identical
    positions, not a percentage. Confirm before interpreting the column.
    """
    data = {'Query': [], 'Subject': [], 'Identity': [], 'E-value': []}

    # The context manager guarantees the handle is closed even if parsing
    # fails part-way through. The records are iterated inside the block so
    # the file is still open while they are being read.
    with open(xml_file) as result_handle:
        for record in NCBIXML.parse(result_handle):
            query_id = record.query_id
            for alignment in record.alignments[:top_hits]:
                # Only the first HSP of each alignment is reported.
                first_hsp = alignment.hsps[0]

                data['Query'].append(query_id)
                data['Subject'].append(alignment.title)
                data['Identity'].append(first_hsp.identities)
                data['E-value'].append(first_hsp.expect)

    return pd.DataFrame(data)

# BLAST XML reports to convert; each one yields a CSV next to the input file.
blast_file_paths = [
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP001216717877_family_3_sacc._BGC_blastp_result.xml",
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP001581572508_family_3_sacc._BGC_blastp_result.xml",
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP003110882604_family_3_sacc._BGC_blastp_result.xml",
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP003111233400_family_3_sacc._BGC_blastp_result.xml",
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP003131024615_family_3_sacc._BGC_blastp_result.xml",
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP003134444350_family_3_sacc._BGC_blastp_result.xml",
    r"C:\Users\Cameron\OneDrive - University College London\PhD\Year 1\ENCAPSULIN BIOINFORMATICS AND METAGENOMICS\encapsulin_bioinformatics_repo\BLASTp_outputs\xml_files\family_3_MGYP003636931262_family_3_sacc._BGC_blastp_result.xml",
]

# Process each report in turn.
for blast_file_path in blast_file_paths:
    # Keep at most the first 50 alignments per query.
    blast_df = parse_blast_xml(blast_file_path, top_hits=50)

    # Derive the CSV name by swapping only a trailing '.xml' suffix.
    # str.replace would rewrite every '.xml' in the path and, when there is
    # no match (e.g. an upper-case '.XML'), return the input path unchanged
    # so that the CSV would overwrite the BLAST report itself.
    if blast_file_path.lower().endswith('.xml'):
        output_file = blast_file_path[:-len('.xml')] + '_results.csv'
    else:
        output_file = blast_file_path + '_results.csv'

    # Save the table as a comma-separated file without the index column.
    blast_df.to_csv(output_file, sep=',', index=False)