# Step 2: Bacterial Metabolite Analysis via KEGG API

**Objective:** To identify and compare the metabolic potential of several *Streptococcus* strains by querying the KEGG (Kyoto Encyclopedia of Genes and Genomes) database. This notebook will:
1. Install required libraries.
2. Use the KEGG REST API to download metabolite lists for selected pathogenic and commensal strains.
3. Process and compare these lists.
4. Visualize the core and accessory metabolome using an UpSet plot.

In [None]:
# 1. Install necessary libraries
!pip install requests pandas upsetplot matplotlib --quiet

---

In [None]:
# 2. Define KEGG API functions and organism codes
import requests
import pandas as pd
from io import StringIO
import time
from collections import defaultdict

# Display name -> KEGG organism code for every strain included in the comparison.
# NOTE(review): KEGG appears to assign the code 'ssa' to Streptococcus sanguinis
# SK36, not S. salivarius K12 -- confirm against the KEGG organism list
# (/list/organism) before interpreting the "commensal control" column.
KEGG_ORGANISMS = {
    'S. pneumoniae TIGR4': 'spn',
    'S. pneumoniae R6': 'spr',
    'S. pneumoniae D39': 'spd',
    'S. salivarius K12': 'ssa' # Commensal control
}

# Root of the KEGG REST API. HTTPS avoids sending requests in clear text.
BASE_URL = "https://rest.kegg.jp"

def get_pathways_for_organism(org_code, timeout=30):
    """Return the KEGG pathway IDs listed for an organism.

    Requests ``{BASE_URL}/list/pathway/{org_code}`` and keeps the first
    tab-separated field of every non-blank line of the response.

    Args:
        org_code: KEGG organism code (for example ``'spn'``).
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: Pathway IDs in response order. An empty list is returned
        when the response contains no entries or when the request fails (the
        error is printed rather than raised).
    """
    url = f"{BASE_URL}/list/pathway/{org_code}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status() # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching pathways for {org_code}: {e}")
        return []

    # Ignore blank lines: an empty body would otherwise yield a bogus '' ID
    # that callers would then try to look up.
    return [
        line.split('\t')[0]
        for line in response.text.splitlines()
        if line.strip()
    ]

def get_compounds_for_pathway(pathway_id, timeout=30):
    """Return the KEGG compound IDs linked to a pathway.

    Requests ``{BASE_URL}/link/cpd/{pathway_id}`` and keeps the second
    tab-separated field of every well-formed line. Lines without a second
    field (including a blank or whitespace-only body) are skipped instead of
    raising ``IndexError``.

    If the lookup yields nothing and ``pathway_id`` has the shape
    ``<letters><5 digits>`` with a prefix other than ``map``, the lookup is
    retried once with the ``map<5 digits>`` ID.
    NOTE(review): this retry assumes KEGG stores pathway-compound links
    against reference maps rather than organism-specific pathways -- confirm
    against the KEGG REST API documentation.

    Args:
        pathway_id: KEGG pathway ID, with or without a ``path:`` prefix.
        timeout: Seconds to wait for the server on each request.

    Returns:
        list[str]: Compound IDs in response order. An empty list is returned
        when nothing is linked or when a request fails (the error is printed
        rather than raised).
    """
    def _linked_compounds(target_id):
        # One /link/cpd lookup; returns the second column of each usable line.
        response = requests.get(f"{BASE_URL}/link/cpd/{target_id}", timeout=timeout)
        response.raise_for_status()
        rows = (line.split('\t') for line in response.text.splitlines())
        return [fields[1] for fields in rows if len(fields) > 1 and fields[1]]

    try:
        compounds = _linked_compounds(pathway_id)
        if not compounds:
            bare_id = pathway_id.split(':', 1)[-1]  # drop an optional 'path:' prefix
            prefix, number = bare_id[:-5], bare_id[-5:]
            if prefix.isalpha() and prefix != 'map' and number.isdigit():
                compounds = _linked_compounds(f"map{number}")
        return compounds
    except requests.exceptions.RequestException as e:
        print(f"Error fetching compounds for {pathway_id}: {e}")
        return []

# Emit a confirmation so running this cell produces visible output.
print("KEGG API functions defined.")

In [None]:
# 3. Fetch all metabolite data from KEGG
print("Fetching data from KEGG for all organisms. This will take several minutes...")

# Display name -> set of compound IDs gathered over all of that organism's pathways.
organism_metabolites = defaultdict(set)
# Organisms for which nothing was retrieved. The helper functions print request
# errors and return [], so an empty set is the only failure signal visible here.
organisms_without_data = []

for name, code in KEGG_ORGANISMS.items():
    print(f"\nProcessing {name} ({code})...")
    pathways = get_pathways_for_organism(code)
    print(f"Found {len(pathways)} pathways.")

    # Indexing the defaultdict creates the entry even when no pathway was found,
    # so every organism is present in the mapping for the later cells.
    metabolites = organism_metabolites[name]
    for fetched, pathway in enumerate(pathways, start=1):
        # Be respectful to the API server
        time.sleep(0.1)
        metabolites.update(get_compounds_for_pathway(pathway))
        # Report progress after the fetch so the count reflects completed requests.
        if fetched % 20 == 0:
            print(f"  - Fetched compounds for {fetched}/{len(pathways)} pathways...")

    print(f"Finished {name}. Total unique metabolites found: {len(metabolites)}")
    if not metabolites:
        organisms_without_data.append(name)

# Only claim success when every organism actually produced data.
if organisms_without_data:
    print("\nWARNING: no metabolites were retrieved for: " + ", ".join(organisms_without_data))
    print("Check the error messages above before using the results.")
else:
    print("\nAll data fetched successfully!")

---

In [None]:
# 4. Compare metabolomes with an Upset Plot
from upsetplot import from_contents, UpSet
import matplotlib.pyplot as plt

print("Generating Upset plot for comparative analysis...")

# from_contents converts the {organism: set of compounds} mapping into the
# input structure that UpSet expects.
upset_data = from_contents(organism_metabolites)

# Hold an explicit handle to the figure and draw onto it, rather than relying
# on pyplot's implicit "current figure".
fig = plt.figure(figsize=(15, 8))
upset_plot = UpSet(
    upset_data,
    subset_size='count',
    show_counts=True,
    sort_by='degree',
)
upset_plot.plot(fig=fig)
fig.suptitle("Comparison of Metabolite Sets Across Streptococcus Strains")
plt.show()

---

In [None]:
# 5. Save the results
import os

output_file = 'results/bacterial_analysis/kegg_metabolite_comparison.csv'
# Derive the directory from the output path so the two cannot drift apart.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Create a presence/absence dataframe: one row per compound, one column per
# organism, 1 where the compound was found for that organism and 0 otherwise.
# set().union(...) also works when the mapping is empty, whereas
# set.union(*[]) raises TypeError.
all_compounds = sorted(set().union(*organism_metabolites.values()))
presence_absence_df = pd.DataFrame(0, index=all_compounds, columns=list(KEGG_ORGANISMS))

for org, compounds in organism_metabolites.items():
    if compounds:  # nothing to mark for an organism with no metabolites
        presence_absence_df.loc[list(compounds), org] = 1

presence_absence_df.to_csv(output_file)

print(f"Comparison table saved to {output_file}")
print("\nBacterial metabolite analysis complete.")