# KEGG Classification Web Scraping

Given the genes in the cleaned Cho dataset, we want to scrape their classifications from the KEGG database (https://www.genome.jp/entry/).

# Import Dependencies

In [5]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re

# Obtain Cho Genes

In [6]:
# URL prefix of a KEGG gene entry page; a gene ID is appended to it to build each request URL.
BASE_URL = "https://www.genome.jp/entry/sce:"
# Load the cleaned Cho dataset (tab-separated file, relative path).
cho_data = pd.read_csv("../../dataset/raw/cho_cleaned.txt", sep="\t")
# Gene identifiers from the "Gene" column, as a plain Python list.
cho_data_genes = cho_data["Gene"].tolist()

# Scraping the KEGG Classifications

In [7]:
def _parse_brite_definitions(brite_text):
    """Return ``{classification ID: description}`` pairs found in Brite cell text.

    Each line is split on its first run of whitespace.  ``str.split()`` with no
    separator also discards leading whitespace (including the non-breaking
    spaces used for indentation), so the first token is the candidate ID.
    Only tokens that are exactly five digits are kept.
    """
    definitions = {}
    for line in brite_text.split("\n"):
        parts = line.split(maxsplit=1)
        if len(parts) < 2:  # no description after the first token
            continue
        classification, description = parts
        if re.fullmatch(r"\d{5}", classification):
            definitions[classification] = description
    return definitions


def scrape_kegg_definitions(
    gene_id_list,
    output_path="../../classifications/atienza_maximo/kegg_definitions.csv",
    timeout=10,
):
    """Scrape Brite classification IDs and descriptions and write them to CSV.

    For every gene ID, the KEGG entry page (``BASE_URL`` + upper-cased ID) is
    fetched and the table cell following the ``Brite`` header is parsed for
    lines of the form ``<5-digit ID> <description>``.  Results from all genes
    are merged into one mapping (a later occurrence of an ID overwrites the
    earlier description) and saved with the columns ``Classification ID`` and
    ``Function``.

    Args:
        gene_id_list: Iterable of gene ID strings.
        output_path: Destination CSV path; defaults to the original
            hard-coded location.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. The result is written to ``output_path``.

    Genes whose request fails (network error or non-200 status) are reported
    with an ``ERROR:`` line and skipped; genes without a Brite cell are
    skipped silently.
    """
    definitions = {}
    for gene_id in gene_id_list:
        url = BASE_URL + gene_id.upper()
        try:
            # Without a timeout a single stalled connection blocks the loop forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print("ERROR:", gene_id, exc)
            continue
        if response.status_code != 200:
            print("ERROR:", gene_id)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        brite_tag = soup.find("th", string="Brite")
        # find_next_sibling returns None when the header has no following <td>.
        brite_cell = brite_tag.find_next_sibling("td") if brite_tag else None
        if brite_cell is None:
            continue

        definitions.update(_parse_brite_definitions(brite_cell.text.strip()))

    df = pd.DataFrame(
        list(definitions.items()), columns=["Classification ID", "Function"]
    )
    df.to_csv(output_path, index=False)

# Show every gene ID loaded from the Cho dataset.
print(cho_data_genes)
# NOTE(review): only five hard-coded gene IDs (the first five of the printed list) are
# scraped here, not all of cho_data_genes — presumably a trial run; confirm.
scrape_kegg_definitions(['YDL179w', 'YLR079w', 'YER111c', 'YBR200w', 'YJL194w'])

['YDL179w', 'YLR079w', 'YER111c', 'YBR200w', 'YJL194w', 'YLR274w', 'YBR202w', 'YPR019w', 'YBL023c', 'YEL032w', 'YGR044c', 'YML109w', 'YJL157c', 'YKL185w', 'YHR005c', 'YNR001c', 'YKL150w', 'YLR395c', 'YOR065w', 'YDL181w', 'YGR183c', 'YLR258w', 'YML110c', 'YLR273c', 'YCR005c', 'YCL040w', 'YMR256c', 'YIL009w', 'YLL040c', 'YNR016c', 'YBR067c', 'YPL058c', 'YGL055w', 'YGR281W', 'YBR083w', 'YBR054w', 'YKL116c', 'YPR002w', 'YNR067c', 'YBR158w', 'YDL117w', 'YGR035c', 'YHL026c', 'YMR007w', 'YMR254c', 'YNL046w', 'YOR264w', 'YPL066w', 'YBR052c', 'YPL158c', 'YHR022c', 'YPL004c', 'YBR157c', 'YNL078w', 'YOR066w', 'YMR031c', 'YBR053c', 'YDR511w', 'YLR254c', 'YDR033w', 'YKL163w', 'YBR231c', 'YDR368w', 'YLR050c', 'YLR049c', 'YOR273c', 'YLR015w', 'YGR109c', 'YPR120c', 'YDL127w', 'YNL289w', 'YPL256c', 'YMR199w', 'YJL187c', 'YDL003w', 'YMR076c', 'YKL042w', 'YFL008w', 'YPL241c', 'YMR078c', 'YLR212c', 'YNL225c', 'YPL209c', 'YJL074c', 'YNL233w', 'YLR313c', 'YGR041w', 'YGR152c', 'YDR507c', 'YLR286c', 'YIL159w'

In [8]:
def scrape_kegg_classifications(gene_id, timeout=10):
    """Fetch one gene's KEGG entry page and print its Brite hierarchy lines.

    The page at ``BASE_URL`` + upper-cased ``gene_id`` is downloaded.  The text
    of the cell after the ``Name`` header becomes the gene name.  Every line of
    the cell after the ``Brite`` header that contains between one and four
    non-breaking spaces is printed, prefixed by that count (in the sample run
    for ``YLR079w`` counts 1-3 are category lines and 4 is the gene line).

    Args:
        gene_id: Gene ID string.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with keys ``"Gene ID"`` (the ID as passed in) and
        ``"Gene Name"`` (``None`` when the page has no Name cell), or ``None``
        when the request fails or returns a non-200 status.
    """
    url = BASE_URL + gene_id.upper()  # Ensure uppercase
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print("ERROR:", gene_id, exc)
        return None
    if response.status_code != 200:
        print("ERROR:", gene_id)
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # find_next_sibling returns None when a header has no following <td>.
    name_tag = soup.find("th", string="Name")
    name_cell = name_tag.find_next_sibling("td") if name_tag else None
    gene_name = name_cell.text.strip() if name_cell is not None else None

    brite_tag = soup.find("th", string="Brite")
    brite_cell = brite_tag.find_next_sibling("td") if brite_tag else None
    brite_levels = brite_cell.text.strip() if brite_cell is not None else ""

    for line in brite_levels.split("\n"):
        # The number of non-breaking spaces is used as the hierarchy depth.
        hierarchy_level = line.count("\xa0")
        if 1 <= hierarchy_level <= 4:
            print(hierarchy_level, line)

    return {
        "Gene ID": gene_id,
        "Gene Name": gene_name,
    }

# Spot-check on a single gene; the printed levels and the returned dict appear in the cell output.
scrape_kegg_classifications("YLR079w")

1  09130 Environmental Information Processing
2   09132 Signal transduction
3    04011 MAPK signaling pathway - yeast
4     YLR079W (SIC1)
1  09140 Cellular Processes
2   09143 Cell growth and death
3    04111 Cell cycle - yeast
4     YLR079W (SIC1)
3    04113 Meiosis - yeast
4     YLR079W (SIC1)


{'Gene ID': 'YLR079w',
 'Gene Name': '(RefSeq) cyclin-dependent protein serine/threonine kinase inhibiting protein SIC1'}