In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Load the input table and reduce it to one row per (gene, UniProt accession)
# pair. Rows with no accession cannot be looked up, so they are dropped up
# front instead of producing a request for ".../nan/entry".
df = pd.read_csv("filtered_gene_data.csv")
unique_genes = (
    df[['GENE_NAME', 'UniProt_ID']]
    .dropna(subset=['UniProt_ID'])
    .drop_duplicates()
)

# Column order of the output file; also used to guarantee a header row when
# no PDB record is found at all.
output_columns = ['GENE_NAME', 'UniProt_ID', 'PDB_ID', 'CHAIN']

# To store the final results (one dict per PDB row found on an entry page)
pdb_records = []

# Simulate browser headers
headers = {
    "User-Agent": "Mozilla/5.0"
}

# Process each UniProt ID
for _, row in unique_genes.iterrows():
    gene = row['GENE_NAME']
    uniprot_id = row['UniProt_ID']
    url = f"https://www.uniprot.org/uniprotkb/{uniprot_id}/entry"

    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {uniprot_id}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        structure_section = soup.find('section', {'id': 'structure'})
        # NOTE(review): if the entry page is rendered client-side, this
        # section will be absent from the raw HTML and nothing is collected
        # for this accession — confirm against a live response.
        if not structure_section:
            continue

        # A distinct loop name keeps the outer DataFrame `row` intact.
        for table_row in structure_section.find_all('tr'):
            cols = table_row.find_all('td')
            if len(cols) >= 3:
                db_source = cols[0].text.strip()
                pdb_id = cols[1].text.strip()
                chains = cols[2].text.strip()
                if db_source == "PDB":
                    pdb_records.append({
                        'GENE_NAME': gene,
                        'UniProt_ID': uniprot_id,
                        'PDB_ID': pdb_id,
                        'CHAIN': chains
                    })
    except Exception as e:
        # Best-effort scrape: report the failure and move on to the next ID.
        print(f"Error for {uniprot_id}: {e}")
        continue
    finally:
        # Polite scraping: `finally` also runs on the `continue` paths above,
        # so failed or empty lookups are throttled as well.
        time.sleep(1)

# Save to CSV. Passing `columns` keeps the header even when nothing was found.
output_df = pd.DataFrame(pdb_records, columns=output_columns)
output_df.to_csv("gene_pdb_chain_mapping.csv", index=False)
print("Saved: gene_pdb_chain_mapping.csv")


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-1-1362933835.py", line 25, in <cell line: 0>
    response = requests.get(url, headers=headers, timeout=10)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.

TypeError: object of type 'NoneType' has no len()

In [1]:
import pandas as pd
import requests
import time

# Load the input table and reduce it to one row per (gene, UniProt accession)
# pair. Rows with no accession are dropped so no request is made for "nan".
df = pd.read_csv("filtered_gene_data.csv")
unique_genes = (
    df[['GENE_NAME', 'UniProt_ID']]
    .dropna(subset=['UniProt_ID'])
    .drop_duplicates()
)

# Column order of the output file; also guarantees a header row when no PDB
# cross-reference is found, so the file can still be read back with read_csv.
output_columns = ["GENE_NAME", "UniProt_ID", "PDB_ID", "CHAIN"]

# Prepare list for results (one dict per PDB cross-reference)
results = []

# Loop through each UniProt ID
for _, row in unique_genes.iterrows():
    gene = row['GENE_NAME']
    uniprot_id = row['UniProt_ID']
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"

    try:
        res = requests.get(url, timeout=10)
        if res.status_code != 200:
            # Report the skip so missing accessions are visible in the output.
            print(f"Failed to fetch {uniprot_id} (HTTP {res.status_code})")
            continue

        data = res.json()

        # Look for PDB cross-references
        xrefs = data.get("uniProtKBCrossReferences", [])
        for ref in xrefs:
            if ref.get("database") == "PDB":
                pdb_id = ref.get("id", "")
                properties = ref.get("properties", [])
                # Stays empty when the reference has no "Chains" property;
                # if several are present, the last one wins.
                chain_info = ""
                for prop in properties:
                    if prop.get("key") == "Chains":
                        chain_info = prop.get("value", "")
                results.append({
                    "GENE_NAME": gene,
                    "UniProt_ID": uniprot_id,
                    "PDB_ID": pdb_id,
                    "CHAIN": chain_info
                })
    except Exception as e:
        # Best-effort lookup: report the failure and move on to the next ID.
        print(f"Failed for {uniprot_id}: {e}")
        continue
    finally:
        # Slight delay to be polite; `finally` also covers the `continue`
        # paths, so failed lookups are throttled too.
        time.sleep(0.2)

# Save output. Passing `columns` keeps the header even when nothing was found.
output_df = pd.DataFrame(results, columns=output_columns)
output_df.to_csv("gene_pdb_chain_mapping_api.csv", index=False)
print("✅ Saved as 'gene_pdb_chain_mapping_api.csv'")


✅ Saved as 'gene_pdb_chain_mapping_api.csv'


In [3]:
import pandas as pd

# Read the gene -> PDB mapping CSV from the working directory
# (adjust the path if the file lives elsewhere).
df = pd.read_csv(filepath_or_buffer="gene_pdb_chain_mapping_api.csv")

# Break a raw CHAIN value into its chain label and residue range
def split_chain_residue(entry):
    """Split ``entry`` at its first '=' into a two-element Series.

    The first element is the text before the first '=', the second is
    everything after it; both are stripped of surrounding whitespace.
    When there is no '=', the second element is an empty string. A
    missing value (as judged by ``pd.isna``) yields two empty strings.
    """
    if pd.isna(entry):
        chain_label, residue_range = "", ""
    else:
        # partition() cuts at the first '=' only; later '=' stay in the tail.
        chain_label, _, residue_range = entry.partition('=')
        chain_label = chain_label.strip()
        residue_range = residue_range.strip()
    return pd.Series([chain_label, residue_range])

# Run the splitter over every CHAIN value; the two resulting columns replace
# CHAIN (label only) and populate the new RESIDUE_RANGE column.
chain_parts = df['CHAIN'].apply(split_chain_residue)
df[['CHAIN', 'RESIDUE_RANGE']] = chain_parts

# Write the cleaned table next to the input, without the index column
df.to_csv("gene_pdb_chain_mapping_cleaned.csv", index=False)
print("Saved cleaned CSV as gene_pdb_chain_mapping_cleaned.csv")


Saved cleaned CSV as gene_pdb_chain_mapping_cleaned.csv
