#### This code fetches protein kinase family structures directly from the PDB, extracts the primary sequence and the DSSP 8-state and 3-state secondary structure, saves the results as a text file, and converts that text file to CSV ####

Extraction of a protein kinase family dataset from the PDB, conversion of DSSP 8-state (DSSP8) assignments to 3-state (DSSP3), and export to a CSV file.
This code fetches protein kinase family entries directly from the PDB.

In [None]:
from IPython import get_ipython
from IPython.display import display
# # %%
!pip install biopython
# # %%
# # Install DSSP (mkdssp)
!sudo apt-get update
!sudo apt-get install dssp

You can substitute the keyword of any other protein family classified in the PDB for "kinase" in the query below.

In [7]:
#### This Code fetches protein kinase family directly from PDB #######

import requests

def search_pdb_kinase_structures(n=50, keyword="kinase", timeout=30):
    """Search the RCSB PDB for entries whose structure keywords contain a word.

    Args:
        n: Number of rows requested from the search service (first page only).
        keyword: Word matched against ``struct_keywords.pdbx_keywords``.
            Defaults to ``"kinase"``; pass another family keyword to search
            for a different protein family.
        timeout: Seconds to wait for the search service before ``requests``
            raises a timeout error instead of blocking forever.

    Returns:
        List of PDB identifiers in the order returned by the service
        (sorted by descending score). An empty list is returned when the
        service reports an error or when nothing matches.
    """
    url = "https://search.rcsb.org/rcsbsearch/v2/query"

    # Query for PDB entries whose keywords contain the requested word
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "struct_keywords.pdbx_keywords",
                "operator": "contains_words",
                "value": keyword,
            },
        },
        "return_type": "entry",
        "request_options": {
            "results_content_type": ["experimental"],
            "sort": [
                {
                    "sort_by": "score",
                    "direction": "desc",
                }
            ],
            "paginate": {
                "start": 0,
                "rows": n,
            },
        },
    }

    response = requests.post(url, json=query, timeout=timeout)

    # The search service answers 204 (No Content) with an empty body when the
    # query is valid but matches nothing; that is not an error condition.
    if response.status_code == 204:
        print("No results found.")
        return []

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    data = response.json()

    if 'result_set' not in data:
        print("No results found.")
        return []

    return [item['identifier'] for item in data['result_set']]

# Quick check: request the first 50 matching entries and show their IDs.
kinase_pdb_ids = search_pdb_kinase_structures(n=50)
print("Kinase PDB IDs:", kinase_pdb_ids)



Kinase PDB IDs: ['13PK', '16PK', '1A06', '1ABQ', '1AQ2', '1AWO', '1BG0', '1GMI', '1GSJ', '1GZK', '1GZN', '1GZO', '1H8F', '1KOA', '1KOB', '1KWA', '1NKS', '1OEN', '1OH9', '1OHA', '1OHB', '1OI2', '1OI3', '1OI9', '1OIQ', '1OIR', '1OIT', '1OIU', '1OIY', '1OL7', '1PHK', '1PHP', '1PKM', '1QF9', '1RGS', '1RL3', '1UN9', '3TMK', '4ZSG', '5OAT', '1H4L', '1A0B', '1AGW', '1AQ1', '1AYL', '1E1V', '1E1X', '1E2E', '1E2L', '1E2Q']


In [11]:
from Bio.PDB import PDBList, PDBParser, DSSP

# === Map 8-state DSSP to Q3 ===
def map_dssp8_to_q3(dssp8_char):
    """Collapse one DSSP 8-state code into the 3-state (Q3) alphabet.

    'H', 'G' and 'I' become 'H'; 'E' and 'B' become 'E'; every other
    value becomes 'C'.
    """
    q3_classes = (
        ('H', ('H', 'G', 'I')),
        ('E', ('E', 'B')),
    )
    for q3_code, members in q3_classes:
        if dssp8_char in members:
            return q3_code
    # Anything not listed above is treated as coil.
    return 'C'

# === DSSP Extraction Function ===
def get_dssp_info(pdb_id, dssp_exe='mkdssp'):
    """Fetch a PDB entry and return its sequence and DSSP assignments.

    The PDB-format file is retrieved into the current directory, DSSP is
    run on the first model, and the per-residue results are joined in DSSP
    key order into single strings with no separator.

    Args:
        pdb_id: PDB identifier; it is lower-cased before use.
        dssp_exe: Name or path of the DSSP executable passed to
            ``Bio.PDB.DSSP``.

    Returns:
        Tuple ``(primary_sequence, dssp8_string, dssp3_string)``; the three
        strings have the same length.

    Raises:
        FileNotFoundError: If no structure file is on disk after the
            retrieval attempt.
    """
    import os

    pdb_id = pdb_id.lower()

    # Use the path PDBList reports instead of re-deriving its naming scheme.
    pdb_file = PDBList().retrieve_pdb_file(pdb_id, pdir='.', file_format='pdb')
    if not pdb_file or not os.path.isfile(pdb_file):
        # Fail here with a clear message rather than inside the parser.
        raise FileNotFoundError(
            f"PDB file for '{pdb_id}' could not be retrieved"
        )

    structure = PDBParser().get_structure(pdb_id, pdb_file)
    model = structure[0]

    dssp = DSSP(model, pdb_file, file_type='PDB', dssp=dssp_exe)

    # Single pass over the DSSP records: index 1 is the amino acid,
    # index 2 is the 8-state secondary-structure code.
    residues = []
    states = []
    for key in dssp.keys():
        record = dssp[key]
        residues.append(record[1])
        states.append(record[2])

    primary_sequence = ''.join(residues)
    dssp8_string = ''.join(states)
    dssp3_string = ''.join(map_dssp8_to_q3(state) for state in states)

    return primary_sequence, dssp8_string, dssp3_string

# === PDB IDs to process (kinase family) ===
# Replace with the IDs of any other protein family.
kinase_pdb_ids = (
    "13PK 16PK 1A06 1ABQ 1AQ2 1AWO 1BG0 1GMI 1GSJ 1GZK "
    "1GZN 1GZO 1H8F 1KOA 1KOB 1KWA 1NKS 1OEN 1OH9 1OHA "
    "1OHB 1OI2 1OI3 1OI9 1OIQ 1OIR 1OIT 1OIU 1OIY 1OL7 "
    "1PHK 1PHP 1PKM 1QF9 1RGS 1RL3 1UN9 3TMK 4ZSG 5OAT "
    "1H4L 1A0B 1AGW 1AQ1 1AYL 1E1V 1E1X 1E2E 1E2L 1E2Q"
).split()

# === LOOP OVER PDB IDs AND SAVE RESULTS ===
# Each PDB ID contributes one text entry terminated by a line of 60 dashes;
# IDs that fail contribute a one-line error entry instead.
output_lines = []

for pdb_id in kinase_pdb_ids:
    print(f"Processing {pdb_id}...")
    try:
        # Only the extraction itself is expected to fail (download, parsing,
        # DSSP execution), so nothing else is guarded.
        primary_seq, dssp8_str, dssp3_str = get_dssp_info(pdb_id)
    except Exception as e:
        # Best effort: record the failure and carry on with the next ID.
        print(f"  ✖ Error processing {pdb_id}: {e}")
        output_lines.append(f"PDB ID: {pdb_id} → ERROR: {e}\n{'-'*60}\n")
        continue

    # Built once and reused for both the console and the file entry.
    lengths = (
        f"Lengths: Primary={len(primary_seq)}, "
        f"Q8={len(dssp8_str)}, Q3={len(dssp3_str)}"
    )
    print(f"  → {lengths}")

    entry = (
        f"PDB ID: {pdb_id}\n"
        f"Primary sequence:\n{primary_seq}\n"
        f"DSSP 8-state (Q8):\n{dssp8_str}\n"
        f"DSSP 3-state (Q3):\n{dssp3_str}\n"
        f"{lengths}\n"
        + "-"*60 + "\n"
    )
    output_lines.append(entry)

# === WRITE TO FILE ===
# Error entries contain the non-ASCII character "→", so the encoding is set
# explicitly instead of relying on the platform default.
with open("kinase_dssp_results.txt", "w", encoding="utf-8") as f:
    f.writelines(output_lines)

print("\n✅ All results saved to 'kinase_dssp_results.txt'")


Processing 13PK...
Structure exists: './pdb13pk.ent' 




  → Lengths: Primary=1660, Q8=1660, Q3=1660
Processing 16PK...
Structure exists: './pdb16pk.ent' 
  → Lengths: Primary=415, Q8=415, Q3=415
Processing 1A06...
Structure exists: './pdb1a06.ent' 
  → Lengths: Primary=279, Q8=279, Q3=279
Processing 1ABQ...
Structure exists: './pdb1abq.ent' 
  → Lengths: Primary=56, Q8=56, Q3=56
Processing 1AQ2...
Structure exists: './pdb1aq2.ent' 
  → Lengths: Primary=534, Q8=534, Q3=534
Processing 1AWO...
Structure exists: './pdb1awo.ent' 





  → Lengths: Primary=57, Q8=57, Q3=57
Processing 1BG0...
Structure exists: './pdb1bg0.ent' 
  → Lengths: Primary=356, Q8=356, Q3=356
Processing 1GMI...
Structure exists: './pdb1gmi.ent' 
  → Lengths: Primary=135, Q8=135, Q3=135
Processing 1GSJ...
Structure exists: './pdb1gsj.ent' 
  → Lengths: Primary=258, Q8=258, Q3=258
Processing 1GZK...
Structure exists: './pdb1gzk.ent' 
  → Lengths: Primary=271, Q8=271, Q3=271
Processing 1GZN...
Structure exists: './pdb1gzn.ent' 
  → Lengths: Primary=271, Q8=271, Q3=271
Processing 1GZO...
Structure exists: './pdb1gzo.ent' 
  → Lengths: Primary=271, Q8=271, Q3=271
Processing 1H8F...
Structure exists: './pdb1h8f.ent' 




  → Lengths: Primary=702, Q8=702, Q3=702
Processing 1KOA...
Structure exists: './pdb1koa.ent' 
  → Lengths: Primary=447, Q8=447, Q3=447
Processing 1KOB...
Structure exists: './pdb1kob.ent' 




  → Lengths: Primary=704, Q8=704, Q3=704
Processing 1KWA...
Structure exists: './pdb1kwa.ent' 





  → Lengths: Primary=174, Q8=174, Q3=174
Processing 1NKS...
Structure exists: './pdb1nks.ent' 
  → Lengths: Primary=1164, Q8=1164, Q3=1164
Processing 1OEN...
Structure exists: './pdb1oen.ent' 
  → Lengths: Primary=524, Q8=524, Q3=524
Processing 1OH9...
Structure exists: './pdb1oh9.ent' 
  → Lengths: Primary=258, Q8=258, Q3=258
Processing 1OHA...
Downloading PDB structure '1oha'...
  → Lengths: Primary=258, Q8=258, Q3=258
Processing 1OHB...
Downloading PDB structure '1ohb'...
  → Lengths: Primary=258, Q8=258, Q3=258
Processing 1OI2...
Downloading PDB structure '1oi2'...




  → Lengths: Primary=672, Q8=672, Q3=672
Processing 1OI3...
Downloading PDB structure '1oi3'...




  → Lengths: Primary=672, Q8=672, Q3=672
Processing 1OI9...
Downloading PDB structure '1oi9'...




  → Lengths: Primary=1104, Q8=1104, Q3=1104
Processing 1OIQ...
Downloading PDB structure '1oiq'...
  → Lengths: Primary=264, Q8=264, Q3=264
Processing 1OIR...
Downloading PDB structure '1oir'...
  → Lengths: Primary=287, Q8=287, Q3=287
Processing 1OIT...
Downloading PDB structure '1oit'...
  → Lengths: Primary=273, Q8=273, Q3=273
Processing 1OIU...
Downloading PDB structure '1oiu'...




  → Lengths: Primary=1075, Q8=1075, Q3=1075
Processing 1OIY...
Downloading PDB structure '1oiy'...




  → Lengths: Primary=1107, Q8=1107, Q3=1107
Processing 1OL7...
Downloading PDB structure '1ol7'...
  → Lengths: Primary=263, Q8=263, Q3=263
Processing 1PHK...
Downloading PDB structure '1phk'...
  → Lengths: Primary=277, Q8=277, Q3=277
Processing 1PHP...
Downloading PDB structure '1php'...
  → Lengths: Primary=394, Q8=394, Q3=394
Processing 1PKM...
Downloading PDB structure '1pkm'...
  → Lengths: Primary=519, Q8=519, Q3=519
Processing 1QF9...
Downloading PDB structure '1qf9'...
  → Lengths: Primary=194, Q8=194, Q3=194
Processing 1RGS...
Downloading PDB structure '1rgs'...
  → Lengths: Primary=264, Q8=264, Q3=264
Processing 1RL3...
Downloading PDB structure '1rl3'...




  → Lengths: Primary=527, Q8=527, Q3=527
Processing 1UN9...
Downloading PDB structure '1un9'...




  → Lengths: Primary=1074, Q8=1074, Q3=1074
Processing 3TMK...
Downloading PDB structure '3tmk'...




  → Lengths: Primary=1711, Q8=1711, Q3=1711
Processing 4ZSG...
Downloading PDB structure '4zsg'...
  → Lengths: Primary=347, Q8=347, Q3=347
Processing 5OAT...
Downloading PDB structure '5oat'...




  → Lengths: Primary=2160, Q8=2160, Q3=2160
Processing 1H4L...
Downloading PDB structure '1h4l'...




  → Lengths: Primary=850, Q8=850, Q3=850
Processing 1A0B...
Downloading PDB structure '1a0b'...
  → Lengths: Primary=117, Q8=117, Q3=117
Processing 1AGW...
Downloading PDB structure '1agw'...




  → Lengths: Primary=549, Q8=549, Q3=549
Processing 1AQ1...
Downloading PDB structure '1aq1'...
  → Lengths: Primary=277, Q8=277, Q3=277
Processing 1AYL...
Downloading PDB structure '1ayl'...
  → Lengths: Primary=532, Q8=532, Q3=532
Processing 1E1V...
Downloading PDB structure '1e1v'...
  → Lengths: Primary=290, Q8=290, Q3=290
Processing 1E1X...
Downloading PDB structure '1e1x'...
  → Lengths: Primary=290, Q8=290, Q3=290
Processing 1E2E...
Downloading PDB structure '1e2e'...
  → Lengths: Primary=209, Q8=209, Q3=209
Processing 1E2L...
Downloading PDB structure '1e2l'...




  → Lengths: Primary=614, Q8=614, Q3=614
Processing 1E2Q...
Downloading PDB structure '1e2q'...
  → Lengths: Primary=209, Q8=209, Q3=209

✅ All results saved to 'kinase_dssp_results.txt'


In [12]:
import csv

# Convert the text results file into a CSV with one row per PDB entry.
input_file = "kinase_dssp_results.txt"
output_file = "kinase_dssp_results.csv"

entries = []

# The results file may contain the non-ASCII character "→" (error entries),
# so it is read with an explicit encoding.
with open(input_file, "r", encoding="utf-8") as f:
    # Entries are separated by a line of dashes.
    blocks = f.read().split("------------------------------------------------------------\n")

for block in blocks:
    if not block.strip():
        continue

    block_lines = block.strip().splitlines()

    try:
        # Expected layout: ID line, then label/value line pairs for the
        # sequence, Q8 and Q3 strings, then the "Lengths:" summary line.
        pdb_id = block_lines[0].split(":")[1].strip()
        primary_seq = block_lines[2]
        dssp_q8 = block_lines[4]
        dssp_q3 = block_lines[6]
        length_line = block_lines[7]
        length = length_line.split("Primary=")[1].split(",")[0].strip()
    except IndexError as e:
        # Blocks that do not follow the layout (e.g. error entries) are
        # reported and left out of the CSV.
        print(f"Skipping a block due to parsing error: {e}")
        continue

    entries.append([pdb_id, primary_seq, dssp_q8, dssp_q3, length])

# Write to CSV
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["PDB_ID", "Primary_Sequence", "DSSP_Q8", "DSSP_Q3", "Length"])
    writer.writerows(entries)

print(f"✅ CSV file saved as '{output_file}'")


✅ CSV file saved as 'kinase_dssp_results.csv'
