#### This Code fetches protein GPCR family directly from PDB and extracts sequence, dssp3, dssp8 structure and saved as txt format and text to csv saved format#######

Extraction of Dataset Protein GPCR family from PDB and conversion of DSSP8 to DSSP3 saving as csv file
This Code fetches protein GPCR family directly from PDB

In [1]:
from IPython import get_ipython
from IPython.display import display
# # %%
!pip install biopython
# # %%
# # Install DSSP (mkdssp)
!sudo apt-get update
!sudo apt-get install dssp

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,757 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/

You can input any family classified PDB protein family in place of "Kinase"

In [2]:
import requests

def search_pdb_gpcr_structures(n=50):
    url = "https://search.rcsb.org/rcsbsearch/v2/query"

    # Query for PDB entries classified as GPCRs
    query = {
        "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
                "attribute": "struct_keywords.pdbx_keywords",
                "operator": "contains_words",
                "value": "G protein-coupled receptor"
            }
        },
        "return_type": "entry",
        "request_options": {
            "results_content_type": ["experimental"],
            "sort": [
                {
                    "sort_by": "score",
                    "direction": "desc"
                }
            ],
            "paginate": {
                "start": 0,
                "rows": n
            }
        }
    }

    response = requests.post(url, json=query)

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    data = response.json()

    if 'result_set' not in data:
        print("No results found.")
        return []

    pdb_ids = [item['identifier'] for item in data['result_set']]
    return pdb_ids

# Test it
gpcr_pdb_ids = search_pdb_gpcr_structures(50)
print("GPCR PDB IDs:", gpcr_pdb_ids)


GPCR PDB IDs: ['1AJE', '1E0S', '1RGP', '1ZBD', '2CJW', '2RAP', '5NI8', '5NIB', '1H30', '1A52', '1AC6', '1AJJ', '1BEC', '1BJ8', '1E6F', '1GP0', '1GP3', '1GQB', '1H68', '1HKF', '1M6P', '1NGR', '1RPM', '1TCR', '1UPV', '1UPW', '1US4', '1US5', '1W9R', '2BPD', '2BPE', '2BPH', '2BSD', '2BSE', '2BYN', '2BYP', '2BYQ', '2BYR', '2BYS', '2CDG', '2CL8', '2CNJ', '2J4A', '2J67', '2NR1', '2RQZ', '2RR0', '2RR2', '2UV3', '2UWI']


In [3]:
from Bio.PDB import PDBList, PDBParser, DSSP

# === Map 8-state DSSP to Q3 ===
def map_dssp8_to_q3(dssp8_char):
    if dssp8_char in ['H', 'G', 'I']:
        return 'H'
    elif dssp8_char in ['E', 'B']:
        return 'E'
    else:
        return 'C'

# === DSSP Extraction Function ===
def get_dssp_info(pdb_id, dssp_exe='mkdssp'):
    pdb_id = pdb_id.lower()
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(pdb_id, pdir='.', file_format='pdb')
    pdb_file = f"pdb{pdb_id}.ent"

    parser = PDBParser()
    structure = parser.get_structure(pdb_id, pdb_file)
    model = structure[0]

    dssp = DSSP(model, pdb_file, file_type='PDB', dssp=dssp_exe)

    dssp_keys = list(dssp.keys())
    primary_sequence = ''.join([dssp[key][1] for key in dssp_keys])
    dssp8_string = ''.join([dssp[key][2] for key in dssp_keys])
    dssp3_string = ''.join([map_dssp8_to_q3(dssp[key][2]) for key in dssp_keys])

    return primary_sequence, dssp8_string, dssp3_string

# === GPCR PDB IDs ===
gpcr_pdb_ids = [
    '1AJE', '1E0S', '1RGP', '1ZBD', '2CJW', '2RAP', '5NI8', '5NIB', '1H30', '1A52',
    '1AC6', '1AJJ', '1BEC', '1BJ8', '1E6F', '1GP0', '1GP3', '1GQB', '1H68', '1HKF',
    '1M6P', '1NGR', '1RPM', '1TCR', '1UPV', '1UPW', '1US4', '1US5', '1W9R', '2BPD',
    '2BPE', '2BPH', '2BSD', '2BSE', '2BYN', '2BYP', '2BYQ', '2BYR', '2BYS', '2CDG',
    '2CL8', '2CNJ', '2J4A', '2J67', '2NR1', '2RQZ', '2RR0', '2RR2', '2UV3', '2UWI'
]

# === LOOP OVER PDB IDs AND SAVE RESULTS ===
output_lines = []

for pdb_id in gpcr_pdb_ids:
    print(f"Processing {pdb_id}...")
    try:
        primary_seq, dssp8_str, dssp3_str = get_dssp_info(pdb_id)

        print(f"  → Lengths: Primary={len(primary_seq)}, Q8={len(dssp8_str)}, Q3={len(dssp3_str)}")

        entry = (
            f"PDB ID: {pdb_id}\n"
            f"Primary sequence:\n{primary_seq}\n"
            f"DSSP 8-state (Q8):\n{dssp8_str}\n"
            f"DSSP 3-state (Q3):\n{dssp3_str}\n"
            f"Lengths: Primary={len(primary_seq)}, Q8={len(dssp8_str)}, Q3={len(dssp3_str)}\n"
            + "-"*60 + "\n"
        )
        output_lines.append(entry)

    except Exception as e:
        print(f"  ✖ Error processing {pdb_id}: {e}")
        output_lines.append(f"PDB ID: {pdb_id} → ERROR: {e}\n{'-'*60}\n")

# === WRITE TO FILE ===
with open("gpcr_dssp_results.txt", "w") as f:
    f.writelines(output_lines)

print("\n✅ All results saved to 'gpcr_dssp_results.txt'")


Processing 1AJE...
Downloading PDB structure '1aje'...
  → Lengths: Primary=194, Q8=194, Q3=194
Processing 1E0S...
Downloading PDB structure '1e0s'...
  → Lengths: Primary=173, Q8=173, Q3=173
Processing 1RGP...
Downloading PDB structure '1rgp'...
  → Lengths: Primary=189, Q8=189, Q3=189
Processing 1ZBD...
Downloading PDB structure '1zbd'...




  → Lengths: Primary=301, Q8=301, Q3=301
Processing 2CJW...
Downloading PDB structure '2cjw'...




  → Lengths: Primary=351, Q8=351, Q3=351
Processing 2RAP...
Downloading PDB structure '2rap'...
  → Lengths: Primary=167, Q8=167, Q3=167
Processing 5NI8...
Downloading PDB structure '5ni8'...




  → Lengths: Primary=262, Q8=262, Q3=262
Processing 5NIB...
Downloading PDB structure '5nib'...




  → Lengths: Primary=262, Q8=262, Q3=262
Processing 1H30...
Downloading PDB structure '1h30'...
  → Lengths: Primary=391, Q8=391, Q3=391
Processing 1A52...
Downloading PDB structure '1a52'...





  → Lengths: Primary=479, Q8=479, Q3=479
Processing 1AC6...
Downloading PDB structure '1ac6'...
  → Lengths: Primary=220, Q8=220, Q3=220
Processing 1AJJ...
Downloading PDB structure '1ajj'...
  → Lengths: Primary=37, Q8=37, Q3=37
Processing 1BEC...
Downloading PDB structure '1bec'...
  → Lengths: Primary=238, Q8=238, Q3=238
Processing 1BJ8...
Downloading PDB structure '1bj8'...





  → Lengths: Primary=109, Q8=109, Q3=109
Processing 1E6F...
Downloading PDB structure '1e6f'...




  → Lengths: Primary=260, Q8=260, Q3=260
Processing 1GP0...
Downloading PDB structure '1gp0'...
  → Lengths: Primary=133, Q8=133, Q3=133
Processing 1GP3...
Downloading PDB structure '1gp3'...
  → Lengths: Primary=129, Q8=129, Q3=129
Processing 1GQB...
Downloading PDB structure '1gqb'...




  → Lengths: Primary=261, Q8=261, Q3=261
Processing 1H68...
Downloading PDB structure '1h68'...
  → Lengths: Primary=218, Q8=218, Q3=218
Processing 1HKF...
Downloading PDB structure '1hkf'...
  → Lengths: Primary=108, Q8=108, Q3=108
Processing 1M6P...
Downloading PDB structure '1m6p'...




  → Lengths: Primary=292, Q8=292, Q3=292
Processing 1NGR...
Downloading PDB structure '1ngr'...





  → Lengths: Primary=85, Q8=85, Q3=85
Processing 1RPM...
Downloading PDB structure '1rpm'...




  → Lengths: Primary=556, Q8=556, Q3=556
Processing 1TCR...
Downloading PDB structure '1tcr'...




  → Lengths: Primary=439, Q8=439, Q3=439
Processing 1UPV...
Downloading PDB structure '1upv'...
  → Lengths: Primary=232, Q8=232, Q3=232
Processing 1UPW...
Downloading PDB structure '1upw'...
  → Lengths: Primary=232, Q8=232, Q3=232
Processing 1US4...
Downloading PDB structure '1us4'...
  → Lengths: Primary=297, Q8=297, Q3=297
Processing 1US5...
Downloading PDB structure '1us5'...
  → Lengths: Primary=297, Q8=297, Q3=297
Processing 1W9R...
Downloading PDB structure '1w9r'...
  → Lengths: Primary=119, Q8=119, Q3=119
Processing 2BPD...
Downloading PDB structure '2bpd'...




  → Lengths: Primary=256, Q8=256, Q3=256
Processing 2BPE...
Downloading PDB structure '2bpe'...




  → Lengths: Primary=256, Q8=256, Q3=256
Processing 2BPH...
Downloading PDB structure '2bph'...




  → Lengths: Primary=256, Q8=256, Q3=256
Processing 2BSD...
Downloading PDB structure '2bsd'...




  → Lengths: Primary=762, Q8=762, Q3=762
Processing 2BSE...
Downloading PDB structure '2bse'...




  → Lengths: Primary=687, Q8=687, Q3=687
Processing 2BYN...
Downloading PDB structure '2byn'...




  → Lengths: Primary=1066, Q8=1066, Q3=1066
Processing 2BYP...
Downloading PDB structure '2byp'...




  → Lengths: Primary=1113, Q8=1113, Q3=1113
Processing 2BYQ...
Downloading PDB structure '2byq'...




  → Lengths: Primary=1069, Q8=1069, Q3=1069
Processing 2BYR...
Downloading PDB structure '2byr'...





  → Lengths: Primary=2101, Q8=2101, Q3=2101
Processing 2BYS...
Downloading PDB structure '2bys'...




  → Lengths: Primary=2046, Q8=2046, Q3=2046
Processing 2CDG...
Downloading PDB structure '2cdg'...




  → Lengths: Primary=428, Q8=428, Q3=428
Processing 2CL8...
Downloading PDB structure '2cl8'...




  → Lengths: Primary=256, Q8=256, Q3=256
Processing 2CNJ...
Downloading PDB structure '2cnj'...
  → Lengths: Primary=143, Q8=143, Q3=143
Processing 2J4A...
Downloading PDB structure '2j4a'...
  → Lengths: Primary=250, Q8=250, Q3=250
Processing 2J67...
Downloading PDB structure '2j67'...




  → Lengths: Primary=275, Q8=275, Q3=275
Processing 2NR1...
Downloading PDB structure '2nr1'...
  → Lengths: Primary=23, Q8=23, Q3=23
Processing 2RQZ...
Downloading PDB structure '2rqz'...
  → Lengths: Primary=38, Q8=38, Q3=38
Processing 2RR0...
Downloading PDB structure '2rr0'...
  → Lengths: Primary=38, Q8=38, Q3=38
Processing 2RR2...
Downloading PDB structure '2rr2'...
  → Lengths: Primary=38, Q8=38, Q3=38
Processing 2UV3...
Downloading PDB structure '2uv3'...




  → Lengths: Primary=242, Q8=242, Q3=242
Processing 2UWI...
Downloading PDB structure '2uwi'...




  → Lengths: Primary=254, Q8=254, Q3=254

✅ All results saved to 'gpcr_dssp_results.txt'


In [4]:
import csv

input_file = "gpcr_dssp_results.txt"
output_file = "gpcr_dssp_results.csv"

entries = []

with open(input_file, "r") as f:
    lines = f.read().split("------------------------------------------------------------\n")

    for block in lines:
        if not block.strip():
            continue

        lines = block.strip().splitlines()

        try:
            pdb_id = lines[0].split(":")[1].strip()
            primary_seq = lines[2]
            dssp_q8 = lines[4]
            dssp_q3 = lines[6]
            length_line = lines[7]
            length = length_line.split("Primary=")[1].split(",")[0].strip()

            entries.append([pdb_id, primary_seq, dssp_q8, dssp_q3, length])
        except Exception as e:
            print(f"Skipping a block due to parsing error: {e}")

# Write to CSV
with open(output_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["PDB_ID", "Primary_Sequence", "DSSP_Q8", "DSSP_Q3", "Length"])
    writer.writerows(entries)

print(f"✅ CSV file saved as '{output_file}'")


✅ CSV file saved as 'gpcr_dssp_results.csv'
