In [None]:
import pandas as pd
import re

# Separator between fields: any run of whitespace.
pattern = re.compile(r'\s+')

# Pull every line of `pdbtosp.txt` into memory.
with open('pdbtosp.txt', 'r') as file:
    lines = file.readlines()

# One list of fields per input line; surrounding whitespace is trimmed before
# splitting so no empty leading/trailing field is produced for non-blank lines.
data = [pattern.split(line.strip()) for line in lines]

# Rows can have different field counts; the frame is as wide as the longest row.
df = pd.DataFrame(data)

# Plain-text dump of the parsed table.
print(df)


In [None]:
# Peek at the last rows of the raw parsed table.
df.tail()

In [None]:
# Positional indices of the two fields kept from the raw table: column 0 and
# column 5 (labelled 'PDB' and 'SP' further down the notebook).
columns_to_extract = [0, 5]

# Take an explicit copy: `df_extracted` is modified by later cells (columns
# relabelled, rows dropped, 'SP' rewritten). Working on an independent frame
# rather than a selection of `df` avoids pandas' SettingWithCopyWarning and
# guarantees `df` itself is never touched.
df_extracted = df.iloc[:, columns_to_extract].copy()

In [None]:
# Display the two-column selection (columns 0 and 5 of `df`).
df_extracted

In [None]:
# Labels for the two selected columns, in positional order.
column_names = ['PDB', 'SP']
# Assigning to `.columns` relabels `df_extracted` itself (no new frame is made).
df_extracted.columns = column_names

In [None]:
# Preview the frame after its columns were relabelled 'PDB' and 'SP'.
df_extracted.head()

In [None]:
# Inspect the 'SP' column on its own.
df_extracted['SP']

In [None]:
# Discard rows with a missing value in any column. The result is rebound
# instead of using `inplace=True`, so the frame object that earlier cells
# displayed is not mutated behind their back and the cell stays re-runnable.
df_extracted = df_extracted.dropna()

In [None]:
# Remove every '(' and ')' character from each 'SP' value. The translation
# table maps both bracket characters to "delete".
df_extracted['SP'] = df_extracted['SP'].map(
    lambda accession: accession.translate(str.maketrans('', '', '()'))
)


In [None]:
# Preview the frame after the parentheses were removed from 'SP'.
df_extracted.head()

In [None]:
# Row count before de-duplication.
len(df_extracted)

In [None]:
# Keep only the first row seen for each 'SP' value; later rows repeating an
# 'SP' value are discarded.
df_extracted_dedup = df_extracted.drop_duplicates(subset='SP', keep='first')

In [None]:
# Preview the de-duplicated frame.
df_extracted_dedup.head()

In [None]:
# Renumber the rows 0..n-1. `drop=True` discards the old row labels instead of
# inserting them as an extra column (the next cell keeps only 'PDB' and 'SP'
# anyway). Rebinding the result rather than using `inplace=True` keeps the cell
# re-runnable: the in-place form inserts another column on every execution and
# eventually fails on a column-name clash.
df_extracted_dedup = df_extracted_dedup.reset_index(drop=True)

In [None]:
# Keep just the 'PDB' and 'SP' columns, in that order, for the final table.
df_final = df_extracted_dedup[['PDB' , 'SP']]

In [None]:
# Preview the final two-column table.
df_final.head()

In [None]:
# Row count of the final table.
len(df_final)

In [None]:
# Write the final table to `pdb2sp.csv` in the current working directory.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an unnamed leading column — confirm downstream readers expect it.
df_final.to_csv('./pdb2sp.csv')

In [None]:
import requests

# Specify the PDB ID
pdb_id = "1XYZ"  # Replace with the desired PDB ID

# Address of the entry's `.cif` file on the RCSB download server.
url = f"https://files.rcsb.org/download/{pdb_id}.cif"

# Fetch the file. Without a timeout `requests.get` can block indefinitely on a
# stalled connection, so the wait is bounded (seconds).
response = requests.get(url, timeout=30)
if response.status_code == 200:
    # Binary mode: the payload is stored exactly as received.
    with open(f"{pdb_id}.cif", "wb") as file:
        file.write(response.content)
    print("PDB file downloaded successfully.")
else:
    print("Failed to download PDB file.")


In [None]:
from Bio import PDB

# `seq1` converts three-letter residue names to one-letter codes. It replaces
# `PDB.Polypeptide.three_to_one`, which raised KeyError for residues outside the
# 20 standard amino acids and has been removed from recent Biopython releases.
from Bio.SeqUtils import seq1

# Specify the path to the CIF file
cif_file = '1XYZ.cif'  # Replace with the path to your CIF file

# Create a parser object
parser = PDB.MMCIFParser()

# Parse the CIF file
structure = parser.get_structure('protein', cif_file)

# Extract the protein sequence
model = structure[0]  # Assuming there's only one model in the structure
chain = model['A']  # Assuming the protein sequence is in chain A, change as needed

# One letter per amino-acid residue, in chain order. Residue names `seq1` does
# not recognise come out as 'X' (its default `undef_code`), so the sequence
# keeps one position per residue instead of aborting. `join` also avoids
# rebuilding the string on every iteration.
sequence = ''.join(
    seq1(residue.get_resname())
    for residue in chain
    if PDB.is_aa(residue)
)

# Display the protein sequence
print(sequence)


In [None]:
from Bio import PDB

# Specify the path to the CIF file
cif_file = '1XYZ.cif'  # Replace with the path to your CIF file

# Create a parser object
parser = PDB.MMCIFParser()

# Parse the CIF file
structure = parser.get_structure('protein', cif_file)

# Select the chain to analyse
model = structure[0]  # Assuming there's only one model in the structure
chain = model['A']  # Assuming the protein sequence is in chain A, change as needed

# One slot per residue of the chain, in chain order: the alpha-carbon atom for
# amino-acid residues that have one, None for everything else. Checking
# `'CA' in residue` avoids a KeyError on residues whose CA atom is missing, and
# doing the `is_aa` test here runs it once per residue instead of once per pair.
ca_atoms = [
    residue['CA'] if PDB.is_aa(residue) and 'CA' in residue else None
    for residue in chain
]

# Square matrix indexed by residue position in the chain. Rows/columns that
# belong to a slot without a CA atom stay at 0.0.
num_residues = len(ca_atoms)
distance_matrix = [[0.0] * num_residues for _ in range(num_residues)]

# Subtracting two Biopython atoms yields the distance between them.
for i, atom_i in enumerate(ca_atoms):
    if atom_i is None:
        continue
    for j, atom_j in enumerate(ca_atoms):
        if atom_j is not None:
            distance_matrix[i][j] = atom_i - atom_j




In [None]:
import numpy as np

In [None]:
# Convert `distance_matrix` to an array to read off its dimensions.
np.array(distance_matrix).shape

In [None]:
import requests
import time

def download_pdb_file(pdb_id, max_retries=3, sleep_time_in_sec=15, timeout=30):
    """Download the `.cif` file of a PDB entry from RCSB, retrying on failure.

    The file is saved as ``<pdb_id>.cif`` in the current working directory.

    Args:
        pdb_id: Entry identifier; inserted verbatim into the download URL and
            the output file name.
        max_retries: Maximum number of download attempts.
        sleep_time_in_sec: Pause between two consecutive attempts, in seconds.
            No pause is made after the last attempt.
        timeout: Seconds to wait for the server before an attempt is counted
            as failed. Prevents a stalled connection from blocking forever.

    Returns:
        True if the file was downloaded and written, False if every attempt
        failed (non-200 status or a ``requests`` exception).
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.cif"

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                with open(f"{pdb_id}.cif", "wb") as file:
                    file.write(response.content)
                print("PDB file downloaded successfully.")
                return True
            print(f"Failed to download PDB file. (Attempt {attempt})")
        except requests.exceptions.RequestException as e:
            # Network-level problems (connection errors, timeouts, ...) are
            # reported and treated like any other failed attempt.
            print(f"Error occurred: {e}")

        # Wait only when another attempt will follow; sleeping after the final
        # failure would just delay the False result.
        if attempt < max_retries:
            time.sleep(sleep_time_in_sec)

    print("Exceeded maximum retry attempts. Failed to download PDB file.")
    return False

# Example usage
pdb_id = "1XYZ"
# The call returns True on success and False once every attempt has failed;
# as the last expression of the cell, that value is shown as the cell output.
download_pdb_file(pdb_id, max_retries=3)


In [None]:
from Bio.PDB import *
import numpy as np

def compute_angles_distance_and_amino_acids(pdb_file):
    """Collect phi/psi angles and a CA-CA distance matrix for a whole structure.

    The file is parsed with ``MMCIFParser`` and only the first model is used.
    Every chain is split into polypeptides with ``PPBuilder``; the residues of
    all polypeptides of all chains are pooled, in iteration order, into one
    list that indexes both outputs.

    Args:
        pdb_file: Path handed to ``MMCIFParser.get_structure`` (an mmCIF file,
            despite the parameter name).

    Returns:
        A tuple ``(phi_psi_angles, distance_matrix, amino_acids_count)``:

        * ``phi_psi_angles`` - ``np.ndarray`` built from the per-residue
          ``(phi, psi)`` tuples of ``get_phi_psi_list``.
          NOTE(review): Biopython reports undefined angles as ``None``, so
          this is presumably an object array - confirm before doing maths.
        * ``distance_matrix`` - float array of shape ``(n, n)``; entry
          ``[i, j]`` is the distance between the CA atoms of pooled residues
          ``i`` and ``j``.
        * ``amino_acids_count`` - ``n``, the number of pooled residues.

    Raises:
        KeyError: if a pooled residue has no ``'CA'`` atom.
    """
    parser = MMCIFParser()
    structure = parser.get_structure('protein', pdb_file)

    model = structure[0]

    # A single builder is enough; it is reused for every chain.
    builder = PPBuilder()

    phi_psi_angles = []
    residues = []

    for chain in model:
        for poly in builder.build_peptides(chain):
            # get_phi_psi_list yields one (phi, psi) tuple per residue of the
            # polypeptide, in the same order as the residues themselves.
            for residue, angles in zip(poly, poly.get_phi_psi_list()):
                phi_psi_angles.append(angles)
                residues.append(residue)

    n_residues = len(residues)

    # Gather the CA coordinates once so distances can be computed with NumPy
    # instead of one Python-level atom subtraction per matrix entry.
    # The reshape keeps the array two-dimensional when there are no residues.
    ca_coords = np.array(
        [residue['CA'].get_coord() for residue in residues], dtype=float
    ).reshape(n_residues, 3)

    distance_matrix = np.zeros((n_residues, n_residues))
    for i in range(n_residues):
        # Row i: Euclidean distance from residue i to every residue.
        distance_matrix[i] = np.linalg.norm(ca_coords - ca_coords[i], axis=1)

    phi_psi_angles = np.array(phi_psi_angles)
    return phi_psi_angles, distance_matrix, n_residues

# Run the whole-structure analysis on the downloaded mmCIF file.
phi_psi_angles, distance_matrix, num_amino_acids = compute_angles_distance_and_amino_acids('1XYZ.cif')

# Report the size of each result rather than its full contents.
print(f'Phi/Psi angles: {phi_psi_angles.shape}')
print(f'Distance matrix: {distance_matrix.shape}')
print(f'Number of amino acids: {num_amino_acids}')


In [None]:
import nglview as nv
from Bio.PDB import *

def visualize_structure(pdb_file):
    """Build an nglview widget for the structure stored in ``pdb_file``.

    The file is read with ``MMCIFParser`` (so it is expected to be mmCIF,
    despite the parameter name). The widget's existing representations are
    cleared, then the 'protein' selection is drawn as cartoon and the
    'not protein' selection as ball-and-stick.

    Args:
        pdb_file: Path handed to ``MMCIFParser.get_structure``.

    Returns:
        The widget returned by ``nv.show_biopython``, restyled as described.
    """
    widget = nv.show_biopython(MMCIFParser().get_structure('protein', pdb_file))

    # Replace whatever the widget shows initially with the two styles below.
    widget.clear_representations()
    widget.add_cartoon('protein')
    widget.add_ball_and_stick('not protein')
    return widget

# Test the function
view = visualize_structure('1XYZ.cif')
# Leaving the widget as the last expression makes the notebook render it.
view


In [None]:
from Bio.PDB import *

def get_chains(pdb_file):
    """Map every chain id of the first model in ``pdb_file`` to its residues.

    Args:
        pdb_file: Path handed to ``MMCIFParser.get_structure`` (an mmCIF file,
            despite the parameter name).

    Returns:
        dict whose keys are the chain ids (``chain.get_id()``) and whose
        values are lists holding the objects yielded by iterating the chain,
        in iteration order.
    """
    first_model = MMCIFParser().get_structure('protein', pdb_file)[0]
    return {chain.get_id(): list(chain) for chain in first_model}

# Test the function
chains = get_chains('1XYZ.cif')

# Summarise every chain: its id, its size and its leading residues.
for chain_id, residues in chains.items():
    print(f'Chain ID: {chain_id}')
    print(f'Number of residues: {len(residues)}')
    print(f'First 5 residues: {residues[:5]}')


In [None]:
import nglview as nv
from Bio.PDB import *

def visualize_chains(pdb_file):
    """Build one nglview widget per chain of the first model in ``pdb_file``.

    Each widget gets the same styling: existing representations cleared,
    'protein' drawn as cartoon, 'not protein' drawn as ball-and-stick.

    Args:
        pdb_file: Path handed to ``MMCIFParser.get_structure`` (an mmCIF file,
            despite the parameter name).

    Returns:
        list of widgets, one per chain, in the model's iteration order.
    """
    def _render(chain):
        # Widget for a single chain with the shared styling applied.
        widget = nv.show_biopython(chain)
        widget.clear_representations()
        widget.add_cartoon('protein')
        widget.add_ball_and_stick('not protein')
        return widget

    first_model = MMCIFParser().get_structure('protein', pdb_file)[0]
    return [_render(chain) for chain in first_model]

# Test the function
chain_views = visualize_chains('1XYZ.cif')

# Display the view for each chain
# NOTE(review): `display` is not imported in this cell; presumably the
# IPython/Jupyter builtin — confirm before running outside a notebook.
for view in chain_views:
    display(view)


In [None]:
from Bio.PDB import *
import numpy as np

def calculate_distance_matrix_for_each_chain(pdb_file):
    """Compute a CA-CA distance matrix for every chain of the first model.

    Each chain is split into polypeptides with ``PPBuilder``; the residues of
    all its polypeptides, in order, index the rows and columns of that
    chain's matrix.

    Args:
        pdb_file: Path handed to ``MMCIFParser.get_structure`` (an mmCIF file,
            despite the parameter name).

    Returns:
        dict mapping chain id (``chain.get_id()``) to a float array of shape
        ``(n, n)`` whose entry ``[i, j]`` is the distance between the CA
        atoms of residues ``i`` and ``j``. A chain for which ``PPBuilder``
        finds no polypeptide gets an empty ``(0, 0)`` array.

    Raises:
        KeyError: if a collected residue has no ``'CA'`` atom.
    """
    # Parse the file and select the first model
    parser = MMCIFParser()
    structure = parser.get_structure('protein', pdb_file)
    model = structure[0]

    # A single builder is enough; it is reused for every chain.
    builder = PPBuilder()

    distance_matrices = {}

    for chain in model:
        residues = [
            residue
            for poly in builder.build_peptides(chain)
            for residue in poly
        ]
        n_residues = len(residues)

        # Gather the CA coordinates once so distances can be computed with
        # NumPy instead of one Python-level atom subtraction per entry.
        # The reshape keeps the array two-dimensional for an empty chain.
        ca_coords = np.array(
            [residue['CA'].get_coord() for residue in residues], dtype=float
        ).reshape(n_residues, 3)

        distance_matrix = np.zeros((n_residues, n_residues))
        for i in range(n_residues):
            # Row i: Euclidean distance from residue i to every residue.
            distance_matrix[i] = np.linalg.norm(ca_coords - ca_coords[i], axis=1)

        distance_matrices[chain.get_id()] = distance_matrix

    return distance_matrices

# Test the function
distance_matrices = calculate_distance_matrix_for_each_chain('1XYZ.cif')

# Print the distance matrix for each chain
# Note: the loop variable rebinds the notebook-level name `distance_matrix`;
# after the loop it refers to the matrix of the last chain iterated.
for chain_id, distance_matrix in distance_matrices.items():
    print('Chain ID:', chain_id)
    print('Distance matrix:', distance_matrix)


In [None]:
# Shows whatever `distance_matrix` currently refers to. When the cells run in
# order, that is the loop variable left over from the printing loop in the
# previous cell, i.e. the matrix of the last chain iterated.
distance_matrix

In [None]:
from Bio.PDB import *
import numpy as np

def calculate_angles_and_distance_matrix_for_each_chain(pdb_file):
    """Compute phi/psi angles and a CA-CA distance matrix for every chain.

    Only the first model of the file is used. Each chain is split into
    polypeptides with ``PPBuilder``; the residues of all its polypeptides, in
    order, index both the angle list and the distance matrix of that chain.

    Args:
        pdb_file: Path handed to ``MMCIFParser.get_structure`` (an mmCIF file,
            despite the parameter name).

    Returns:
        A tuple ``(distance_matrices, phi_psi_angles_dict)`` of two dicts
        keyed by chain id (``chain.get_id()``):

        * ``distance_matrices[id]`` - float array of shape ``(n, n)``; entry
          ``[i, j]`` is the distance between the CA atoms of residues ``i``
          and ``j``.
        * ``phi_psi_angles_dict[id]`` - list of the ``n`` per-residue
          ``(phi, psi)`` tuples returned by ``get_phi_psi_list``.
          NOTE(review): Biopython reports undefined angles as ``None`` -
          confirm before doing arithmetic on them.

    Raises:
        KeyError: if a collected residue has no ``'CA'`` atom.
    """
    # Parse the file and select the first model
    parser = MMCIFParser()
    structure = parser.get_structure('protein', pdb_file)
    model = structure[0]

    # A single builder is enough; it is reused for every chain.
    builder = PPBuilder()

    distance_matrices = {}
    phi_psi_angles_dict = {}

    for chain in model:
        residues = []
        phi_psi_angles = []

        for poly in builder.build_peptides(chain):
            # get_phi_psi_list yields one (phi, psi) tuple per residue of the
            # polypeptide, in the same order as the residues themselves.
            for residue, angles in zip(poly, poly.get_phi_psi_list()):
                residues.append(residue)
                phi_psi_angles.append(angles)

        n_residues = len(residues)

        # Gather the CA coordinates once so distances can be computed with
        # NumPy instead of one Python-level atom subtraction per entry.
        # The reshape keeps the array two-dimensional for an empty chain.
        ca_coords = np.array(
            [residue['CA'].get_coord() for residue in residues], dtype=float
        ).reshape(n_residues, 3)

        distance_matrix = np.zeros((n_residues, n_residues))
        for i in range(n_residues):
            # Row i: Euclidean distance from residue i to every residue.
            distance_matrix[i] = np.linalg.norm(ca_coords - ca_coords[i], axis=1)

        chain_id = chain.get_id()
        distance_matrices[chain_id] = distance_matrix
        phi_psi_angles_dict[chain_id] = phi_psi_angles

    return distance_matrices, phi_psi_angles_dict

# Test the function
distance_matrices, phi_psi_angles_dict = calculate_angles_and_distance_matrix_for_each_chain('1XYZ.cif')

# Both dictionaries share the same chain ids, so one pass over the matrix
# dictionary is enough to report the two results side by side.
for chain_id in distance_matrices:
    print(f'Chain ID: {chain_id}')
    print(f'Distance matrix: {distance_matrices[chain_id]}')
    print(f'Phi/Psi angles: {phi_psi_angles_dict[chain_id]}')


In [None]:
# Show the per-chain phi/psi dictionary.
phi_psi_angles_dict