In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import concurrent.futures
from google.colab import files



In [38]:
df = pd.read_csv('5000_final.csv')

In [39]:
# List of columns to keep (without sd columns)
columns_to_keep = [col for col in df.columns if '_sd' not in col]
new_df = df[columns_to_keep]
print(new_df.head())

  pdbid  binding_affinity (kcal/mol)  electrostatic (kcal/mol)  \
0  6g3f                     -0.25570                  -0.98408   
1  5fpd                     -2.61456                  -1.01432   
2  6hxe                     -0.98558                  -8.41628   
3  5nqb                     -1.05920                 -36.85828   
4  4q3f                     -1.52816                  -5.36862   

   polar_solvation (kcal/mol)  non_polar_solvation (kcal/mol)  vdW (kcal/mol)  
0                    -0.34568                         1.26248        -0.18842  
1                    -0.56942                         1.62432        -2.65510  
2                    -0.81082                         7.90220         0.33930  
3                    -0.46456                        34.03790         2.22572  
4                    -0.74246                         5.44818        -0.86530  


In [41]:
def get_protein_info(pdb_id):
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    response = requests.get(url)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: Unable to retrieve data for PDB ID {pdb_id}.")
        return None


In [42]:
def get_molecular_weight(protein_data):
    return protein_data['rcsb_entry_info']['molecular_weight']


In [43]:
def get_entity_count(protein_data):
    return protein_data['rcsb_entry_info']['entity_count']


In [44]:
def get_unit_cell_dimensions(protein_data):
    dimension = {}
    dimension['l1'] = protein_data['cell']['length_a']
    dimension['l2'] = protein_data['cell']['length_b']
    dimension['l3'] = protein_data['cell']['length_c']
    return dimension


In [45]:
# Iterate over each row in the DataFrame
def process_row(row, index):
    pdb_id = row['pdbid']

    # Get protein information
    protein_info = get_protein_info(pdb_id)

    if protein_info:

        # making a new row without the standard deviation columns
        columns_to_keep = [col for col in df.columns if '_sd' not in col]
        new_row = row[columns_to_keep]


        # adding dimension, weight and entity_count of the protein to the row
        dimension = get_unit_cell_dimensions(protein_info)
        new_row['weight'] = get_molecular_weight(protein_info)
        new_row['entity_count'] = get_entity_count(protein_info)
        new_row['l1'] = dimension['l1']
        new_row['l2'] = dimension['l2']
        new_row['l3'] = dimension['l3']



        # print(index, row)
        return new_row



In [51]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit tasks for each row
    futures = [executor.submit(process_row, row, index) for index, row in df.iterrows()]

    # Wait for all tasks to complete and get the updated rows
    updated_rows = [future.result() for future in concurrent.futures.as_completed(futures)]



# Print or use the updated DataFrame


Error: Unable to retrieve data for PDB ID 1h6w.
Error: Unable to retrieve data for PDB ID 1h6w.


In [52]:
filtered_updated_rows = []
for row in updated_rows:
    if row is None:
        continue
    filtered_updated_rows.append(row)

updated_df = pd.DataFrame(filtered_updated_rows)

In [53]:
print(updated_df)
updated_df.to_csv('df.csv')
files.download('df.csv')

     pdbid  binding_affinity (kcal/mol)  electrostatic (kcal/mol)  \
0     6g3f                     -0.25570                  -0.98408   
4     4q3f                     -1.52816                  -5.36862   
2     6hxe                     -0.98558                  -8.41628   
5     1gyy                     -1.78920                   3.83432   
1     5fpd                     -2.61456                  -1.01432   
...    ...                          ...                       ...   
4996  6gvx                    -39.51860                   0.00000   
4995  4qyg                    -37.02860                 -35.92384   
4997  5mob                    -30.31542                   0.00000   
4998  3l9h                    -55.70448                   0.00000   
4999  3dfr                    -51.58218                   0.00000   

      polar_solvation (kcal/mol)  non_polar_solvation (kcal/mol)  \
0                       -0.34568                         1.26248   
4                       -0.74246   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>