## Extract data from AlphaFold structure database

AlphaFold stores the pLDDT (predicted local distance difference test) scores in the B-factor field of the PDB file. This script extract the plddt ans sse annotation from a folder of AF structures. 
 

In [1]:
import pandas as pd 
import os
import re
import numpy as np
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
from biotite.structure.io.pdb import * 

In [2]:
FOLDER_CIF = "PATH_TO/AlphaFold_predictions/human/uncompressed/"
OUTPUT = "PATH_TO_OUTPUT"

In [4]:
result_dict = {}

for filename in os.listdir(FOLDER_CIF):
    f = os.path.join(FOLDER_CIF, filename)

    if os.path.isfile(f):
        try:
            id = re.split("-", filename)[1]
    
            #if id in uni_ids: # add this to limit to the proteins you are interested in 
            inner_dict = {}
            #print(f"Processing ID: {id}") 
            
            try:
                cif_file = pdbx.CIFFile.read(f) 
                array = pdbx.get_structure(cif_file, model = 1) # , extra_fields = ["b_factor"] 

                plddt = cif_file.block["ma_qa_metric_local"]["metric_value"].as_array(
                            dtype=np.float32
                        )
                
                sse = struc.annotate_sse(array)

                inner_dict['sse'] = sse
                inner_dict['plddt'] = plddt
                inner_dict['sequence'] = pdbx.get_sequence(cif_file)
                    
            except Exception as e:
                print(f"Error reading CIF file for {filename}: {e}")
                #continue

            result_dict[id] = inner_dict
                
    
        except Exception as e:
            print(f"Error reading CIF file for {filename}: {e}")


Error reading CIF file for ._AF-A0A0A6YYL3-F1-model_v1.cif: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte
Error reading CIF file for ._AF-Q99750-F1-model_v1.cif: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte
Error reading CIF file for ._AF-A0A0A0MT87-F1-model_v1.cif: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte
Error reading CIF file for ._AF-A0A0A0MT99-F1-model_v1.cif: 'utf-8' codec can't decode byte 0xb0 in position 37: invalid start byte


In [8]:
flattened_data = []
for outer_key, inner_dict in result_dict.items():
    row = {'outer_key': outer_key}
    row.update(inner_dict)  # Add inner_key-value pairs as columns
    flattened_data.append(row)

df = pd.DataFrame(flattened_data)

# Convert lists to strings, handling NaN or non-list values
df["sse"] = df["sse"].apply(
    lambda x: ','.join(map(str, x)) if isinstance(x, list) else ''  # Handle non-list values
)
df["plddt"] = df["plddt"].apply(
    lambda x: ','.join(map(str, x)) if isinstance(x, list) else ''
)

df.to_csv(OUTPUT, index=False, sep=';')

In [6]:
df

Unnamed: 0,outer_key,sse,plddt,sequence
0,Q5JPI3,"[c, c, c, c, c, c, c, a, a, a, a, a, a, a, a, ...","[31.76, 31.2, 37.05, 52.62, 67.7, 78.74, 83.39...","{'A': ['M', 'E', 'M', 'S', 'G', 'L', 'S', 'F',..."
1,O75445,"[c, c, c, c, c, b, b, b, b, b, b, b, b, b, c, ...","[55.08, 64.11, 59.79, 64.52, 62.16, 68.95, 76....","{'A': ['Q', 'Y', 'S', 'D', 'G', 'K', 'W', 'H',..."
2,O14522,"[c, a, a, a, a, a, a, a, a, a, a, a, a, c, c, ...","[42.67, 39.45, 41.62, 45.65, 47.51, 47.82, 46....","{'A': ['M', 'A', 'S', 'L', 'A', 'A', 'L', 'A',..."
3,Q6L9W6,"[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...","[37.78, 37.6, 36.62, 47.34, 35.93, 37.75, 36.0...","{'A': ['M', 'G', 'S', 'P', 'R', 'A', 'A', 'R',..."
4,Q5T5B0,"[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...","[59.28, 68.4, 68.15, 69.96, 64.11, 64.5, 61.58...","{'A': ['M', 'S', 'C', 'Q', 'Q', 'N', 'Q', 'K',..."
...,...,...,...,...
20499,Q8IWU2,"[c, c, c, c, a, a, a, a, a, a, a, a, a, a, a, ...","[37.16, 44.52, 43.18, 53.68, 54.5, 61.06, 64.7...","{'A': ['M', 'P', 'G', 'P', 'P', 'A', 'L', 'R',..."
20500,P37235,"[c, c, c, c, c, c, c, c, c, a, a, a, a, a, a, ...","[38.46, 41.51, 49.03, 57.31, 60.93, 70.59, 83....","{'A': ['M', 'G', 'K', 'Q', 'N', 'S', 'K', 'L',..."
20501,P41587,"[c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, ...","[43.24, 39.09, 41.39, 46.26, 46.1, 43.38, 42.8...","{'A': ['M', 'R', 'T', 'L', 'L', 'P', 'P', 'A',..."
20502,Q9UHV9,"[c, b, b, b, b, b, b, b, b, b, b, b, b, b, c, ...","[46.6, 54.67, 43.66, 60.74, 48.02, 45.83, 52.7...","{'A': ['M', 'A', 'E', 'N', 'S', 'G', 'R', 'A',..."
