This notebook extracts the deposition date of each of our PDB structures so that we can select only proteins whose structures were deposited after AlphaFold2's training-set cutoff.

In [4]:
import pandas as pd
import datetime
import numpy as np
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from os.path import join
from multiprocessing import Pool

In [5]:
# Some helper functions
def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    Slices are produced lazily and in order by stepping through ``l`` in
    strides of ``n``. The last slice is shorter whenever ``len(l)`` is not a
    multiple of ``n``, and an empty ``l`` yields nothing.
    """
    # Start offsets are 0, n, 2n, ... up to (but excluding) len(l)
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

def get_structure_dict(fn, path):
    """Load one .cif file into a Biopython ``MMCIF2Dict``.

    Args:
        fn: File name of the .cif file, resolved relative to ``path`` (it may
            itself include a sub-directory component).
        path: Directory that ``fn`` is joined onto.

    Returns:
        The ``MMCIF2Dict`` built from the file at ``join(path, fn)``.
    """
    # MMCIF2Dict is constructed directly from the joined file location
    return MMCIF2Dict(join(path, fn))

def add_date(row, path):
    """Attach the initial deposition date of a row's structure file.

    Args:
        row: Record (a DataFrame row when used with ``DataFrame.apply``)
            providing ``'gt_fn'`` and ``'uniprot'`` entries.
        path: Root directory; the structure file is looked up at
            ``join(path, join(uniprot, gt_fn))``.

    Returns:
        The same ``row`` object, with its ``'date'`` entry set to the first
        ``_pdbx_database_status.recvd_initial_deposition_date`` value read
        from the file.
    """
    # The file name is combined with the UniProt ID, which acts as a folder name
    structure_file, accession = row['gt_fn'], row['uniprot']
    relative_fn = join(accession, structure_file)

    # Parse the .cif file and pull out the list of deposition-date values
    cif_fields = get_structure_dict(relative_fn, path)
    deposited = cif_fields['_pdbx_database_status.recvd_initial_deposition_date']

    # Only the first value is stored on the row
    row['date'] = deposited[0]
    return row

def create_chunk_df_and_add_dates(df, path, chunk):
    """Subset ``df`` to one chunk of proteins and add a date to every row.

    Args:
        df: DataFrame with a ``'uniprot'`` column plus the columns that
            ``add_date`` reads from each row.
        path: Root directory of the structure files, forwarded to ``add_date``.
        chunk: Collection of UniProt IDs; only rows whose ``'uniprot'`` value
            is contained in it are kept.

    Returns:
        The row-wise result of applying ``add_date`` to the selected rows,
        which are re-indexed from 0 before the dates are added.
    """
    # Keep only the rows that belong to this chunk and renumber them
    in_chunk = df['uniprot'].isin(chunk)
    selected = df[in_chunk].reset_index(drop=True)

    # Look up the deposition date row by row (add_date parses one file per row)
    return selected.apply(add_date, axis=1, args=(path,))

In [6]:
# Root directory of the .cif files; add_date looks inside a per-UniProt sub-folder of it
path = './project_pipeline/data/input/RCSB_cif'

# Tab-separated per-structure tables for the two protein sets
ai = pd.read_table('./project_pipeline/data/proteins_by_pdb.tsv')
md = pd.read_table('./project_pipeline/data/md_pdbs.tsv')

# Distinct UniProt IDs in each table; these are what gets split into chunks
ai_prots, md_prots = ai['uniprot'].unique(), md['uniprot'].unique()

In [7]:
# Split the UniProt IDs from `ai` into groups of 10, one pool task per group
ai_prot_chunks = [*divide_chunks(ai_prots, 10)]
ai_tasks = [(ai, path, chunk) for chunk in ai_prot_chunks]

# Each task filters `ai` down to its chunk and adds the dates for those rows
with Pool() as pool:
    ai_results = pool.starmap(create_chunk_df_and_add_dates, ai_tasks)

# Stitch the per-chunk frames back together under a fresh 0..n-1 index
ai_dates = pd.concat(ai_results, ignore_index=True)

In [8]:
# Same procedure for the MD proteins: chunk the UniProt IDs in groups of 10,
# add dates chunk by chunk in a worker pool, then recombine the results
md_prot_chunks = [*divide_chunks(md_prots, 10)]
md_tasks = [(md, path, chunk) for chunk in md_prot_chunks]

with Pool() as pool:
    md_results = pool.starmap(create_chunk_df_and_add_dates, md_tasks)

# Combined frame gets a fresh 0..n-1 index
md_dates = pd.concat(md_results, ignore_index=True)

In [9]:
# Preview the first rows to check that the 'date' column was added
ai_dates.head()

Unnamed: 0,gene_name,uniprot,protein_length,region_1,region_2,pdb,chain,af_filename,gt_fn,date
0,Nos1,P29476,1429.0,815-870,757-949,1b8q,A,F-P29476-F1-model_v3.cif,1b8q.cif,1999-02-01
1,Nos1,P29476,1429.0,815-870,757-949,1cmi,C,F-P29476-F1-model_v3.cif,1cmi.cif,1999-05-06
2,Nos1,P29476,1429.0,815-870,757-949,1f20,A,F-P29476-F1-model_v3.cif,1f20.cif,2000-05-22
3,Nos1,P29476,1429.0,815-870,757-949,1k2r,A,F-P29476-F1-model_v3.cif,1k2r.cif,2001-09-28
4,Nos1,P29476,1429.0,815-870,757-949,1k2s,A,F-P29476-F1-model_v3.cif,1k2s.cif,2001-09-28


In [10]:
# Work on deep copies so the pool results (ai_dates / md_dates) keep their
# original string dates when the 'date' column is converted later on
ai2 = ai_dates.copy(deep=True)
md2 = md_dates.copy(deep=True)

In [11]:
# Parse the date strings into pandas datetimes, in place on each working copy,
# so they can be compared against a cutoff date
for dated_table in (ai2, md2):
    dated_table['date'] = pd.to_datetime(dated_table['date'])

In [12]:
# The maximum release date for AlphaFold2's training data was April 30, 2018.
# The 'date' column holds each structure's initial deposition date, so this
# keeps only the structures deposited strictly after that cutoff.
af2_cutoff = datetime.datetime(2018, 4, 30)
ai_after = ai2.loc[ai2['date'] > af2_cutoff]
md_after = md2.loc[md2['date'] > af2_cutoff]

In [13]:
# Write the dated tables as tab-separated files, without the index column.
# NOTE(review): the full ai2 / md2 tables are saved here, not the filtered
# ai_after / md_after subsets — confirm this is intended.
for dated_table, out_file in (
    (ai2, './project_pipeline/data/ai_dates.tsv'),
    (md2, './project_pipeline/data/md_dates.tsv'),
):
    dated_table.to_csv(out_file, sep='\t', index=False)