Import the required classes and create a PyCom object:

In [1]:
from pycom import PyCom, ProteinParams

# Create the PyCom handle, pointing it at the database file and the matrix file.
pyc = PyCom(
    db_path='~/docs/pycom.db',
    mat_path='~/docs/pycom.mat',
)

Query the database by passing a dictionary of conditions:

In [2]:
# Conditions are keyed by ProteinParams members and handed to find() as one dict.
conditions = {
    ProteinParams.ENZYME: '3.*.*.*',
    ProteinParams.DISEASE: 'cancer',  # string search, case-insensitive
}
entries = pyc.find(conditions)

entries

Unnamed: 0,uniprot_id,neff,sequence_length,organism_id,helix_frac,turn_frac,strand_frac,has_ptm,has_pdb,has_substrate,sequence,matrix
0,P01111,12.817,189,9606,0.349206,0.015873,0.227513,1,1,1,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,
1,P01112,12.841,189,9606,0.31746,0.031746,0.359788,1,1,1,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,
2,P01116,12.626,189,9606,0.375661,0.031746,0.328042,1,1,1,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,
3,P62070,12.754,204,9606,0.29902,0.019608,0.220588,1,1,1,MAAAGWRDGSGQEKYRLVVVGGGGVGKSALTIQFIQSYFVTDYDPT...,
4,Q9UNW1,9.554,487,9606,0.0,0.0,0.0,0,0,1,MLRAPGCLLRTSVAPAAALAAALLSSLARCSLLEPRDPVASSLSPY...,


Alternatively, query the database by passing keyword arguments:

In [3]:
# Same lookup mechanism, expressed as keyword arguments rather than a dict.
# The cofactor value is matched as a case-insensitive string search.
entries = pyc.find(cofactor='FAD', has_ptm=True, has_disease=True)

entries

Unnamed: 0,uniprot_id,neff,sequence_length,organism_id,helix_frac,turn_frac,strand_frac,has_ptm,has_pdb,has_substrate,sequence,matrix
0,P11310,9.93,421,9606,0.517815,0.016627,0.180523,1,1,1,MAAGFGRCCRVLRSISRFHWRSQHTKANRQREPGLGFSFEFTEQQK...,
1,Q658P3,9.677,488,9606,0.157787,0.0,0.086066,1,1,0,MPEEMDKPLISLHLVDSDSSLAKVPDEAPKVGILGSGDFARSLATR...,
2,Q16795,10.997,377,9606,0.363395,0.037135,0.124668,1,1,0,MAAAAQSRVVRVLSMSRSAITAIATSVCHGPPCRQLHHALMPHGKG...,
3,O95299,9.244,355,9606,0.0,0.0,0.0,1,1,0,MALRLLKLAATSASARVVAAGAQRVRGIHSSVQCKLRYGMWHFLLG...,
4,P13804,8.627,333,9606,0.3003,0.027027,0.333333,1,1,0,MFRAAAPGQLRRAASLLRFQSTLVIAEHANDSLAPITLNTITAATR...,


Get the lists of available cofactors and diseases:

In [4]:
# Fetch both lookup tables; only the cofactor table is displayed by this cell.
cofactors, diseases = pyc.get_cofactor_list(), pyc.get_disease_list()

cofactors

Unnamed: 0,cofactorId,cofactorName
0,CHEBI:597326,pyridoxal 5'-phosphate
1,CHEBI:18420,Mg(2+)
2,CHEBI:60240,a divalent metal cation
3,CHEBI:30413,heme
4,CHEBI:29105,Zn(2+)
...,...,...
109,CHEBI:61721,chlorophyll b
110,CHEBI:73095,divinyl chlorophyll a
111,CHEBI:73096,divinyl chlorophyll b
112,CHEBI:57453,"(6S)-5,6,7,8-tetrahydrofolate"


Make a large query, then paginate the results:

In [5]:
# Length-bounded search: both a lower (5) and an upper (20) bound are applied.
# NOTE(review): the message below mentions only the upper bound.
entries = pyc.find(min_length=5, max_length=20)
n_found = len(entries)
print(f'Found {n_found} entries with length <= 20')

# Take the first page of the result set (default page size: 100 entries).
page = pyc.paginate(entries, page=1)
n_on_page = len(page)
print(f'Found {n_on_page} entries on page 1')

Found 2958 entries with length <= 20
Found 100 entries on page 1


Load the coevolution matrices for a dataframe of entries:

In [7]:
# Load the matrices for the entries on this page.
# NOTE(review): the return value is unused, so this presumably fills `page` in place — confirm.
pyc.load_matrices(page)

# Display the matrix attached to the first entry of the page.
first_entry = page.iloc[0]
first_entry.matrix

array([[0.00000000e+00, 2.16066837e-07, 1.56462193e-07, 0.00000000e+00,
        0.00000000e+00],
       [2.16066837e-07, 0.00000000e+00, 4.61935997e-07, 4.54485416e-07,
        4.54485416e-07],
       [1.56462193e-07, 4.61935997e-07, 0.00000000e+00, 2.98023224e-07,
        2.98023224e-07],
       [0.00000000e+00, 4.54485416e-07, 2.98023224e-07, 0.00000000e+00,
        2.23517418e-07],
       [0.00000000e+00, 4.54485416e-07, 2.98023224e-07, 2.23517418e-07,
        0.00000000e+00]])