In [1]:
from Bio.SCOP import Scop
import pandas as pd

In [2]:
scop = Scop(cla_handle=open('data/SCOPe/dir.cla.scope.2.08-stable.txt'),
            des_handle=open('data/SCOPe/dir.des.scope.2.08-stable.txt'),
            hie_handle=open('data/SCOPe/dir.hie.scope.2.08-stable.txt'))
scop

<Bio.SCOP.Scop at 0x108968590>

In [3]:
domains = scop.getDomains()
sid_ = []
sccs_ = []
des_ = []
pdb_id_ = []

for domain in domains:
    sid_.append(domain.sid)
    sccs_.append(domain.sccs)
    des_.append(domain.description)
    pdb_id_.append(domain.description.split()[0])

df = pd.DataFrame({'sid': sid_, 'sccs': sccs_, 'description': des_, 'pdb_id': pdb_id_})
df

Unnamed: 0,sid,sccs,description,pdb_id
0,d1ux8a_,a.1.1.1,1ux8 A:,1ux8
1,d1dlwa_,a.1.1.1,1dlw A:,1dlw
2,d1uvya_,a.1.1.1,1uvy A:,1uvy
3,d1dlya_,a.1.1.1,1dly A:,1dly
4,d1uvxa_,a.1.1.1,1uvx A:,1uvx
...,...,...,...,...
344846,d2ehoc3,l.1.1.1,2eho C:0-0,2eho
344847,d2ehof3,l.1.1.1,2eho F:0-0,2eho
344848,d2ehog3,l.1.1.1,2eho G:0-0,2eho
344849,d2ehok3,l.1.1.1,2eho K:0-0,2eho


In [4]:
df.sccs.nunique()

5431

In [4]:
df[df.sccs == "a.1.1.2"]

Unnamed: 0,sid,sccs,description,pdb_id
70,d1asha_,a.1.1.2,1ash A:,1ash
71,d1vhba_,a.1.1.2,1vhb A:,1vhb
72,d1vhbb_,a.1.1.2,1vhb B:,1vhb
73,d4vhba_,a.1.1.2,4vhb A:,4vhb
74,d4vhbb_,a.1.1.2,4vhb B:,4vhb
...,...,...,...,...
2792,d3bj1c_,a.1.1.2,3bj1 C:,3bj1
2793,d3bj2a_,a.1.1.2,3bj2 A:,3bj2
2794,d3bj2c_,a.1.1.2,3bj2 C:,3bj2
2795,d3bj3a_,a.1.1.2,3bj3 A:,3bj3


In [5]:
# Parse the SCOPe hierarchy for dev and small dataset
df['superfamily'] = df.sccs.apply(lambda x: ".".join(x.split(".")[:3]))
df["fold"] = df.sccs.apply(lambda x: ".".join(x.split(".")[:2]))
df["class"] = df.sccs.apply(lambda x: ".".join(x.split(".")[:1]))

df

Unnamed: 0,sid,sccs,description,pdb_id,superfamily,fold,class
0,d1ux8a_,a.1.1.1,1ux8 A:,1ux8,a.1.1,a.1,a
1,d1dlwa_,a.1.1.1,1dlw A:,1dlw,a.1.1,a.1,a
2,d1uvya_,a.1.1.1,1uvy A:,1uvy,a.1.1,a.1,a
3,d1dlya_,a.1.1.1,1dly A:,1dly,a.1.1,a.1,a
4,d1uvxa_,a.1.1.1,1uvx A:,1uvx,a.1.1,a.1,a
...,...,...,...,...,...,...,...
344846,d2ehoc3,l.1.1.1,2eho C:0-0,2eho,l.1.1,l.1,l
344847,d2ehof3,l.1.1.1,2eho F:0-0,2eho,l.1.1,l.1,l
344848,d2ehog3,l.1.1.1,2eho G:0-0,2eho,l.1.1,l.1,l
344849,d2ehok3,l.1.1.1,2eho K:0-0,2eho,l.1.1,l.1,l


In [7]:
df.to_csv("./SCOPe.csv", index=False)

In [16]:
# dev dataset (10 random sample from each class 120 total)
dev = df.groupby("class").apply(lambda x: x.sample(10)).reset_index(drop=True)
dev

  dev = df.groupby("class").apply(lambda x: x.sample(10)).reset_index(drop=True)


Unnamed: 0,sid,sccs,description,pdb_id,superfamily,fold,class
0,d2lh3a_,a.1.1.2,2lh3 A:,2lh3,a.1.1,a.1,a
1,d1mi1b1,a.169.1.1,1mi1 B:2249-2553,1mi1,a.169.1,a.169,a
2,d2p5va1,a.4.5.0,2p5v A:3-65,2p5v,a.4.5,a.4,a
3,d3re2a2,a.118.8.1,3re2 A:206-475,3re2,a.118.8,a.118,a
4,d1yoha_,a.1.1.2,1yoh A:,1yoh,a.1.1,a.1,a
...,...,...,...,...,...,...,...
115,d2yhof2,l.1.1.1,2yho F:-1-0,2yho,l.1.1,l.1,l
116,d4drma2,l.1.1.1,4drm A:13-15,4drm,l.1.1,l.1,l
117,d6qoib2,l.1.1.1,6qoi B:0-0,6qoi,l.1.1,l.1,l
118,d2awnd3,l.1.1.1,2awn D:372-372,2awn,l.1.1,l.1,l


In [25]:
print(dev.pdb_id.nunique())
dev.pdb_id.to_csv("data/dev/pdb_ids.txt", index=False, header=None)

120


In [22]:
# small dataset (1 random sample from each fold, 1485 total)
small = df.groupby("fold").apply(lambda x: x.sample(1)).reset_index(drop=True)
small

  small = df.groupby("fold").apply(lambda x: x.sample(1)).reset_index(drop=True)


Unnamed: 0,sid,sccs,description,pdb_id,superfamily,fold,class
0,d5v5qb_,a.1.1.2,5v5q B:,5v5q,a.1.1,a.1,a
1,d1erca_,a.10.1.1,1erc A:,1erc,a.10.1,a.10,a
2,d1vpda1,a.100.1.1,1vpd A:164-296,1vpd,a.100.1,a.100,a
3,d2ejnb2,a.101.1.1,2ejn B:73-144,2ejn,a.101.1,a.101,a
4,d1gszc2,a.102.4.2,1gsz C:37-307,1gsz,a.102.4,a.102,a
...,...,...,...,...,...,...,...
1480,d1u0ia_,k.6.1.1,1u0i A:,1u0i,k.6.1,k.6,k
1481,d1bb1c_,k.7.1.1,1bb1 C:,1bb1,k.7.1,k.7,k
1482,d1jm0a_,k.8.1.1,1jm0 A:,1jm0,k.8.1,k.8,k
1483,d1g6ub_,k.9.1.1,1g6u B:,1g6u,k.9.1,k.9,k


In [26]:
print(small.pdb_id.nunique())
small.pdb_id.drop_duplicates().to_csv("data/small/pdb_ids.txt", index=False, header=None)

1450
