# Day 1 - Kata

## Parsing Atom Data

The goal is to parse atomic data from a plain-text file in PDB format:

```
ATOM      1  N   ILE A  16      16.792  12.871   4.991  1.00  3.00
ATOM      2  CA  ILE A  16      17.415  14.067   5.486  1.00  3.00
ATOM      3  C   ILE A  16      17.370  15.207   4.480  1.00 11.29
ATOM      4  O   ILE A  16      18.040  15.120   3.445  1.00  3.00
ATOM      5  CB  ILE A  16      18.878  13.807   5.993  1.00  3.00
ATOM      6  CG1 ILE A  16      18.976  12.643   6.989  1.00  3.97
ATOM      7  CG2 ILE A  16      19.566  15.056   6.546  1.00  3.00
ATOM      8  CD1 ILE A  16      18.413  13.065   8.374  1.00  3.00
```
(Atom records of the first residue in the [1ppe.pdb](1ppe.pdb) file)


### 1. Parsing chains

In [1]:
# Gather the chain identifier of every ATOM record, one entry per atom.
# The chain ID is the single character found at string index 21 of the line.
with open('1ppe.pdb') as pdb_file:
    chains = [
        record[21]
        for record in pdb_file
        if record.startswith('ATOM  ')
    ]

print(chains)

['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',

Can we get a unique set of chain IDs?

In [2]:
# Read the chain identifier (string index 21) of every ATOM record into a
# temporary list, then collapse that list into a set so that each chain is
# reported only once.
with open('1ppe.pdb') as pdb_file:
    per_atom_ids = [
        record[21]
        for record in pdb_file
        if record.startswith('ATOM  ')
    ]

chains = set(per_atom_ids)
print(chains)

{'A', 'B'}


Saving some memory:

In [3]:
# Keep each chain identifier only the first time it is seen, so the list
# never holds duplicates and preserves the order of first appearance.
chains = []

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        is_atom = record.startswith('ATOM  ')
        # record[21] is the chain identifier of an ATOM line.
        if is_atom and record[21] not in chains:
            chains.append(record[21])

print(chains)

['A', 'B']


### 2. Adding residues

In [4]:
# Map every chain identifier found in the ATOM records to an (initially
# empty) list that later steps fill with residues.
chains = {}

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        if record.startswith('ATOM  '):
            # setdefault only creates the list the first time a chain is seen.
            chains.setdefault(record[21], [])

print(chains)

{'A': [], 'B': []}


In [8]:
# Build, for every chain, the ordered list of residue names it contains.
chains = {}

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        if not record.startswith('ATOM  '):
            continue

        # List of residue names for this record's chain (created on first use).
        residues = chains.setdefault(record[21], [])

        # Residue name: characters 17..20 of the line, whitespace-stripped.
        residue_name = record[17:21].strip()

        # All atoms of a residue appear on consecutive lines, so a name is
        # stored only when it differs from the last one stored.
        # NOTE: because only the name is compared, two consecutive residues
        # that share the same name end up as a single entry.
        if not residues or residues[-1] != residue_name:
            residues.append(residue_name)
print(chains)

{'A': ['ILE', 'VAL', 'GLY', 'TYR', 'THR', 'CYS', 'GLY', 'ALA', 'ASN', 'THR', 'VAL', 'PRO', 'TYR', 'GLN', 'VAL', 'SER', 'LEU', 'ASN', 'SER', 'GLY', 'TYR', 'HIS', 'PHE', 'CYS', 'GLY', 'SER', 'LEU', 'ILE', 'ASN', 'SER', 'GLN', 'TRP', 'VAL', 'SER', 'ALA', 'HIS', 'CYS', 'TYR', 'LYS', 'SER', 'GLY', 'ILE', 'GLN', 'VAL', 'ARG', 'LEU', 'GLY', 'GLU', 'ASP', 'ASN', 'ILE', 'ASN', 'VAL', 'GLU', 'GLY', 'ASN', 'GLU', 'GLN', 'PHE', 'ILE', 'SER', 'ALA', 'SER', 'LYS', 'SER', 'ILE', 'VAL', 'HIS', 'PRO', 'SER', 'TYR', 'ASN', 'SER', 'ASN', 'THR', 'LEU', 'ASN', 'ASP', 'ILE', 'MET', 'LEU', 'ILE', 'LYS', 'LEU', 'LYS', 'SER', 'ALA', 'SER', 'LEU', 'ASN', 'SER', 'ARG', 'VAL', 'ALA', 'SER', 'ILE', 'SER', 'LEU', 'PRO', 'THR', 'SER', 'CYS', 'ALA', 'SER', 'ALA', 'GLY', 'THR', 'GLN', 'CYS', 'LEU', 'ILE', 'SER', 'GLY', 'TRP', 'GLY', 'ASN', 'THR', 'LYS', 'SER', 'GLY', 'THR', 'SER', 'TYR', 'PRO', 'ASP', 'VAL', 'LEU', 'LYS', 'CYS', 'LEU', 'LYS', 'ALA', 'PRO', 'ILE', 'LEU', 'SER', 'ASP', 'SER', 'CYS', 'LYS', 'SER', 'ALA',

In [9]:
# Build, for every chain, the ordered list of residues labelled as
# "<name>.<number>".
chains = {}

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        if not record.startswith('ATOM  '):
            continue

        # List of residue labels for this record's chain (created on first use).
        residues = chains.setdefault(record[21], [])

        # Combining name and number keeps consecutive residues that share a
        # name apart, as long as their numbers differ.
        residue_name = record[17:21].strip()
        residue_number = int(record[22:26])
        residue = f"{residue_name}.{residue_number}"

        # Store the label only once per run of consecutive atom lines.
        if not residues or residues[-1] != residue:
            residues.append(residue)
print(chains)

{'A': ['ILE.16', 'VAL.17', 'GLY.18', 'GLY.19', 'TYR.20', 'THR.21', 'CYS.22', 'GLY.23', 'ALA.24', 'ASN.25', 'THR.26', 'VAL.27', 'PRO.28', 'TYR.29', 'GLN.30', 'VAL.31', 'SER.32', 'LEU.33', 'ASN.34', 'SER.37', 'GLY.38', 'TYR.39', 'HIS.40', 'PHE.41', 'CYS.42', 'GLY.43', 'GLY.44', 'SER.45', 'LEU.46', 'ILE.47', 'ASN.48', 'SER.49', 'GLN.50', 'TRP.51', 'VAL.52', 'VAL.53', 'SER.54', 'ALA.55', 'ALA.56', 'HIS.57', 'CYS.58', 'TYR.59', 'LYS.60', 'SER.61', 'GLY.62', 'ILE.63', 'GLN.64', 'VAL.65', 'ARG.66', 'LEU.67', 'GLY.69', 'GLU.70', 'ASP.71', 'ASN.72', 'ILE.73', 'ASN.74', 'VAL.75', 'VAL.76', 'GLU.77', 'GLY.78', 'ASN.79', 'GLU.80', 'GLN.81', 'PHE.82', 'ILE.83', 'SER.84', 'ALA.85', 'SER.86', 'LYS.87', 'SER.88', 'ILE.89', 'VAL.90', 'HIS.91', 'PRO.92', 'SER.93', 'TYR.94', 'ASN.95', 'SER.96', 'ASN.97', 'THR.98', 'LEU.99', 'ASN.100', 'ASN.101', 'ASP.102', 'ILE.103', 'MET.104', 'LEU.105', 'ILE.106', 'LYS.107', 'LEU.108', 'LYS.109', 'SER.110', 'ALA.111', 'ALA.112', 'SER.113', 'LEU.114', 'ASN.115', 'SER.11

### 3. Adding atomic data to residues

In [11]:
def new_atom():
    """Return a fresh atom record with empty/zero placeholder values.

    The result is a new dict on every call with the keys 'name', 'number',
    'x', 'y' and 'z' (in that order).
    """
    blank_atom = {
        'name': '',
        'number': 0,
        'x': 0.0,
        'y': 0.0,
        'z': 0.0,
    }
    return blank_atom


# Parse chains and residue labels as before and, additionally, decode and
# print the data of every atom.
chains = {}

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        if not record.startswith('ATOM  '):
            continue

        # List of residue labels for this record's chain (created on first use).
        residues = chains.setdefault(record[21], [])

        # Residue label "<name>.<number>", stored once per run of atom lines.
        residue_name = record[17:21].strip()
        residue_number = int(record[22:26])
        residue = f"{residue_name}.{residue_number}"
        if not residues or residues[-1] != residue:
            residues.append(residue)

        # Atom fields are read from fixed character ranges of the line.
        atom = new_atom()
        atom.update({
            'name': record[12:16].strip(),
            'number': int(record[6:11]),
            'x': float(record[30:38]),
            'y': float(record[38:46]),
            'z': float(record[46:54]),
        })
        print(atom)
print(chains)

{'name': 'N', 'number': 1, 'x': 16.792, 'y': 12.871, 'z': 4.991}
{'name': 'CA', 'number': 2, 'x': 17.415, 'y': 14.067, 'z': 5.486}
{'name': 'C', 'number': 3, 'x': 17.37, 'y': 15.207, 'z': 4.48}
{'name': 'O', 'number': 4, 'x': 18.04, 'y': 15.12, 'z': 3.445}
{'name': 'CB', 'number': 5, 'x': 18.878, 'y': 13.807, 'z': 5.993}
{'name': 'CG1', 'number': 6, 'x': 18.976, 'y': 12.643, 'z': 6.989}
{'name': 'CG2', 'number': 7, 'x': 19.566, 'y': 15.056, 'z': 6.546}
{'name': 'CD1', 'number': 8, 'x': 18.413, 'y': 13.065, 'z': 8.374}
{'name': 'N', 'number': 9, 'x': 16.723, 'y': 16.321, 'z': 4.866}
{'name': 'CA', 'number': 10, 'x': 16.732, 'y': 17.608, 'z': 4.076}
{'name': 'C', 'number': 11, 'x': 17.82, 'y': 18.575, 'z': 4.601}
{'name': 'O', 'number': 12, 'x': 17.908, 'y': 18.931, 'z': 5.774}
{'name': 'CB', 'number': 13, 'x': 15.333, 'y': 18.307, 'z': 4.062}
{'name': 'CG1', 'number': 14, 'x': 15.4, 'y': 19.511, 'z': 3.139}
{'name': 'CG2', 'number': 15, 'x': 14.147, 'y': 17.352, 'z': 3.76}
{'name': 'N',

In [12]:
def new_atom():
    """Return a fresh atom record with empty/zero placeholder values.

    The result is a new dict on every call with the keys 'name', 'number',
    'x', 'y' and 'z' (in that order).
    """
    blank_atom = {
        'name': '',
        'number': 0,
        'x': 0.0,
        'y': 0.0,
        'z': 0.0,
    }
    return blank_atom


# Build the full hierarchy: chain id -> list of (residue label, atoms)
# tuples, where atoms is the list of atom dicts belonging to that residue.
chains = {}

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        if not record.startswith('ATOM  '):
            continue

        # List of residue entries for this record's chain (created on first use).
        residues = chains.setdefault(record[21], [])

        # Open a new (label, atoms) entry whenever the residue label changes.
        residue_name = record[17:21].strip()
        residue_number = int(record[22:26])
        residue = f"{residue_name}.{residue_number}"
        if not residues or residues[-1][0] != residue:
            residues.append((residue, []))

        # Atom fields are read from fixed character ranges of the line.
        atom = new_atom()
        atom.update({
            'name': record[12:16].strip(),
            'number': int(record[6:11]),
            'x': float(record[30:38]),
            'y': float(record[38:46]),
            'z': float(record[46:54]),
        })

        # The atom belongs to the most recently opened residue of its chain.
        _, current_atoms = residues[-1]
        current_atoms.append(atom)

print(chains)

{'A': [('ILE.16', [{'name': 'N', 'number': 1, 'x': 16.792, 'y': 12.871, 'z': 4.991}, {'name': 'CA', 'number': 2, 'x': 17.415, 'y': 14.067, 'z': 5.486}, {'name': 'C', 'number': 3, 'x': 17.37, 'y': 15.207, 'z': 4.48}, {'name': 'O', 'number': 4, 'x': 18.04, 'y': 15.12, 'z': 3.445}, {'name': 'CB', 'number': 5, 'x': 18.878, 'y': 13.807, 'z': 5.993}, {'name': 'CG1', 'number': 6, 'x': 18.976, 'y': 12.643, 'z': 6.989}, {'name': 'CG2', 'number': 7, 'x': 19.566, 'y': 15.056, 'z': 6.546}, {'name': 'CD1', 'number': 8, 'x': 18.413, 'y': 13.065, 'z': 8.374}]), ('VAL.17', [{'name': 'N', 'number': 9, 'x': 16.723, 'y': 16.321, 'z': 4.866}, {'name': 'CA', 'number': 10, 'x': 16.732, 'y': 17.608, 'z': 4.076}, {'name': 'C', 'number': 11, 'x': 17.82, 'y': 18.575, 'z': 4.601}, {'name': 'O', 'number': 12, 'x': 17.908, 'y': 18.931, 'z': 5.774}, {'name': 'CB', 'number': 13, 'x': 15.333, 'y': 18.307, 'z': 4.062}, {'name': 'CG1', 'number': 14, 'x': 15.4, 'y': 19.511, 'z': 3.139}, {'name': 'CG2', 'number': 15, 'x'

### 4. Operations over data

In [27]:
def new_atom():
    """Return a fresh atom record with empty/zero placeholder values.

    The result is a new dict on every call with the keys 'name', 'number',
    'x', 'y' and 'z' (in that order).
    """
    blank_atom = {
        'name': '',
        'number': 0,
        'x': 0.0,
        'y': 0.0,
        'z': 0.0,
    }
    return blank_atom


# Build the full hierarchy: chain id -> list of (residue label, atoms)
# tuples, where atoms is the list of atom dicts belonging to that residue.
chains = {}

with open('1ppe.pdb') as pdb_file:
    for record in pdb_file:
        if not record.startswith('ATOM  '):
            continue

        # List of residue entries for this record's chain (created on first use).
        residues = chains.setdefault(record[21], [])

        # Open a new (label, atoms) entry whenever the residue label changes.
        residue_name = record[17:21].strip()
        residue_number = int(record[22:26])
        residue = f"{residue_name}.{residue_number}"
        if not residues or residues[-1][0] != residue:
            residues.append((residue, []))

        # Atom fields are read from fixed character ranges of the line.
        atom = new_atom()
        atom.update({
            'name': record[12:16].strip(),
            'number': int(record[6:11]),
            'x': float(record[30:38]),
            'y': float(record[38:46]),
            'z': float(record[46:54]),
        })

        # The atom belongs to the most recently opened residue of its chain.
        _, current_atoms = residues[-1]
        current_atoms.append(atom)
            
# Summary statistics over the parsed structure. `chains` maps each chain id
# to a list of (residue label, atoms) tuples.
num_chains = len(chains)
print('Number of chains: {}'.format(num_chains))
print()

# Per-chain residue count.
for chain, residues in chains.items():
    print(f'Chain {chain}')
    print('Number of residues: {}'.format(len(residues)))
    print()

# Total atom count: add up the atom list of every residue of every chain.
num_atoms = sum(
    len(atoms)
    for residues in chains.values()
    for _label, atoms in residues
)
print('Number of atoms: {}'.format(num_atoms))

Number of chains: 2

Chain A
Number of residues: 223

Chain B
Number of residues: 29

Number of atoms: 1849
