In [1]:
pip install biopython

Note: you may need to restart the kernel to use updated packages.


In [2]:
from Bio import AlignIO

# Load the multiple sequence alignment (MSA) from an aligned FASTA file.
# AlignIO.read expects the file to contain exactly one alignment.
alignment = AlignIO.read("../data/AIntibody_COMPETITION_1.aligned.fasta", "fasta")

# Print a summary of the alignment (row/column counts and the first records).
print(alignment)

Alignment with 184171 rows and 449 columns
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- parental.0
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.1
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.100
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.1000
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10000
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10001
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10002
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10003
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10004
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10005
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10006
-------------------DIQMTQSPSSVSASVGDRVTITCRA...--- phase1_h1_h2_am.10007
-------------------DIQMTQSPSS

In [3]:
from tqdm import tqdm

def _find_gapped_span(aligned_seq, motif):
    """Locate an ungapped motif inside a gapped (aligned) sequence.

    Gap characters ('-') in ``aligned_seq`` are skipped while matching, so the
    motif may be spread over many alignment columns.

    Args:
        aligned_seq: The aligned sequence (anything convertible with ``str``).
        motif: The ungapped residue string to look for.

    Returns:
        ``(start, end)`` alignment-column indices of the first occurrence,
        where ``start`` is the column of the motif's first residue and ``end``
        is one past the column of its last residue; ``None`` if the motif is
        empty or does not occur.
    """
    aligned_seq = str(aligned_seq)
    motif = str(motif)
    if not motif:
        return None

    for start, first_residue in enumerate(aligned_seq):
        # A match can only begin on a real residue, never on a gap column.
        if first_residue == '-':
            continue
        matched = 0
        for column in range(start, len(aligned_seq)):
            residue = aligned_seq[column]
            if residue == '-':
                continue
            if residue != motif[matched]:
                break
            matched += 1
            if matched == len(motif):
                # The end is derived from the same occurrence that fixed the
                # start, so a repeat of the motif further downstream cannot
                # stretch the span.
                return start, column + 1
    return None


crds = ['QSIGTH', 'GASNLES', 'QQYKAYPLT', 'GYTFTSHA', 'ISPYRGDT']

# The first record of the alignment is used as the reference sequence.
seq = alignment[0].seq

# Alignment columns covered by each motif, including the gap columns that fall
# inside a motif's span in the reference sequence.
consider_positions = []
for crd in crds:
    span = _find_gapped_span(seq, crd)
    if span is None:
        print(f"CRD {crd} not found in sequence")
        continue

    start, end = span
    print(f"CRD {crd} found from {start} to {end}", crd, seq[start:end])
    consider_positions.extend(range(start, end))

consider_positions

CRD QSIGTH found from 54 to 82 QSIGTH QS----I--------G-------T---H
CRD GASNLES found from 114 to 156 GASNLES G--------------------A---S-----N-------LES
CRD QQYKAYPLT found from 203 to 273 QQYKAYPLT QQ----Y----K------------------A--------Y----P-------------------L----T
CRD GYTFTSHA found from 319 to 337 GYTFTSHA GY---TF--T--S---HA
CRD ISPYRGDT found from 360 to 379 ISPYRGDT ISP--YRG------D---T


[54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 244,
 245,
 246,
 247,
 248,
 249,
 250,
 251,
 252,
 253,
 254,
 255,
 256,
 257,
 258,
 259,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 268,
 269,
 270,
 271,
 272,
 319,
 320,
 321,
 322,
 323,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 360,
 361,
 362,
 363,
 364,
 365,
 366,
 367,
 368,
 369,
 370,
 371,
 372,
 3

In [11]:
from collections import Counter

# Number of reference residues numbered under chain 'A' before numbering
# restarts under chain 'B' (the original code switched once wt_position > 107).
# NOTE(review): presumably the length of the first chain in the concatenated
# reference sequence -- confirm against the input data.
CHAIN_A_LENGTH = 108

mutations_to_consider = []

reference = alignment[0].seq
# Set membership keeps the per-column check O(1).
positions_of_interest = set(consider_positions)

# For each selected reference position, record the observed amino-acid
# frequencies (proportional to their counts in the alignment column).
wt_position = 0
chain = 'A'
for i in range(alignment.get_alignment_length()):

    # Switch chains exactly once. Without the chain check, chain 'B' numbering
    # would also restart after CHAIN_A_LENGTH residues and produce duplicate
    # position numbers.
    if chain == 'A' and wt_position >= CHAIN_A_LENGTH:
        wt_position = 0
        chain = 'B'

    # Columns where the reference has a gap carry no reference position.
    if reference[i] == '-':
        continue
    wt_position += 1

    # Skip unselected columns before extracting the (large) column string.
    if i not in positions_of_interest:
        continue

    # Fraction of rows carrying each symbol in this column.
    column = alignment[:, i]
    frequency = {k: v / len(column) for k, v in Counter(column).items()}

    # Ignore columns that are almost entirely gaps.
    most_common = max(frequency, key=frequency.get)
    if most_common == '-' and frequency[most_common] > 0.99:
        continue

    # Drop gaps and unknown residues from the distribution.
    frequency.pop('-', None)
    frequency.pop('X', None)
    if not frequency:
        # Nothing but gaps/unknowns here; avoid dividing by zero below.
        continue

    # Scale so sum is 1
    total = sum(frequency.values())
    frequency = {k: v / total for k, v in frequency.items()}

    mutations_to_consider.append([{'chain':chain, 'position':wt_position, 'frequency':frequency}])
    print(f"Position {i} {chain} {wt_position} {frequency}")

Position 54 A 27 {'Q': 0.7025692184408067, 'E': 0.0941229060024749, 'R': 0.11144727732622478, 'H': 0.07864654030451208, 'T': 0.0001471862887794986, 'P': 0.004137570117912571, 'L': 0.003205390288975747, 'D': 0.0008122502603016775, 'V': 0.000119929568635147, 'K': 0.003363479265812986, 'F': 0.00032708064173221906, 'S': 0.0003052752656167378, 'A': 1.6354032086610952e-05, 'G': 0.0003052752656167378, 'Y': 0.00034343467381883004, 'N': 0.00010902688057740636, 'I': 5.4513440288703186e-06, 'C': 1.6354032086610952e-05}
Position 55 A 28 {'S': 0.4433938168756147, 'N': 0.09387719612861853, 'D': 0.20615486612649914, 'T': 0.06284745431128644, 'A': 0.03081780487672334, 'H': 0.02480749061228038, 'G': 0.05274512680893612, 'R': 0.020628528885918145, 'I': 0.002912774363238178, 'Y': 0.016563687050652927, 'M': 0.001081421825157458, 'P': 0.018933033361048162, 'E': 0.011189183608036212, 'V': 0.00608639419184097, 'K': 0.005890760092817511, 'Q': 0.00013585701321073594, 'F': 0.0012444502410103413, 'L': 0.00041300

In [12]:
import json

# Persist the proportional mutation frequencies as JSON.
with open('../data/proportional_mutations.json', mode='w') as out_file:
    json.dump(mutations_to_consider, fp=out_file)

In [13]:
from collections import Counter

# Number of reference residues numbered under chain 'A' before numbering
# restarts under chain 'B' (the original code switched once wt_position > 107).
# NOTE(review): presumably the length of the first chain in the concatenated
# reference sequence -- confirm against the input data.
CHAIN_A_LENGTH = 108

mutations_to_consider = []

reference = alignment[0].seq
# Set membership keeps the per-column check O(1).
positions_of_interest = set(consider_positions)

# For each selected reference position, record every amino acid observed in the
# alignment column, all with the same (uniform) weight.
wt_position = 0
chain = 'A'
for i in range(alignment.get_alignment_length()):

    # Switch chains exactly once. Without the chain check, chain 'B' numbering
    # would also restart after CHAIN_A_LENGTH residues and produce duplicate
    # position numbers.
    if chain == 'A' and wt_position >= CHAIN_A_LENGTH:
        wt_position = 0
        chain = 'B'

    # Columns where the reference has a gap carry no reference position.
    if reference[i] == '-':
        continue
    wt_position += 1

    # Skip unselected columns before extracting the (large) column string.
    if i not in positions_of_interest:
        continue

    # Fraction of rows carrying each symbol; only used for the gap filter below.
    column = alignment[:, i]
    frequency = {k: v / len(column) for k, v in Counter(column).items()}

    # Ignore columns that are almost entirely gaps.
    most_common = max(frequency, key=frequency.get)
    if most_common == '-' and frequency[most_common] > 0.99:
        continue

    # Drop gaps and unknown residues from the distribution.
    frequency.pop('-', None)
    frequency.pop('X', None)
    if not frequency:
        # Nothing but gaps/unknowns here; avoid dividing by zero below.
        continue

    # Give every observed amino acid the same weight; the weights sum to 1.
    total = len(frequency)
    frequency = {k: 1 / total for k in frequency}

    mutations_to_consider.append([{'chain':chain, 'position':wt_position, 'frequency':frequency}])
    print(f"Position {i} {chain} {wt_position} {frequency}")

Position 54 A 27 {'Q': 0.05555555555555555, 'E': 0.05555555555555555, 'R': 0.05555555555555555, 'H': 0.05555555555555555, 'T': 0.05555555555555555, 'P': 0.05555555555555555, 'L': 0.05555555555555555, 'D': 0.05555555555555555, 'V': 0.05555555555555555, 'K': 0.05555555555555555, 'F': 0.05555555555555555, 'S': 0.05555555555555555, 'A': 0.05555555555555555, 'G': 0.05555555555555555, 'Y': 0.05555555555555555, 'N': 0.05555555555555555, 'I': 0.05555555555555555, 'C': 0.05555555555555555}
Position 55 A 28 {'S': 0.05, 'N': 0.05, 'D': 0.05, 'T': 0.05, 'A': 0.05, 'H': 0.05, 'G': 0.05, 'R': 0.05, 'I': 0.05, 'Y': 0.05, 'M': 0.05, 'P': 0.05, 'E': 0.05, 'V': 0.05, 'K': 0.05, 'Q': 0.05, 'F': 0.05, 'L': 0.05, 'C': 0.05, 'W': 0.05}
Position 60 A 29 {'I': 0.05, 'V': 0.05, 'M': 0.05, 'L': 0.05, 'F': 0.05, 'C': 0.05, 'S': 0.05, 'R': 0.05, 'N': 0.05, 'A': 0.05, 'G': 0.05, 'T': 0.05, 'Y': 0.05, 'D': 0.05, 'P': 0.05, 'Q': 0.05, 'K': 0.05, 'E': 0.05, 'W': 0.05, 'H': 0.05}
Position 69 A 30 {'G': 0.05, 'E': 0.05

In [14]:
import json

# Persist the uniformly weighted mutation table as JSON.
with open('../data/equal_mutations.json', mode='w') as out_file:
    json.dump(mutations_to_consider, fp=out_file)