In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection, preprocessing, feature_extraction, linear_model

In [1]:
# Notebook-wide constants.
SEED = 42  # presumably the random seed for later stochastic steps; not used in the cells shown here (TODO confirm)
N_FOLDS = 5  # presumably the number of cross-validation folds; not used in the cells shown here (TODO confirm)

#### Finding Groups in the Train Data
It has been observed that the training data contains mutations. We can use the training set to see how mutations affect the target. This is a required step because the test data consists entirely of mutations.

In [4]:
# Read the train and test tables from the working directory, then preview
# the first rows of the training table.
train_df = pd.read_csv(filepath_or_buffer='./train.csv')
test_df = pd.read_csv(filepath_or_buffer='./test.csv')
train_df.head(5)

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


#### Find Mutations, Insertions, Deletions
Below we find mutations, insertions, and deletions in the training set. For each observed protein sequence length, we compare all rows whose sequence length is within ± `DEL_THRESH` of that length.

In [28]:
# Store each sequence's length in the helper column 'x', then tally how many
# rows share each length.
train_df['x'] = train_df['protein_sequence'].str.len()
vc = train_df['x'].value_counts()

In [6]:
# vc has one entry per distinct sequence length, so its size is the number
# of unique lengths present in the training set.
len(vc)

1965

There are 1965 unique protein sequence lengths in the training set.

In [29]:
# Custom Levenshtein distance-matrix function (from Stack Overflow), since no ready-made matrix function was found; the pairwise distances come from the `Levenshtein` package.

from Levenshtein import distance

def get_distance_matrix(str_list):
    """Build the square matrix of pairwise `distance` values for `str_list`.

    Entry [i][j] is distance(str_list[i], str_list[j]) for i != j. The matrix
    is symmetric and its diagonal stays at the zeros it was initialised with.

    Args:
        str_list: indexable sequence of strings (the caller passes a list).

    Returns:
        A len(str_list) x len(str_list) array created with np.zeros.
    """
    size = len(str_list)
    matrix = np.zeros(shape=(size, size))

    # Evaluate every unordered pair once and mirror it across the diagonal.
    for row in range(size):
        for col in range(row + 1, size):
            pair_distance = distance(str_list[row], str_list[col])
            matrix[row][col] = pair_distance
            matrix[col][row] = pair_distance

    return matrix

In [25]:
# Give rows that look like mutations / insertions / deletions of one another a
# shared integer `group` id. Rows that never get matched keep group == -1.
train_df['group'] = -1
grp_idx = 0

MUT_THRESH = 10  # max Levenshtein distance for two sequences to share a group
DEL_THRESH = 3   # half-width of the sequence-length window scanned around each length

for k in range(len(vc)):
    c = vc.index[k]

    # Still-ungrouped rows whose sequence length is within DEL_THRESH of c.
    tmp = train_df.loc[(train_df.x >= c - DEL_THRESH) &
                       (train_df.x <= c + DEL_THRESH) &
                       (train_df.group == -1)]
    # Nothing to pair up in this window: move on to the next length.
    # (`break` here would abandon every length that has not been visited yet.)
    if len(tmp) <= 1:
        continue

    # Pairwise Levenshtein distances between the candidate sequences.
    dist = get_distance_matrix(tmp.protein_sequence.tolist())

    # Positions (within tmp) of rows that have at least one non-identical
    # neighbour within MUT_THRESH edits.
    mut_rows = np.unique(np.where((dist > 0) & (dist <= MUT_THRESH))[0])
    seen = set()
    for j in mut_rows:
        if j in seen:
            continue
        # Row j seeds a new group made of every row within MUT_THRESH of it;
        # this includes j itself and exact duplicates (distance 0).
        # NOTE: a row labelled by an earlier seed in this window is re-labelled
        # here if it is also within range of j.
        members = np.where(dist[j] <= MUT_THRESH)[0]
        seen.update(members.tolist())
        idx = tmp.iloc[members].index
        train_df.loc[idx, 'group'] = grp_idx
        grp_idx += 1

Saving the new dataset with a group column.

In [27]:
# Remove the helper column 'x' so that it is not written out, save the
# table, and preview the result.
train_df = train_df.drop(columns=['x'])
train_df.to_csv('train_with_groups.csv', index=False)
train_df.head(5)

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm,group
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7,-1
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5,-1
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5,-1
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2,-1
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5,-1
