In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
from math import isnan

# Visualization plots
from matplotlib import pyplot as plt
import seaborn as sns

# ML Data Preparation
from sklearn.preprocessing import LabelEncoder # labeling of categorical target variables
from sklearn.preprocessing import StandardScaler # scale values to standard normal distribution X~N(0,1)
from sklearn.model_selection import train_test_split # train-test data split

# ML Feature Importance Analysis
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

# ML Techniques
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ML Model Evaluation
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# Support Vector Machine
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC

### Special Requirement - merge the replacement values from the auxiliary file (as specified in the project description)

In [2]:
df = pd.read_csv('train.csv')
df_replace = pd.read_csv('train_updates_20220929.csv')
df = df.drop('data_source', axis=1)
df_replace = df_replace.drop('data_source', axis=1)

df = df.merge(df_replace, on='seq_id', how='left')
df = df.loc[:,~df.columns.str.match(r'(.*?)\_y')]

In [3]:
df = df.rename(columns=lambda x: re.sub(r'\_x', '', x))

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seq_id,31390.0,15694.5,9061.656811,0.0,7847.25,15694.5,23541.75,31389.0
pH,31104.0,6.892339,1.612225,1.99,7.0,7.0,7.0,64.9
tm,31390.0,49.147337,14.010089,-1.0,42.1,48.0,53.8,130.0


# 1. Data Preparation

## 1.1 Create new columns/features

In [9]:
df_n = df.groupby(['protein_sequence','pH']).agg({'tm': 'max'})
df_n = pd.DataFrame(df_n).reset_index()
df_n['len_of_seq'] = df_n['protein_sequence'].apply(lambda x:len(x))

df_n['len_of_seq']
df_n = df_n[(df_n['len_of_seq'] < 300) & (df_n['len_of_seq'] > 200)].reset_index().drop(['len_of_seq','index'],axis=1)
df_n

Unnamed: 0,protein_sequence,pH,tm
0,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5
1,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2
2,AAEYAVVLKTLSNPFWVDMKKGIEDEAKTLGVSVDIFASPSEGDFQ...,7.0,48.1
3,AAGGQPQGATPGQPDQNFDYMFKLLIIGNSSVGKTSFLFRYCDDSF...,7.0,45.1
4,AAIDPNRIVALEWLPVELLLALGIVPYGVADTINYRLWVSEPPLPD...,7.0,60.6
...,...,...,...
5400,YSIYHHVKQTANQMHENVNWKSEKRKENVSFERKTPISILLIGVDE...,7.0,68.2
5401,YTGSVSILAALVDSLVDIGASLTNLLVVRYSLQPADDNHSFGHGKA...,7.0,62.4
5402,YVFDCLDDCECDTVDEVIHCHNGDRTKLKLPASSRLRGFPVIGLTY...,7.0,44.6
5403,YVSILLQSDKKLTQEQVSDSQVLIRSRVLRENGKYIPKQSFLTRKY...,7.0,49.4


In [10]:
for i in range(80):
    df_n['AA_'+ str(i+1)] = df_n['protein_sequence'].apply(lambda x:x[3*i:3*i+3] if len(x)/(3*(i+1))>1 else np.nan)
    

In [11]:
df_n

Unnamed: 0,protein_sequence,pH,tm,AA_1,AA_2,AA_3,AA_4,AA_5,AA_6,AA_7,...,AA_71,AA_72,AA_73,AA_74,AA_75,AA_76,AA_77,AA_78,AA_79,AA_80
0,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,AAA,DGE,PLH,NEE,ERA,GAG,QVG,...,MGL,RTQ,DAI,NRI,QDL,LTE,GTL,TGV,IDD,RGK
1,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,AAA,SGL,RTA,IPA,QPL,RHL,LQP,...,NTD,SLD,WAL,YDH,LMD,FLA,DRG,VDN,TFA,DEL
2,AAEYAVVLKTLSNPFWVDMKKGIEDEAKTLGVSVDIFASPSEGDFQ...,7.0,48.1,AAE,YAV,VLK,TLS,NPF,WVD,MKK,...,AVA,NAG,KTG,KVL,VVG,TDG,IPE,ARK,MVE,AGQ
3,AAGGQPQGATPGQPDQNFDYMFKLLIIGNSSVGKTSFLFRYCDDSF...,7.0,45.1,AAG,GQP,QGA,TPG,QPD,QNF,DYM,...,PAQ,QQC,,,,,,,,
4,AAIDPNRIVALEWLPVELLLALGIVPYGVADTINYRLWVSEPPLPD...,7.0,60.6,AAI,DPN,RIV,ALE,WLP,VEL,LLA,...,DNS,KDM,DAL,MAT,PLW,QAM,PFV,RAG,RFQ,RVP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5400,YSIYHHVKQTANQMHENVNWKSEKRKENVSFERKTPISILLIGVDE...,7.0,68.2,YSI,YHH,VKQ,TAN,QMH,ENV,NWK,...,IGK,NVK,TNL,TFE,EMK,EIQ,ANY,KDA,RKH,IKQ
5401,YTGSVSILAALVDSLVDIGASLTNLLVVRYSLQPADDNHSFGHGKA...,7.0,62.4,YTG,SVS,ILA,ALV,DSL,VDI,GAS,...,RFI,QIH,LEM,EDS,LPL,VQA,HMV,ADQ,VEQ,AIL
5402,YVFDCLDDCECDTVDEVIHCHNGDRTKLKLPASSRLRGFPVIGLTY...,7.0,44.6,YVF,DCL,DDC,ECD,TVD,EVI,HCH,...,,,,,,,,,,
5403,YVSILLQSDKKLTQEQVSDSQVLIRSRVLRENGKYIPKQSFLTRKY...,7.0,49.4,YVS,ILL,QSD,KKL,TQE,QVS,DSQ,...,ARD,LHF,EGM,FKK,ELQ,,,,,


In [12]:
a = df_n.groupby(['pH','AA_1']).agg({'tm': 'mean'}).reset_index()
a

Unnamed: 0,pH,AA_1,tm
0,4.50,MHS,65.550000
1,5.00,MLV,42.314286
2,5.20,MLV,56.200000
3,5.28,MLV,38.300000
4,5.35,MLV,37.600000
...,...,...,...
1100,8.00,MST,49.000000
1101,8.20,MLV,20.000000
1102,9.00,MER,37.028571
1103,9.00,MKI,81.150000


In [13]:
new = df_n.iloc[:,0:3]
for i in range(80):
    a = df_n.groupby(['pH','AA_'+str(i+1)]).agg({'tm': 'mean'}).reset_index()
    a = pd.merge(df_n.iloc[:,list(range(i+4))],a, on=['pH','AA_'+str(i+1)], how='left')
    a = a.drop('AA_'+str(i+1),axis=1)
    a = a.rename(columns = {'tm_y':'AA_'+str(i+1)})
    new= pd.concat([new,a.iloc[:,-1]],axis=1)
new

Unnamed: 0,protein_sequence,pH,tm,AA_1,AA_2,AA_3,AA_4,AA_5,AA_6,AA_7,...,AA_71,AA_72,AA_73,AA_74,AA_75,AA_76,AA_77,AA_78,AA_79,AA_80
0,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,48.85,50.750000,49.500000,50.000000,49.65,59.516667,50.400000,...,50.50,52.733333,50.50,50.500000,50.600000,48.900,50.500000,48.666667,50.50,50.50
1,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,48.85,62.433333,47.200000,64.300000,46.75,47.600000,47.200000,...,66.70,47.500000,47.20,47.200000,48.650000,46.225,48.966667,47.200000,48.85,64.70
2,AAEYAVVLKTLSNPFWVDMKKGIEDEAKTLGVSVDIFASPSEGDFQ...,7.0,48.1,48.10,48.866667,51.250000,49.650000,57.15,49.150000,48.100000,...,62.15,48.100000,45.55,46.533333,48.100000,47.950,48.100000,44.800000,48.10,48.10
3,AAGGQPQGATPGQPDQNFDYMFKLLIIGNSSVGKTSFLFRYCDDSF...,7.0,45.1,45.10,45.100000,45.100000,45.100000,46.45,41.050000,41.050000,...,45.10,45.100000,,,,,,,,
4,AAIDPNRIVALEWLPVELLLALGIVPYGVADTINYRLWVSEPPLPD...,7.0,60.6,60.60,60.600000,62.500000,51.300000,60.60,60.600000,48.600000,...,60.60,60.600000,60.60,60.600000,60.600000,60.600,60.600000,60.600000,60.60,64.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5400,YSIYHHVKQTANQMHENVNWKSEKRKENVSFERKTPISILLIGVDE...,7.0,68.2,68.20,68.200000,57.750000,68.200000,59.25,59.000000,68.200000,...,68.20,68.200000,61.65,54.850000,60.800000,68.200,60.600000,68.200000,68.20,55.00
5401,YTGSVSILAALVDSLVDIGASLTNLLVVRYSLQPADDNHSFGHGKA...,7.0,62.4,62.40,62.400000,50.466667,53.733333,55.55,50.450000,49.683333,...,62.40,62.400000,50.55,64.066667,51.933333,62.400,62.400000,62.400000,62.40,62.40
5402,YVFDCLDDCECDTVDEVIHCHNGDRTKLKLPASSRLRGFPVIGLTY...,7.0,44.6,44.60,44.600000,44.600000,44.600000,44.60,56.950000,44.600000,...,,,,,,,,,,
5403,YVSILLQSDKKLTQEQVSDSQVLIRSRVLRENGKYIPKQSFLTRKY...,7.0,49.4,49.40,54.444444,49.400000,43.433333,49.40,46.950000,46.100000,...,54.20,49.400000,49.40,49.400000,49.400000,,,,,


In [18]:
new['len_of_seq'] = new['protein_sequence'].apply(lambda x:len(x))

In [21]:
new = pd.DataFrame(new.sort_values(by='protein_sequence',ascending=False))

In [None]:
import re


def find_ngrams(text: str, number: int=3) -> set:
    """
    returns a set of ngrams for the given string
    :param text: the string to find ngrams for
    :param number: the length the ngrams should be. defaults to 3 (trigrams)
    :return: set of ngram strings
    """

    if not text:
        return set()

    words = [f'  {x} ' for x in re.split(r'\W+', text.lower()) if x.strip()]

    ngrams = set()

    for word in words:
        for x in range(0, len(word) - number + 1):
            ngrams.add(word[x:x+number])

    return ngrams


def similarity(text1: str, text2: str, number: int=3) -> float:
    """
    Finds the similarity between 2 strings using ngrams.
    0 being completely different strings, and 1 being equal strings
    """

    ngrams1 = find_ngrams(text1, number)
    ngrams2 = find_ngrams(text2, number)

    num_unique = len(ngrams1 | ngrams2)
    num_equal = len(ngrams1 & ngrams2)

    return float(num_equal) / float(num_unique)

## Test

In [23]:
test = pd.read_csv('test.csv')

In [27]:
test.shape

(2413, 4)

In [None]:
find_similarity(df['protein_sequence'])

In [1]:
import tensorflow as tf

In [2]:
print(tf.__version__)

2.10.0


In [3]:
if tf.test.gpu_device_name():
    print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device:/device:GPU:0
