# SIB KAGGLE PROJECT

In [1]:
#Imports
from utils.func import *
from utils.kmers import KMer_prot
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# First lets see the data!
# Train dataframeW
data = pd.read_csv("data/train.csv")
# Test dataframe
validation = pd.read_csv("data/test.csv")
print(f"Data is divided in {data.shape[0]} lines and {data.shape[1]} col")
print(f"Validation data is divided in {validation.shape[0]} lines and {validation.shape[1]} col")
print(f"Labels: {[labels for labels in data.columns]}")
print("Remove data_source and seq_id")
data = data.drop(columns=["seq_id", "data_source"])
validation = validation.drop(columns=["seq_id", "data_source"])
print(f"Data is divided in {data.shape[0]} lines and {data.shape[1]} col")
print(f"Validation data is divided in {validation.shape[0]} lines and {validation.shape[1]} col")
print("We want to predict tm values for test data")


Data is divided in 31390 lines and 5 col
Validation data is divided in 2413 lines and 4 col
Labels: ['seq_id', 'protein_sequence', 'pH', 'data_source', 'tm']
Remove data_source and seq_id
Data is divided in 31390 lines and 3 col
Validation data is divided in 2413 lines and 2 col
We want to predict tm values for test data


### The data possesses swapped values between the pH and tm columns, then, "swap_ph_tm" was built to fix this issue.

In [3]:
update_train = pd.read_csv("data/train_updates_20220929.csv")
data = swap_ph_tm(data, update_train)

In [4]:
data

Unnamed: 0,protein_sequence,pH,tm
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,,
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,,
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,,
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,,
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,,
...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


In [5]:
print(data.isnull().sum().sort_values(ascending=False))
print()
print(validation.isnull().sum().sort_values(ascending=False))
# There are some missing values in train
# Data_source values are not that important


pH                  2694
tm                  2409
protein_sequence       0
dtype: int64

protein_sequence    0
pH                  0
dtype: int64


In [6]:
missing_data = data[data["pH"].isnull()]
missing_data


Unnamed: 0,protein_sequence,pH,tm
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,,
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,,
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,,
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,,
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,,
...,...,...,...
28753,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,58.9
28754,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,59.4
28755,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,57.8
28756,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,59.3


In [7]:
data = data.drop((missing_data).index).reset_index(drop=True)
data
# Podemos remover também a data_source? Não deve de trazer nada de relevante para a analise dos dados


Unnamed: 0,protein_sequence,pH,tm
0,AAPDEITTAWPVNVGPLNPHLYTPNQMFAQSMVYEPLVKYQADGSV...,7.0,48.4
1,AARRFSGPRNQRQQGGGDPGLMHGKTVLITGANSGLGRATAAELLR...,7.0,48.4
2,AASSPEADFVKKTISSHKIVIFSKSYCPYCKKAKSVFRELDQVPYV...,7.0,49.0
3,AATFAYSQSQKRSSSSPGGGSNHGWNNWGKAAALASTTPLVHVASV...,5.5,55.6
4,AAVLVTFIGGLYFITHHKKEESETLQSQKVTGNGLPPKPEERWRYI...,7.0,48.4
...,...,...,...
28691,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
28692,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
28693,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
28694,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


In [9]:
print(f"Train data is divided in {data.shape[0]} lines and {data.shape[1]} col")
print(f"Validation data is divided in {validation.shape[0]} lines and {validation.shape[1]} col")
print(f"Labels: {[labels for labels in data.columns]}")
# 28696 


Train data is divided in 28696 lines and 3 col
Validation data is divided in 2413 lines and 2 col
Labels: ['protein_sequence', 'pH', 'tm']


In [None]:
data.describe()

Unnamed: 0,pH,tm
count,28696.0,28696.0
mean,6.852437,49.079321
std,1.168815,14.210971
min,1.99,-1.0
25%,7.0,41.9
50%,7.0,48.0
75%,7.0,53.8
max,64.9,130.0


In [12]:
data_array = np.array(data.loc[:, ["protein_sequence"]])
# test_array = np.array(test.loc[:, ["protein_sequence"]])

data_array

array([['AAPDEITTAWPVNVGPLNPHLYTPNQMFAQSMVYEPLVKYQADGSVIPWLAKSWTHSEDGKTWTFTLRDDVKFSNGEPFDAEAAAENFRAVLDNRQRHAWLELANQIVDVKALSKTELQITLKSAYYPFLQELALPRPFRFIAPSQFKNHETMNGIKAPIGTGPWILQESKLNQYDVFVRNENYWGEKPAIKKITFNVIPDPTTRAVAFETGDIDLLYGNEGLLPLDTFARFSQNPAYHTQLSQPIETVMLALNTAKAPTNELAVREALNYAVNKKSLIDNALYGTQQVADTLFAPSVPYANLGLKPSQYDPQKAKALLEKAGWTLPAGKDIREKNGQPLRIELSFIGTDALSKSMAEIIQADMRQIGADVSLIGEEESSIYARQRDGRFGMIFHRTWGAPYDPHAFLSSMRVPSHADFQAQQGLADKPLIDKEIGEVLATHDETQRQALYRDILTRLHDEAVYLPISYISMMVVSKPELGNIPYAPIATEIPFEQIKPVK'],
       ['AARRFSGPRNQRQQGGGDPGLMHGKTVLITGANSGLGRATAAELLRLGARVIMGCRDRARAEEAAGQLRQELCQAGGAGPDGTDGQLVVKELDLASLRSVRAFCQELLQEEPRLDVLINNAGVFHCPYTKTEDGFEMQFGVNHLGHFLLTNLLLGLLKSSAPSRIVVVSSKLYKYGEINFEDLNSEQSYNKSFCYSRSKLANILFTRELARRLEGTNVTVNVLHPGIVRTNLGRHIHIPLLARPLFNLVSWAFFKTPLEGAQTSIYLACSPDVEGVSGRYFGDCKEEELLPKAMDESVARKLWDISEVMVGIL'],
       ['AASSPEADFVKKTISSHKIVIFSKSYCPYCKKAKSVFRELDQVPYVVELDEREDGWSIQTALGEIVGRRTVPQVFINGKHLGGSDDTVDAYESGELAKLLGVSGNKEAE'],
       ...,
       ['YYQRTLGAELLYKISFG

In [14]:
# Data dipeptide
data_dipeptide = calculate_dipeptide_composition(data_array)
data_dipeptide_df = pd.DataFrame(data_dipeptide)

# Test dipeptide
# test_dipeptide = calculate_dipeptide_composition(test_array)
# test_dipeptide_df = pd.DataFrame(test_dipeptide)
# test_dipeptide_df


### Add values to Train data

In [15]:
molecular_weight = calculate_molecular_weight(data_array)
isoelectric_point = calculate_isoelectric_point(data_array)
aromaticity = calculate_aromaticity(data_array)
instability_index = calculate_instability_index(data_array)

data["molecular_weight"] = molecular_weight
data["isoelectric_point"] = isoelectric_point
data["aromaticity"] = aromaticity
data["instability_index"] = instability_index

# Merge the dipeptide features
data = data.join(data_dipeptide_df)
data

Unnamed: 0,protein_sequence,pH,tm,molecular_weight,isoelectric_point,aromaticity,instability_index,AA,AR,AN,...,VL,VK,VM,VF,VP,VS,VT,VW,VY,VV
0,AAPDEITTAWPVNVGPLNPHLYTPNQMFAQSMVYEPLVKYQADGSV...,7.0,48.4,56204.2095,5.507664,0.097804,31.797804,0.40,0.40,0.40,...,0.40,0.80,0.20,0.20,0.40,0.40,0.00,0.00,0.40,0.20
1,AARRFSGPRNQRQQGGGDPGLMHGKTVLITGANSGLGRATAAELLR...,7.0,48.4,34386.0086,8.195366,0.070288,41.705431,0.96,1.92,0.64,...,0.96,0.32,0.32,0.32,0.00,0.96,0.32,0.00,0.00,0.64
2,AASSPEADFVKKTISSHKIVIFSKSYCPYCKKAKSVFRELDQVPYV...,7.0,49.0,11936.2894,5.417575,0.082569,28.518349,0.93,0.00,0.00,...,0.00,0.93,0.00,1.85,1.85,0.93,0.00,0.00,0.00,0.93
3,AATFAYSQSQKRSSSSPGGGSNHGWNNWGKAAALASTTPLVHVASV...,5.5,55.6,37040.5511,5.361247,0.130699,27.470243,0.91,0.00,0.91,...,0.30,0.30,0.00,0.30,0.00,0.00,0.30,0.00,0.30,0.30
4,AAVLVTFIGGLYFITHHKKEESETLQSQKVTGNGLPPKPEERWRYI...,7.0,48.4,31215.4943,9.511939,0.046763,67.340288,1.81,0.36,0.00,...,0.36,0.72,0.00,0.00,0.36,0.00,0.72,0.00,0.00,0.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28691,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,61997.6230,8.466327,0.089253,48.604026,0.73,0.73,0.00,...,0.55,0.18,0.18,0.55,0.55,0.36,0.00,0.55,0.18,0.91
28692,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,52637.6897,6.532244,0.087420,35.285096,1.07,0.21,0.85,...,0.64,0.43,0.21,0.21,0.21,0.43,0.00,0.00,0.21,0.85
28693,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,14203.9225,5.271214,0.117188,44.198437,0.79,0.00,1.57,...,0.79,0.79,0.00,0.00,0.79,0.00,0.79,0.00,0.00,0.79
28694,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,64367.8724,5.688128,0.074199,36.618398,1.01,0.68,0.00,...,1.01,0.34,0.34,0.51,0.17,0.84,0.34,0.00,0.17,1.01


### Add values to Test data

In [None]:
# molecular_weight = calculate_molecular_weight(test_array)
# isoelectric_point = calculate_isoelectric_point(test_array)
# aromaticity = calculate_aromaticity(test_array)
# instability_index = calculate_instability_index(test_array)

# test["molecular_weight"] = molecular_weight
# test["isoelectric_point"] = isoelectric_point
# test["aromaticity"] = aromaticity
# test["instability_index"] = instability_index

# # Merge the dipeptide features
# test = test.join(test_dipeptide_df)
# test

Unnamed: 0,protein_sequence,pH,molecular_weight,isoelectric_point,aromaticity,instability_index,AA,AR,AN,AD,...,VL,VK,VM,VF,VP,VS,VT,VW,VY,VV
0,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23910.2619,8.751535,0.099548,43.819955,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
1,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23909.3202,9.024494,0.099548,43.819955,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,23781.1479,8.907484,0.100000,43.973682,0.46,0.0,0.91,0.46,...,0.91,0.46,0.0,0.0,0.46,0.0,0.0,0.00,0.0,0.0
3,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23869.2761,8.699638,0.099548,45.295068,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
4,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23913.3070,8.761012,0.104072,43.819955,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,8.907484,0.099548,44.307285,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2409,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,8.907484,0.099548,43.435792,0.45,0.0,0.91,0.45,...,1.36,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2410,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23937.3302,8.907484,0.099548,43.435792,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2411,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23920.3428,8.907484,0.099548,44.307285,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.91,0.0,0.0,0.00,0.0,0.0


# Supervised Machine Learning

## Pre-Processing

#### Verificação da Média e Desvio Padrão dos dados de treino:

In [16]:
data_input = data.loc[:, ~data.columns.isin(["protein_sequence", "tm"])].values
data_output = data.loc[:, "tm"].values

pre_processed_data = preprocessing.scale(data_input)

print("Media global: ", pre_processed_data.mean())
print("Desvio padrao global: ", pre_processed_data.std())
print("\nVerificar se a média e o desvio padrão estão próximos dos valores 0 e 1, respetivamente.")
print("\tMédia:", ((pre_processed_data.mean(axis=0) < 0.000001) & (pre_processed_data.mean(axis=0) > -0.000001)).all())
print("\tDesvio Padrão:", ((pre_processed_data.std(axis=0) < 1.000001) & (pre_processed_data.std(axis=0) > 0.999999)).all())

Media global:  -3.815033945577674e-18
Desvio padrao global:  1.0

Verificar se a média e o desvio padrão estão próximos dos valores 0 e 1, respetivamente.
	Média: True
	Desvio Padrão: True


### Divide the train and test data

In [18]:
num_test = int(pre_processed_data.shape[0] * 0.3)

print("Numero de exemplos para teste: ", num_test)

indices = np.random.permutation(len(data_input))

# Get the Input data pre-processed according with the indexes
train_input = pre_processed_data[indices[:-num_test]]
test_input = pre_processed_data[indices[-num_test:]]

# Get the output data according with the indexes
train_output = data_output[indices[:-num_test]]
test_output = data_output[indices[-num_test:]]

Numero de exemplos para teste:  8608
[11759  5449 21163 ... 23864  2205   481]


## KNNClassifier Model

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Divide the train data to input and output
train_output = train.loc[:, "tm"].values

# Divide the test data to input and output
test_input = test.loc[:, ~test.columns.isin(["protein_sequence", "tm"])].values

knn = KNeighborsRegressor()
knn.fit(train_input, train_output)
test_prevision = knn.predict(test_input)
print(test_prevision)

[46.48 47.66 54.64 ... 43.78 42.22 56.1 ]
