# SIB KAGGLE PROJECT

In [1]:
#Imports
from utils.func import *
from utils.kmers import KMer_prot
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# First lets see the data!
# Train dataframe
train = pd.read_csv("data/train.csv")
# Test dataframe
test = pd.read_csv("data/test.csv")
print(f"Train data is divided in {train.shape[0]} lines and {train.shape[1]} col")
print(f"Test data is divided in {test.shape[0]} lines and {test.shape[1]} col")
print(f"Labels: {[labels for labels in train.columns]}")
print("Remove data_source and seq_id")
train = train.drop(columns=["seq_id", "data_source"])
test = test.drop(columns=["seq_id", "data_source"])
print(f"Train data is divided in {train.shape[0]} lines and {train.shape[1]} col")
print(f"Test data is divided in {test.shape[0]} lines and {test.shape[1]} col")
print("We want to predict tm values for test data")


Train data is divided in 31390 lines and 5 col
Test data is divided in 2413 lines and 4 col
Labels: ['seq_id', 'protein_sequence', 'pH', 'data_source', 'tm']
Remove data_source and seq_id
Train data is divided in 31390 lines and 3 col
Test data is divided in 2413 lines and 2 col
We want to predict tm values for test data


### The data possesses swapped values between the pH and tm columns, then, "swap_ph_tm" was built to fix this issue.

In [3]:
update_train = pd.read_csv("data/train_updates_20220929.csv")
train = swap_ph_tm(train, update_train)

In [4]:
train

Unnamed: 0,protein_sequence,pH,tm
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,,
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,,
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,,
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,,
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,,
...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


In [5]:
print(train.isnull().sum().sort_values(ascending=False))
print()
print(test.isnull().sum().sort_values(ascending=False))
# There are some missing values in train
# Data_source values are not that important


pH                  2694
tm                  2409
protein_sequence       0
dtype: int64

protein_sequence    0
pH                  0
dtype: int64


In [6]:
missing_data = train[train["pH"].isnull()]
missing_data


Unnamed: 0,protein_sequence,pH,tm
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,,
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,,
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,,
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,,
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,,
...,...,...,...
28753,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,58.9
28754,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,59.4
28755,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,57.8
28756,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,59.3


In [7]:
train = train.drop((missing_data).index).reset_index(drop=True)
train
# Podemos remover também a data_source? Não deve de trazer nada de relevante para a analise dos dados


Unnamed: 0,protein_sequence,pH,tm
0,AAPDEITTAWPVNVGPLNPHLYTPNQMFAQSMVYEPLVKYQADGSV...,7.0,48.4
1,AARRFSGPRNQRQQGGGDPGLMHGKTVLITGANSGLGRATAAELLR...,7.0,48.4
2,AASSPEADFVKKTISSHKIVIFSKSYCPYCKKAKSVFRELDQVPYV...,7.0,49.0
3,AATFAYSQSQKRSSSSPGGGSNHGWNNWGKAAALASTTPLVHVASV...,5.5,55.6
4,AAVLVTFIGGLYFITHHKKEESETLQSQKVTGNGLPPKPEERWRYI...,7.0,48.4
...,...,...,...
28691,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
28692,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
28693,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
28694,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


In [8]:
print(f"Train data is divided in {train.shape[0]} lines and {train.shape[1]} col")
print(f"Test data is divided in {test.shape[0]} lines and {test.shape[1]} col")
print(f"Labels: {[labels for labels in train.columns]}")


Train data is divided in 28696 lines and 3 col
Test data is divided in 2413 lines and 2 col
Labels: ['protein_sequence', 'pH', 'tm']


In [9]:
train.describe()

Unnamed: 0,pH,tm
count,28696.0,28696.0
mean,6.852437,49.079321
std,1.168815,14.210971
min,1.99,-1.0
25%,7.0,41.9
50%,7.0,48.0
75%,7.0,53.8
max,64.9,130.0


In [10]:
train_array = np.array(train.loc[:, ["protein_sequence"]])
test_array = np.array(test.loc[:, ["protein_sequence"]])

In [11]:
train

Unnamed: 0,protein_sequence,pH,tm
0,AAPDEITTAWPVNVGPLNPHLYTPNQMFAQSMVYEPLVKYQADGSV...,7.0,48.4
1,AARRFSGPRNQRQQGGGDPGLMHGKTVLITGANSGLGRATAAELLR...,7.0,48.4
2,AASSPEADFVKKTISSHKIVIFSKSYCPYCKKAKSVFRELDQVPYV...,7.0,49.0
3,AATFAYSQSQKRSSSSPGGGSNHGWNNWGKAAALASTTPLVHVASV...,5.5,55.6
4,AAVLVTFIGGLYFITHHKKEESETLQSQKVTGNGLPPKPEERWRYI...,7.0,48.4
...,...,...,...
28691,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8
28692,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2
28693,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6
28694,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7


In [12]:
# Train dipeptide
train_dipeptide = calculate_dipeptide_composition(train_array)
train_dipeptide_df = pd.DataFrame(train_dipeptide)

# Test dipeptide
test_dipeptide = calculate_dipeptide_composition(test_array)
test_dipeptide_df = pd.DataFrame(test_dipeptide)
test_dipeptide_df


### Add values to Train data

In [None]:
molecular_weight = calculate_molecular_weight(train_array)
isoelectric_point = calculate_isoelectric_point(train_array)
aromaticity = calculate_aromaticity(train_array)
instability_index = calculate_instability_index(train_array)

train["molecular_weight"] = molecular_weight
train["isoelectric_point"] = isoelectric_point
train["aromaticity"] = aromaticity
train["instability_index"] = instability_index

# Merge the dipeptide features
train = train.join(train_dipeptide_df)
train

KeyboardInterrupt: 

### Add values to Test data

In [None]:
molecular_weight = calculate_molecular_weight(test_array)
isoelectric_point = calculate_isoelectric_point(test_array)
aromaticity = calculate_aromaticity(test_array)
instability_index = calculate_instability_index(test_array)

test["molecular_weight"] = molecular_weight
test["isoelectric_point"] = isoelectric_point
test["aromaticity"] = aromaticity
test["instability_index"] = instability_index

# Merge the dipeptide features
test = test.join(test_dipeptide_df)
test

Unnamed: 0,protein_sequence,pH,molecular_weight,isoelectric_point,aromaticity,instability_index,AA,AR,AN,AD,...,VL,VK,VM,VF,VP,VS,VT,VW,VY,VV
0,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23910.2619,8.751535,0.099548,43.819955,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
1,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23909.3202,9.024494,0.099548,43.819955,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,23781.1479,8.907484,0.100000,43.973682,0.46,0.0,0.91,0.46,...,0.91,0.46,0.0,0.0,0.46,0.0,0.0,0.00,0.0,0.0
3,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23869.2761,8.699638,0.099548,45.295068,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
4,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23913.3070,8.761012,0.104072,43.819955,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,8.907484,0.099548,44.307285,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2409,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23936.3852,8.907484,0.099548,43.435792,0.45,0.0,0.91,0.45,...,1.36,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2410,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23937.3302,8.907484,0.099548,43.435792,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.45,0.0,0.0,0.00,0.0,0.0
2411,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,23920.3428,8.907484,0.099548,44.307285,0.45,0.0,0.91,0.45,...,0.91,0.45,0.0,0.0,0.91,0.0,0.0,0.00,0.0,0.0


# Supervised Machine Learning

## KNNClassifier Model

#### Verificação da Média e Desvio Padrão dos dados de treino:

In [None]:
train_input = train.loc[:, ~train.columns.isin(["protein_sequence", "tm"])].values
pre_processed_train = preprocessing.scale(train_input)

print("Media global: ", pre_processed_train.mean())
print("Desvio padrao global: ", pre_processed_train.std())
print("\nVerificar se a média e o desvio padrão estão próximos dos valores 0 e 1, respetivamente.")
print("\tMédia:", ((pre_processed_train.mean(axis=0) < 0.000001) & (pre_processed_train.mean(axis=0) > -0.000001)).all())
print("\tDesvio Padrão:", ((pre_processed_train.std(axis=0) < 1.000001) & (pre_processed_train.std(axis=0) > 0.999999)).all())

Media global:  nan
Desvio padrao global:  nan

Verificar se a média e o desvio padrão estão próximos dos valores 0 e 1, respetivamente.
	Média: False
	Desvio Padrão: False


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Divide the train data to input and output
train_output = train.loc[:, "tm"].values

# Divide the test data to input and output
test_input = test.loc[:, ~test.columns.isin(["protein_sequence", "tm"])].values

knn = KNeighborsRegressor()
knn.fit(train_input, train_output)
test_prevision = knn.predict(test_input)
print(test_prevision)

ValueError: Input X contains NaN.
KNeighborsRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values