# Notebook Monica

## Importação de Packages Necessários

In [1]:
#Imports
from utils.func import swap_ph_tm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Exploração de Dados e pré-processamento

Esta etapa deve corresponder à secção 1 do Notebook onde deverá:
- descrever e caracterizar os dados atribuídos de acordo com a documentação/
literatura existente;
- descrever sucintamente as características dos dados disponíveis a partir da análise exploratória inicial;
- descrever os passos de preparação dos dados e pré-processamento que efetuou, justificando as suas escolhas;
- incluir os gráficos exploratórios iniciais que ilustrem as principais características dos dados.

In [2]:
#First lets see the data!

#Train dataframe
train = pd.read_csv("data/train.csv",index_col="seq_id")
#Test dataframe
test = pd.read_csv("data/test.csv",index_col="seq_id")

print(f"Train data is divided in {train.shape[0]} lines and {train.shape[1]} columns")
print(f"Test data is divided in {test.shape[0]} lines and {test.shape[1]} columns")
print(f"Labels: {[labels for labels in train.columns]}")

Train data is divided in 31390 lines and 4 columns
Test data is divided in 2413 lines and 3 columns
Labels: ['protein_sequence', 'pH', 'data_source', 'tm']


In [3]:
test.describe()

Unnamed: 0,pH
count,2413.0
mean,8.0
std,0.0
min,8.0
25%,8.0
50%,8.0
75%,8.0
max,8.0


In [4]:
train.describe()

Unnamed: 0,pH,tm
count,31104.0,31390.0
mean,6.892339,49.147337
std,1.612225,14.010089
min,1.99,-1.0
25%,7.0,42.1
50%,7.0,48.0
75%,7.0,53.8
max,64.9,130.0


Aqui é possível verificar que a variável pH tem erros no dataset "train", uma vez que o seu máximo é de 64.9 (valor impossível de obter). Segundo a fonte dos dados, a variável pH e tm tem alguns exemplos com os valores trocados. Assim, é necessário fazer a troca desses valores nas sequências identificadas (dataset "train_updates").

In [5]:
update_train= pd.read_csv("data/train_updates_20220929.csv",index_col="seq_id")
train= swap_ph_tm(train,update_train)


In [6]:
print(f"Labels: {[labels for labels in train.columns]}")
print(f"Labels: {[labels for labels in test.columns]}")

Labels: ['protein_sequence', 'pH', 'data_source', 'tm']
Labels: ['protein_sequence', 'pH', 'data_source']


In [3]:
print(train.data_source)

seq_id
0        doi.org/10.1038/s41592-020-0801-4
1        doi.org/10.1038/s41592-020-0801-4
2        doi.org/10.1038/s41592-020-0801-4
3        doi.org/10.1038/s41592-020-0801-4
4        doi.org/10.1038/s41592-020-0801-4
                       ...                
31385    doi.org/10.1038/s41592-020-0801-4
31386    doi.org/10.1038/s41592-020-0801-4
31387    doi.org/10.1038/s41592-020-0801-4
31388    doi.org/10.1038/s41592-020-0801-4
31389    doi.org/10.1038/s41592-020-0801-4
Name: data_source, Length: 31390, dtype: object


In [4]:
sources = {}
count = 1
for n, i in enumerate(train.data_source):
    if i not in sources.keys():
        sources[i] = count
        train.data_source[n] = sources[i]
        count += 1
    else:
        train.data_source[n] = sources[i]

print(train.data_source)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.data_source[n] = sources[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.data_source[n] = sources[i]


seq_id
0        1
1        1
2        1
3        1
4        1
        ..
31385    1
31386    1
31387    1
31388    1
31389    1
Name: data_source, Length: 31390, dtype: object


In [5]:
print(min(train.data_source))
print(max(train.data_source))

1
325


In [13]:
print("Remove data_source")
train= train.drop(columns="data_source")
test= test.drop(columns="data_source")
print(f"Train data is divided in {train.shape[0]} lines and {train.shape[1]} col")
print(f"Test data is divided in {test.shape[0]} lines and {test.shape[1]} col")
print("We want to predict tm values for test data")

Remove data_source
Train data is divided in 31390 lines and 3 col
Test data is divided in 2413 lines and 2 col
We want to predict tm values for test data


In [111]:
print(train.isnull().sum().sort_values(ascending=False))
print(test.isnull().sum().sort_values(ascending=False))
#There are some missing values in train
#Data_source values are not that important

pH                  286
protein_sequence      0
data_source           0
tm                    0
dtype: int64
protein_sequence    0
pH                  0
data_source         0
dtype: int64


In [105]:
missing_data= train[train["pH"].isnull()]
missing_data

Unnamed: 0_level_0,protein_sequence,pH,data_source,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
484,ARINTVRGPITISEAGFTLTHEHICGSSAGFLRAWPEFFGSRKALA...,,5,84.0
6215,MASHKLLVTPLKALLKPLSIPNQLLLGPGPSNLPPRIMAAGGLQMI...,,35,73.8
6216,MASHKLLVTPPKALLKPLSIPNQLLLGPGPSNLPPRIMAAGGLQMI...,,35,85.5
6217,MASHKLLVTPPKALLKPLSIPNQLLLGPGPSNLPPRIMAAGGLQMI...,,35,67.1
6218,MASHKLLVTPPKALLKPLSIPNQLLLGPGPSNLPPRIMAAGGLQMI...,,35,83.2
...,...,...,...,...
28753,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,317,58.9
28754,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,317,59.4
28755,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,317,57.8
28756,MVLKQRANYLGFLIVFFTAFLVEAVPIKRQSNSTVDSLPPLIPSRT...,,317,59.3


In [88]:
train= train.drop((missing_data).index)
train
#Podemos remover também a data_source? Não deve de trazer nada de relevante para a analise dos dados

Unnamed: 0_level_0,protein_sequence,pH,data_source,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,1,75.7
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,1,50.5
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,1,40.5
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,1,47.2
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,1,49.5
...,...,...,...,...
31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,1,51.8
31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,1,37.2
31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,1,64.6
31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,1,50.7
