#Carga de datos

## Carga y tamaño

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests

In [2]:
!mkdir dataset/

In [3]:
df_test = pd.read_csv("dataset/Test_path_dataset.csv", sep=",", dtype=str, skipinitialspace=True)
df_train = pd.read_csv("dataset/Train_path_dataset.csv", sep=",", dtype=str, skipinitialspace=True)
df_val = pd.read_csv("dataset/Val_path_dataset.csv", sep=",", dtype=str, skipinitialspace=True)

In [4]:
print(f"Tes: {df_test.shape}")
print(f"Val: {df_val.shape}")
print(f"Train: {df_train.shape}")

Tes: (531, 11)
Val: (522, 11)
Train: (30076, 11)


## Test

In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Protein_ID            531 non-null    object
 1   Sequence              531 non-null    object
 2   Sequence_Length       531 non-null    object
 3   Type                  531 non-null    object
 4   Tm                    531 non-null    object
 5   Ambiguous_Aminoacids  531 non-null    object
 6   Organism_ID           0 non-null      object
 7   Origin                531 non-null    object
 8   PDB_ID                531 non-null    object
 9   Alpha_ID              531 non-null    object
 10  PDB_Path              531 non-null    object
dtypes: object(11)
memory usage: 45.8+ KB


In [6]:
df_test["Sequence_Length"] = pd.to_numeric(df_test["Sequence_Length"], errors="coerce")
df_test["Tm"] = pd.to_numeric(df_test["Tm"], errors="coerce")
df_test["Organism_ID"] = df_test["Organism_ID"].replace("-", pd.NA)

In [7]:
##quitar columnas sin información
df_test = df_test.drop(columns=[
    'Ambiguous_Aminoacids',
    'Organism_ID',
    'Origin',
    'PDB_ID',
    'Alpha_ID'
])


In [8]:
# Crear una nueva columna con ID único basado en el Protein_ID y su aparición YA QUE HAY UNAS SEQ CON MISMO ID, ya que pueden ser la misma proteina pero mutada
df_test['Unique_ID'] = df_test.groupby('Protein_ID').cumcount()
df_test['Unique_ID'] = df_test['Protein_ID'] + '_' + df_test['Unique_ID'].astype(str)


In [9]:
df_test.head()

Unnamed: 0,Protein_ID,Sequence,Sequence_Length,Type,Tm,PDB_Path,Unique_ID
0,A0A0K2H5Z1,MEKVYGLIGFPVEHSLSPLMHNDAFARLGIPARYHLFSVEPGQVGA...,276,Thermophilic,70.92,estructuras/test/A0A0K2H5Z1.pdb.gz,A0A0K2H5Z1_0
1,A0A0K2H966,MWKKFLSKLGIGAAKVDLVLHRPHVRLGETLEGEFLLEGGSVAQHI...,327,Thermophilic,71.73,estructuras/test/A0A0K2H966.pdb.gz,A0A0K2H966_0
2,A0A0K2H9T2,MKILLAEDDLHLGELIVHLLKKKGIDHIDWVQEGEDAYDYAMAEFY...,223,Thermophilic,71.946,estructuras/test/A0A0K2H9T2.pdb.gz,A0A0K2H9T2_0
3,A0A0K2HC89,MRKIVIVGGVAGGATAAARLRRLSEADHIVLFERGEYISFANCGLP...,547,Thermophilic,72.421,estructuras/test/A0A0K2HC89.pdb.gz,A0A0K2HC89_0
4,A0A0K2HCU8,MTVGKVYLVGAGPGDEKLITVYGRECLERADVIIYDRLINRKLLRY...,256,Thermophilic,74.424,estructuras/test/A0A0K2HCU8.pdb.gz,A0A0K2HCU8_0


## Val

In [10]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Protein_ID            522 non-null    object
 1   Sequence              522 non-null    object
 2   Sequence_Length       522 non-null    object
 3   Type                  522 non-null    object
 4   Tm                    522 non-null    object
 5   Ambiguous_Aminoacids  522 non-null    object
 6   Organism_ID           0 non-null      object
 7   Origin                522 non-null    object
 8   PDB_ID                522 non-null    object
 9   Alpha_ID              522 non-null    object
 10  PDB_Path              522 non-null    object
dtypes: object(11)
memory usage: 45.0+ KB


In [11]:
df_val["Sequence_Length"] = pd.to_numeric(df_val["Sequence_Length"], errors="coerce")
df_val["Tm"] = pd.to_numeric(df_val["Tm"], errors="coerce")
df_val["Organism_ID"] = df_val["Organism_ID"].replace("-", pd.NA)

In [12]:
##quitar columnas sin información
df_val = df_val.drop(columns=[
    'Ambiguous_Aminoacids',
    'Organism_ID',
    'Origin',
    'PDB_ID',
    'Alpha_ID'
])

In [13]:
df_val["Unique_ID"] = df_val.groupby("Protein_ID").cumcount()
df_val["Unique_ID"] = df_val["Protein_ID"] + "_" + df_val["Unique_ID"].astype(str)

In [14]:
df_val.head()

Unnamed: 0,Protein_ID,Sequence,Sequence_Length,Type,Tm,PDB_Path,Unique_ID
0,A0A0K2H545,MAKRERKGGLSGRQKAAILLISLGPDVSASVYKHLSEEEIEKLTLE...,339,Thermophilic,70.523,estructuras/val/A0A0K2H545.pdb.gz,A0A0K2H545_0
1,P61671,MGNMLKGEGPGPLPPLLQQYVELRDRYPDYLLLFQVGDFYECFGED...,811,Thermophilic,76.259,estructuras/val/P61671.pdb.gz,P61671_0
2,Q72KI6,MERELAHWVERLRERAEAEGLSFPPVAFQEVGPEEMAMLAAYGGFP...,456,Thermophilic,76.489,estructuras/val/Q72KI6.pdb.gz,Q72KI6_0
3,Q746B7,MAWLNPFWSSPKAYVRPPKRGLAPELWASRIPFTHLENFYEVFRAL...,1042,Thermophilic,76.772,estructuras/val/Q746B7.pdb.gz,Q746B7_0
4,Q72IX8,MKAFWDYLFKEWFRQVGEALLVAFLVTTFVFTTVGVVGQSMYPTLR...,267,Thermophilic,77.617,estructuras/val/Q72IX8.pdb.gz,Q72IX8_0


In [15]:
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Protein_ID       522 non-null    object 
 1   Sequence         522 non-null    object 
 2   Sequence_Length  522 non-null    int64  
 3   Type             522 non-null    object 
 4   Tm               522 non-null    float64
 5   PDB_Path         522 non-null    object 
 6   Unique_ID        522 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 28.7+ KB


## Train

In [16]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30076 entries, 0 to 30075
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Protein_ID            30076 non-null  object
 1   Sequence              30076 non-null  object
 2   Sequence_Length       30076 non-null  object
 3   Type                  30076 non-null  object
 4   Tm                    30076 non-null  object
 5   Ambiguous_Aminoacids  30076 non-null  object
 6   Organism_ID           0 non-null      object
 7   Origin                30076 non-null  object
 8   PDB_ID                30076 non-null  object
 9   Alpha_ID              30076 non-null  object
 10  PDB_Path              30076 non-null  object
dtypes: object(11)
memory usage: 2.5+ MB


In [17]:
df_train["Sequence_Length"] = pd.to_numeric(df_train["Sequence_Length"], errors="coerce")
df_train["Tm"] = pd.to_numeric(df_train["Tm"], errors="coerce")
df_train["Organism_ID"] = df_train["Organism_ID"].replace("-", pd.NA)

In [18]:
df_train=df_train.drop(columns=[
    'Ambiguous_Aminoacids',
    'Organism_ID',
    'Origin',
    'PDB_ID',
    'Alpha_ID'
])


In [19]:
df_train["Unique_ID"] = df_train.groupby("Protein_ID").cumcount()
df_train["Unique_ID"] = df_train["Protein_ID"] + "_" + df_train["Unique_ID"].astype(str)

In [20]:
df_train.head()

Unnamed: 0,Protein_ID,Sequence,Sequence_Length,Type,Tm,PDB_Path,Unique_ID
0,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_0
1,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_1
2,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_2
3,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_3
4,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_4


#Exploración


¿Cuantas observaciones tienen estructura en formato pdb.gz que fue descargada de Alphafold?

In [21]:
def limpiar_paths(df, verbose=True):
    """
    Elimina las filas donde PDB_Path es exactamente '0' y muestra estadísticas.

    Parámetros:
    - df: DataFrame de entrada con la columna 'PDB_Path'.
    - verbose: Si True, imprime estadísticas y filas con PDB_Path == '0'.

    Retorna:
    - Un nuevo DataFrame sin las filas con PDB_Path igual a '0'.
    """
    total = len(df)
    paths_cero = (df['PDB_Path'] == '0').sum()
    porcentaje_cero = (paths_cero / total) * 100

    df_filtrado = df[df['PDB_Path'] != '0'].reset_index(drop=True)
    total_filtrado = len(df_filtrado)
    porcentaje_restante = (total_filtrado / total) * 100

    if verbose:
        print(f"Sin estructura: {paths_cero} ({porcentaje_cero:.2f}%) del total de {total}")
        print(f"Después de limpiar: {total_filtrado} filas ({porcentaje_restante:.2f}%) del total original")
        print("\n")
        print(df[df['PDB_Path'] == '0'])
        print(f"Después de limpiar: {total_filtrado} filas ({porcentaje_restante:.2f}%) del total original")

    return df_filtrado


In [22]:
print("Test\n")
df_test_limpio = limpiar_paths(df_test)


Test

Sin estructura: 8 (1.51%) del total de 531
Después de limpiar: 523 filas (98.49%) del total original


     Protein_ID                                           Sequence  \
248    O15151-5  MTSFSTSAQCSTSDSACRISPGQINQVRPKLPLLKILHAAGAQGEM...   
252    P52747-2  MLLAQINRDSQGMTEFPGGGMEAQHVTLCLTEAVTVADAKLIDGQV...   
324      B0V123  MSWDFMVPVCLDDLVKSGAVNQYVVQDVLSAKQLPSHINSFKAAMR...   
341      Q5IBM5  MMCAAAAAGAGGSGILSSSSHSMGLGVRVIPGAGNDFAPIGSGMGS...   
359      F1R6W8  MMRDKRSGSFRRDKTEKPAPISRALSWLSVSSLSQQTRKLFRSQNS...   
361  A0A0R4IGN4  MHHGGGPPNVQRNLQRSKSFTGSEAEDQQQQQSQQPQQQQQQPVPV...   
460      L8E6W9  MGNKSKEKRKARKRQLSNSKTNEDLEVIEVSLQKKRLLDDNFRIVQ...   
468      X5LVB3  MSCLRQSSQSFYFTQIRWKTVKFKTKIAKTKPLKTPSAQALDHFDF...   

     Sequence_Length              Type      Tm PDB_Path     Unique_ID  
248              440  Non-thermophilic  67.060        0    O15151-5_0  
252              607  Non-thermophilic  61.110        0    P52747-2_0  
324             1380  Non-thermophilic  47.5

In [23]:
print("Val \n")
df_val_limpio = limpiar_paths(df_val)

Val 

Sin estructura: 9 (1.72%) del total de 522
Después de limpiar: 513 filas (98.28%) del total original


     Protein_ID                                           Sequence  \
310      Q07878  MLESLAANLLNRLLGSYVENFDPNQLNVGIWSGDVKLKNLKLRKDC...   
311      Q12019  MSQDRILLDLDVVNQRLILFNSAFPSDAIEAPFHFSNKESTSENLD...   
341  A0A0R4IBK5  MKCPKCSHEALEKAPKFCSECGHKLQSQSYETTQGTPHDKSQTPSI...   
358      F6PC67  MSMWIPTEHEKYGVVLAGFRGTVQHGLPLEIGDTVQILEKCEGWYR...   
363      F1QJI9  MDALLNPELREDVKDCSIKVENFPKQVIFKGLAPRVVLTNHLLMKG...   
422      A5JYS1  MSQRRHFKMSVTAKNFYRRPLPETCIEFSSELGKKLFTEALVRGSA...   
434      Q93442  MSSAKDSDKDASRKMKLQKRPEWNNGGSLEDCISNVYALLELPGIK...   
441      G5EF28  MSYAFNLARPNRLVKIVNVKLNGDNPSLKFSSARSEIFVEGSFEVF...   
458      Q6BEV4  MIEDEKAGLQPMDISDASVFFFQADVESKDFLTSLFDCEGKSDDRM...   

     Sequence_Length              Type         Tm PDB_Path     Unique_ID  
310             3144  Non-thermophilic  60.391671        0      Q07878_0  
311             4910  Non-thermophilic  

In [24]:
print("Train")
df_train_limpio = limpiar_paths(df_train)

Train
Sin estructura: 1333 (4.43%) del total de 30076
Después de limpiar: 28743 filas (95.57%) del total original


      Protein_ID                                           Sequence  \
722       Q58AA9  MLRSVVNNNDKAKESEQSVRVATDDPTYQSYAFQPSTSSSNISWFG...   
809       Q58AA9  MLRSVVNNNDKAKESEQSVRVATDDPTYQSYAFQPSTSSSNISWFG...   
822       Q58AA9  MLRSVVNNNDKAKESEQSVRVATDDPTYQSYAFQPSTSSSNISWFG...   
868       Q58AA9  MLRSVVNNNDKAKESEQSVRVATDDPTYQSYAFQPSTSSSNISWFG...   
935       Q58AA9  MLRSVVNNNDKAKESEQSVRVATDDPTYQSYAFQPSTSSSNISWFG...   
...          ...                                                ...   
28211     Q5SHP7  MPKKVLTGVVVSDKMQKTVTVLVERQFPHPLYGKVIKRSKKYLAHD...   
28282     Q5SHP7  MPKKVLTGVVVSDKMQKTVTVLVERQFPHPLYGKVIKRSKKYLAHD...   
28293     Q5SHP7  MPKKVLTGVVVSDKMQKTVTVLVERQFPHPLYGKVIKRSKKYLAHD...   
28315     Q5SIA6  MSWKDAYPDIPLGRDACGIIAMAEKSGKPSHRVVRRTLESLYRMAH...   
28316     Q5SIA6  MSWKDAYPDIPLGRDACGIIAMAEKSGKPSHRVVRRTLESLYRMAH...   

       Sequence_Length         

#Shape Dataset final

In [25]:
print(f"Tes: {df_test_limpio.shape}")
print(f"Val: {df_val_limpio.shape}")
print(f"Train: {df_train_limpio.shape}")

Tes: (523, 7)
Val: (513, 7)
Train: (28743, 7)


#¿Cómo es la distribución del Splitting?
¿Cuantos datos hay en total?

In [26]:
# Contar cuántas filas hay en cada conjunto
n_train = len(df_train_limpio)
n_val = len(df_val_limpio)
n_test = len(df_test_limpio)

# Calcular total
total = n_train + n_val + n_test

# Calcular porcentajes
p_train = (n_train / total) * 100
p_val = (n_val / total) * 100
p_test = (n_test / total) * 100

# Mostrar resultados
print(f"Total de datos: {total}\n")
print(f"Train: {n_train} ({p_train:.2f}%)")
print(f"Validation: {n_val} ({p_val:.2f}%)")
print(f"Test: {n_test} ({p_test:.2f}%)")


Total de datos: 29779

Train: 28743 (96.52%)
Validation: 513 (1.72%)
Test: 523 (1.76%)


In [27]:
df_train_limpio.head()

Unnamed: 0,Protein_ID,Sequence,Sequence_Length,Type,Tm,PDB_Path,Unique_ID
0,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_0
1,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_1
2,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_2
3,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_3
4,P94532,MNAVTIVIASMCILAIAYRLYGTFMMVKVLKVNDDKPTPAHALEDG...,598,Non-thermophilic,31.257,estructuras/train/P94532.pdb.gz,P94532_4


In [29]:
# Crear la carpeta si no existe
import os
output_dir = "preprocessed_data"
os.makedirs(output_dir, exist_ok=True)

# Guardar cada DataFrame como .txt con formato CSV dentro de la carpeta
df_train_limpio.to_csv(os.path.join(output_dir, "train_preprocessed.txt"), index=False)
df_val_limpio.to_csv(os.path.join(output_dir, "val_preprocessed.txt"), index=False)
df_test_limpio.to_csv(os.path.join(output_dir, "test_preprocessed.txt"), index=False)

In [31]:
os.system(f"tar -czvf {output_dir}.tar.gz {output_dir}")

0