In [22]:
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
def read_rename(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)

    df = df.rename(columns={
        'ID_Proteína': 'protein_id',
        'Sequência': 'sequence',
        'Massa_Molecular': 'molecular_weight',
        'Ponto_Isoelétrico': 'isoelectric_point',
        'Hidrofobicidade': 'hydrophobicity',
        'Carga_Total': 'total_charge',
        'Proporção_Polar': 'polar_ratio',
        'Proporção_Apolar': 'nonpolar_ratio',
        'Comprimento_Sequência': 'sequence_length',
        'Classe': 'class'
    })

    return df

In [24]:
df = read_rename('proteinas_20000_enriquecido.csv')
test = read_rename('proteinas_test.csv')
train = read_rename('proteinas_train.csv')

In [25]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical

class ProteinPreprocessor:
    def __init__(self, max_len=None):
        self.tokenizer = Tokenizer(char_level=True)
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.max_len = max_len  # Will be set during fit if not provided

        self.numeric_targets = [
            'molecular_weight', 'isoelectric_point', 'hydrophobicity',
            'total_charge', 'polar_ratio', 'nonpolar_ratio', 'sequence_length'
        ]
    
    def fit(self, df):
        # Fit tokenizer on sequence
        self.tokenizer.fit_on_texts(df['sequence'])
        
        # Convert sequences and get max length
        seqs = self.tokenizer.texts_to_sequences(df['sequence'])
        self.max_len = self.max_len or max(len(s) for s in seqs)

        # Fit scaler on numeric targets
        self.scaler.fit(df[self.numeric_targets])

        # Fit label encoder
        self.label_encoder.fit(df['class'])

    def transform(self, df):
        # Tokenize and pad sequences
        seqs = self.tokenizer.texts_to_sequences(df['sequence'])
        X = pad_sequences(seqs, maxlen=self.max_len, padding='post')
        
        # Normalize numeric targets
        y_numeric = self.scaler.transform(df[self.numeric_targets])
        
        # Encode class labels
        y_class = to_categorical(self.label_encoder.transform(df['class']))

        return X, y_numeric, y_class
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)


In [26]:
preprocessor = ProteinPreprocessor()

X, y_numeric, y_class = preprocessor.fit_transform(df)

In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPool1D, Dropout

# Assume preprocessor has already been fitted
VOCAB_SIZE = len(preprocessor.tokenizer.word_index) + 1
MAX_LEN = preprocessor.max_len
NUMERIC_OUTPUT_DIM = len(preprocessor.numeric_targets)
CLASS_OUTPUT_DIM = len(preprocessor.label_encoder.classes_)

# Define model
input_seq = Input(shape=(MAX_LEN,), name='sequence_input')
x = Embedding(input_dim=VOCAB_SIZE, output_dim=64)(input_seq)
x = LSTM(64, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)

# Output heads
output_numeric = Dense(NUMERIC_OUTPUT_DIM, name='numeric_output')(x)
output_class = Dense(CLASS_OUTPUT_DIM, activation='softmax', name='class_output')(x)

model = Model(inputs=input_seq, outputs=[output_numeric, output_class])

# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'numeric_output': 'mse',
        'class_output': 'categorical_crossentropy'
    },
    metrics={
        'numeric_output': 'mae',
        'class_output': 'accuracy'
    }
)

model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 sequence_input (InputLayer  [(None, 300)]                0         []                            
 )                                                                                                
                                                                                                  
 embedding_1 (Embedding)     (None, 300, 64)              1344      ['sequence_input[0][0]']      
                                                                                                  
 lstm_1 (LSTM)               (None, 300, 64)              33024     ['embedding_1[0][0]']         
                                                                                                  
 global_max_pooling1d_1 (Gl  (None, 64)                   0         ['lstm_1[0][0]']        

In [28]:
# X_train, X_val, y_num_train, y_num_val, y_class_train, y_class_val = train_test_split(
#     X, y_numeric, y_class, test_size=0.2, random_state=42)

# history = model.fit(
#     X_train, {'numeric_output': y_num_train, 'class_output': y_class_train},
#     validation_data=(X_val, {'numeric_output': y_num_val, 'class_output': y_class_val}),
#     epochs=30,
#     batch_size=32
# )

# model.save("protein_model.h5")

In [29]:
from tensorflow.keras.models import load_model

model = load_model('protein_model.h5')

In [30]:
X_test, y_numeric_test, y_class_test = preprocessor.fit_transform(test)

In [31]:
results = model.evaluate(
    X_test, 
    {'numeric_output': y_numeric_test, 'class_output': y_class_test},
    batch_size=32
)

print("\n=== Evaluation Results ===")
for name, value in zip(model.metrics_names, results):
    print(f"{name}: {value:.4f}")



=== Evaluation Results ===
loss: 2.8263
numeric_output_loss: 1.2154
class_output_loss: 1.6109
numeric_output_mae: 0.7226
class_output_accuracy: 0.2030


In [33]:
from sklearn.metrics import classification_report

# Predict class probabilities
_, pred_class = model.predict(X_test)
pred_class_labels = pred_class.argmax(axis=1)
true_class_labels = y_class_test.argmax(axis=1)

# Decode labels (optional)
decoded_preds = preprocessor.label_encoder.inverse_transform(pred_class_labels)
decoded_trues = preprocessor.label_encoder.inverse_transform(true_class_labels)

# Classification metrics
print("=== Classification Metrics (per class) ===")
print(classification_report(decoded_trues, decoded_preds))

=== Classification Metrics (per class) ===
              precision    recall  f1-score   support

      Enzima       0.20      0.60      0.30       792
  Estrutural       0.19      0.14      0.16       863
      Outras       0.22      0.24      0.23       765
   Receptora       0.19      0.02      0.03       774
  Transporte       0.17      0.02      0.04       806

    accuracy                           0.20      4000
   macro avg       0.19      0.20      0.15      4000
weighted avg       0.19      0.20      0.15      4000



In [36]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

# Inverse transform both predictions and ground truth
y_numeric_test_orig = preprocessor.scaler.inverse_transform(y_numeric_test)
pred_numeric_orig = preprocessor.scaler.inverse_transform(pred_numeric)

# Evaluate per feature
numeric_results = {}

for i, feature in enumerate(preprocessor.numeric_targets):
    mae = mean_absolute_error(y_numeric_test_orig[:, i], pred_numeric_orig[:, i])
    mse = mean_squared_error(y_numeric_test_orig[:, i], pred_numeric_orig[:, i])
    r2 = r2_score(y_numeric_test_orig[:, i], pred_numeric_orig[:, i])
    
    numeric_results[feature] = {
        'MAE': mae,
        'MSE': mse,
        'R2': r2
    }

# Display nicely
numeric_df = pd.DataFrame(numeric_results).T
print("\n=== Numeric Regression Metrics (per feature) ===")
print(numeric_df.round(4))



=== Numeric Regression Metrics (per feature) ===
                        MAE          MSE      R2
molecular_weight   492.5705  379865.2230  0.9949
isoelectric_point    1.5583       3.7096 -0.7604
hydrophobicity       0.0333       0.0019 -1.6088
total_charge         6.3677      65.9421 -0.1368
polar_ratio          0.0211       0.0007  0.3689
nonpolar_ratio       0.0493       0.0041 -1.3626
sequence_length      3.0515      14.9126  0.9972


In [35]:
test.describe()

Unnamed: 0,molecular_weight,isoelectric_point,hydrophobicity,total_charge,polar_ratio,nonpolar_ratio,sequence_length
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,20831.685609,7.219644,0.149125,8.592,0.199522,0.39936,174.987
std,8636.43534,1.451794,0.026778,7.617056,0.034355,0.041419,72.576978
min,5525.4957,4.050028,0.018182,-18.0,0.0875,0.245902,50.0
25%,13432.387125,6.031947,0.132743,3.0,0.177419,0.372727,112.0
50%,20773.42605,7.080769,0.149239,8.0,0.199662,0.4,174.0
75%,28258.8636,8.564367,0.166243,13.0,0.220779,0.424561,238.0
max,36511.2899,11.255745,0.285714,39.0,0.35,0.607143,300.0
