In [None]:
# Instalação de Bibliotecas
pip install pandas numpy matplotlib scikit-learn tensorflow keras seaborn

In [17]:
# Importação de Bibliotecas
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neural_network import MLPClassifier

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, SpatialDropout1D, SimpleRNN, Embedding
from keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [18]:
# Carregar os dados
df = pd.read_csv("C:/Users/ana_v/OneDrive/Documentos/Repositórios/TCGA.csv", low_memory=False)

# Obtém os valores únicos na coluna 'Type'
unique_types = df['Type'].unique()

# Cria um dicionário mapeando cada tipo único para um número
type_to_numeric = {type_name: index for index, type_name in enumerate(unique_types)}

# Aplica a substituição usando o método map
df['Type'] = df['Type'].map(type_to_numeric)

# Armazena a coluna 'Type' para adicioná-la de volta posteriormente
type_column = df['Type']

# Prepara o DataFrame para normalização (remover colunas desnecessárias)
df_num = df.drop(columns=["Sample", "Type"])

# Normalizar os dados
scaler = StandardScaler()
dados_normalizados = scaler.fit_transform(df_num)

# Aplicação do PCA
pca = PCA(n_components=0.8)  
pca.fit(dados_normalizados)
dados_pca = pca.transform(dados_normalizados)

In [19]:
# Fixar a seed para garantir a reprodutibilidade
seed = 1

# Divisão de treino e teste com random_state
X_train, X_test, y_train, y_test = train_test_split(dados_pca, df['Type'], test_size=0.2, random_state=seed)

In [20]:
# Regressão Logística
lr_model = LogisticRegression(max_iter=6500) 
lr_model.fit(X_train, y_train) 
lr_predictions_train = lr_model.predict(X_train)  
lr_predictions_test = lr_model.predict(X_test) 

# Avaliação Regressão Logística
lr_accuracy_train = accuracy_score(y_train, lr_predictions_train) 
lr_accuracy_test = accuracy_score(y_test, lr_predictions_test)  
lr_report = classification_report(y_test, lr_predictions_test, zero_division=1) 

# Criar um objeto de validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Substituir a divisão de treino e teste pelo código de validação cruzada
scores = cross_val_score(lr_model, dados_pca, df['Type'], cv=cv, scoring='accuracy')

# Imprimir os resultados
print(f'Regressão Logística - Acurácia (Teste): {lr_accuracy_test}')
print(f'Acurácia média na validação cruzada: {scores.mean()}')
print(f'Classification Report:\n{lr_report}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, lr_predictions_test))


Regressão Logística - Acurácia (Teste): 0.7165775401069518
Acurácia média na validação cruzada: 0.7346945778997942
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        90
           1       0.55      0.66      0.60        41
           2       0.80      0.77      0.79        31
           3       0.78      0.33      0.47        21
           4       0.18      0.50      0.27         4

    accuracy                           0.72       187
   macro avg       0.63      0.62      0.59       187
weighted avg       0.75      0.72      0.72       187

Matriz de Confusão:
 [[74  9  3  1  3]
 [ 9 27  2  1  2]
 [ 2  4 24  0  1]
 [ 1  9  1  7  3]
 [ 2  0  0  0  2]]


In [21]:
# SVM
svm_model = SVC()
svm_model.fit(X_train, y_train)  
svm_predictions_train = svm_model.predict(X_train)  
svm_predictions_test = svm_model.predict(X_test) 

# Avaliação SVM
svm_accuracy_train = accuracy_score(y_train, svm_predictions_train)  
svm_accuracy_test = accuracy_score(y_test, svm_predictions_test)  
svm_report = classification_report(y_test, svm_predictions_test, zero_division=1)  

# Criar um objeto de validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Substituir a divisão de treino e teste pelo código de validação cruzada
scores_svm = cross_val_score(svm_model, dados_pca, df['Type'], cv=cv, 
                         scoring='accuracy')

# Imprimir os resultados
print(f'Regressão Logística - Acurácia (Teste): {svm_accuracy_test}')
print(f'Acurácia média na validação cruzada: {scores_svm.mean()}')
print(f'Classification Report:\n{lr_report}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, svm_predictions_test))


Regressão Logística - Acurácia (Teste): 0.7700534759358288
Acurácia média na validação cruzada: 0.7925303134294212
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        90
           1       0.55      0.66      0.60        41
           2       0.80      0.77      0.79        31
           3       0.78      0.33      0.47        21
           4       0.18      0.50      0.27         4

    accuracy                           0.72       187
   macro avg       0.63      0.62      0.59       187
weighted avg       0.75      0.72      0.72       187

Matriz de Confusão:
 [[88  2  0  0  0]
 [18 20  1  2  0]
 [ 2  0 29  0  0]
 [ 5  5  4  7  0]
 [ 4  0  0  0  0]]


In [22]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)  
rf_predictions_train = rf_model.predict(X_train)  
rf_predictions_test = rf_model.predict(X_test) 

# Avaliação Random Forest
rf_accuracy_train = accuracy_score(y_train, rf_predictions_train) 
rf_accuracy_test = accuracy_score(y_test, rf_predictions_test) 
rf_report = classification_report(y_test, rf_predictions_test, 
                                  zero_division=1) 

# Criar um objeto de validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Substituir a divisão de treino e teste pelo código de validação cruzada
scores_rf = cross_val_score(rf_model, dados_pca, df['Type'], cv=cv, scoring='accuracy')

# Imprimir os resultados
print(f'Regressão Logística - Acurácia (Teste): {rf_accuracy_test}')
print(f'Acurácia média na validação cruzada: {scores_rf.mean()}')
print(f'Classification Report:\n{rf_report}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, rf_predictions_test))


Regressão Logística - Acurácia (Teste): 0.7112299465240641
Acurácia média na validação cruzada: 0.7294326241134752
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.93      0.81        90
           1       0.61      0.49      0.54        41
           2       0.80      0.90      0.85        31
           3       1.00      0.05      0.09        21
           4       1.00      0.00      0.00         4

    accuracy                           0.71       187
   macro avg       0.82      0.47      0.46       187
weighted avg       0.74      0.71      0.66       187

Matriz de Confusão:
 [[84  6  0  0  0]
 [19 20  2  0  0]
 [ 2  1 28  0  0]
 [10  6  4  1  0]
 [ 3  0  1  0  0]]


In [24]:
# Criação e treinamento do MLP com 4 camadas ocultas
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 75, 50, 25), max_iter=500, random_state=seed)
mlp_model.fit(X_train, y_train)

# Predições no conjunto de treino e teste
mlp_predictions_train = mlp_model.predict(X_train)
mlp_predictions_test = mlp_model.predict(X_test)

# Avaliação MLP
mlp_accuracy_train = accuracy_score(y_train, mlp_predictions_train)
mlp_accuracy_test = accuracy_score(y_test, mlp_predictions_test)
mlp_report = classification_report(y_test, mlp_predictions_test, zero_division=1)

# Criar um objeto de validação cruzada
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Substituir a divisão de treino e teste pelo código de validação cruzada
mlp_scores = cross_val_score(mlp_model, dados_pca, df['Type'], cv=cv, scoring='accuracy')

# Exibindo resultados do MLP
print(f'MLP - Acurácia (Teste): {mlp_accuracy_test}')
print(f'Acurácia média na validação cruzada (MLP): {mlp_scores.mean()}')
print(f'Classification Report MLP:\n{mlp_report}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, mlp_predictions_test))


MLP - Acurácia (Teste): 0.7272727272727273
Acurácia média na validação cruzada (MLP): 0.7497712194005948
Classification Report MLP:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        90
           1       0.64      0.51      0.57        41
           2       0.75      0.77      0.76        31
           3       0.62      0.38      0.47        21
           4       0.50      0.75      0.60         4

    accuracy                           0.73       187
   macro avg       0.66      0.66      0.65       187
weighted avg       0.72      0.73      0.72       187

Matriz de Confusão:
 [[80  6  2  2  0]
 [14 21  2  3  1]
 [ 3  3 24  0  1]
 [ 5  3  4  8  1]
 [ 1  0  0  0  3]]


In [25]:
np.random.seed(seed)
tf.random.set_seed(seed)

unique_types = df['Type'].unique()
num_classes = len(unique_types)
target_names = [str(cls) for cls in unique_types]

X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Função para criar o modelo CNN
def create_cnn_model():
    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(2),
        Conv1D(64, 3, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',  
                  metrics=['accuracy'])
    return model

# Validação cruzada
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
cross_val_scores = []

for train_index, val_index in kfold.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    model = create_cnn_model()
    history = model.fit(X_fold_train, y_fold_train, epochs=25, batch_size=32, verbose=0, validation_data=(X_fold_val, y_fold_val))

    val_accuracy = history.history['val_accuracy'][-1]
    cross_val_scores.append(val_accuracy)

cross_val_mean = np.mean(cross_val_scores)
cross_val_std = np.std(cross_val_scores)

# Treinamento final da CNN
model = create_cnn_model()
history = model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Avaliação da CNN no conjunto de teste
cnn_loss, cnn_accuracy = model.evaluate(X_test, y_test)

# Exibindo resultados da CNN
train_accuracy = history.history['accuracy'][-1]
val_accuracy = history.history['val_accuracy'][-1]

# Predições da CNN no conjunto de teste
cnn_predictions = model.predict(X_test)
cnn_predictions_classes = np.argmax(cnn_predictions, axis=1)

# Classification report da CNN
cnn_report = classification_report(y_test, cnn_predictions_classes, target_names=target_names, zero_division=1, digits=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.4109 - loss: 1.8372 - val_accuracy: 0.4973 - val_loss: 1.4201
Epoch 2/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4887 - loss: 1.4032 - val_accuracy: 0.5294 - val_loss: 1.2765
Epoch 3/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4989 - loss: 1.2324 - val_accuracy: 0.5775 - val_loss: 1.1448
Epoch 4/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5202 - loss: 1.1884 - val_accuracy: 0.5989 - val_loss: 1.1004
Epoch 5/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5850 - loss: 0.9948 - val_accuracy: 0.6043 - val_loss: 1.0440
Epoch 6/25
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6229 - loss: 0.9728 - val_accuracy: 0.5829 - val_loss: 0.9659
Epoch 7/25
[1m24/24[0m [32m━━━━━━━━━

In [27]:
# Resultados
print(f'Acurácia média na validação cruzada (CNN): {cross_val_mean:.4f} ± {cross_val_std:.4f}')
print(f'Acurácia (Teste): {cnn_accuracy:.4f}')
print(f'Classification Report CNN:\n{cnn_report}')
print("Matriz de Confusão:\n", confusion_matrix(y_test, cnn_predictions_classes))

Acurácia média na validação cruzada (CNN): 0.7154 ± 0.0573
Acurácia (Teste): 0.6471
Classification Report CNN:
              precision    recall  f1-score   support

           0       0.63      0.96      0.76        90
           1       0.62      0.12      0.20        41
           2       0.68      0.81      0.74        31
           3       1.00      0.24      0.38        21
           4       1.00      0.00      0.00         4

    accuracy                           0.65       187
   macro avg       0.79      0.42      0.42       187
weighted avg       0.68      0.65      0.57       187

Matriz de Confusão:
 [[86  2  2  0  0]
 [35  5  1  0  0]
 [ 6  0 25  0  0]
 [ 8  1  7  5  0]
 [ 2  0  2  0  0]]
