In [None]:
# Configuração inicial e pastas

import os
import pandas as pd

# Folders
# NOTE(review): absolute, machine-specific path — the notebook only runs as-is on this machine.
RAW_FOLDER = r"C:\Users\andr3\Documents\DATA SCIENCE\10794 - Programação avançada com Python\Age Assessment & Disease Risk Prediction\src"
# Output folder named "clean", created next to RAW_FOLDER (a sibling of it, not a child).
# NOTE(review): later cells in this notebook rebuild CLEAN_FOLDER as <src>/clean instead —
# confirm which of the two locations is intended.
CLEAN_FOLDER = os.path.join(os.path.dirname(RAW_FOLDER), "clean")
os.makedirs(CLEAN_FOLDER, exist_ok=True)
# Parameters
CHUNK_SIZE = 50000  # rows per Parquet file (process_traindata slices the merged long-format frame by this)

In [None]:
# Load the sample metadata and take a first look at it

# sample_id is normalised to str so it can be used as a join key against
# the column names of the training matrix.
trainmap_df = (
    pd.read_csv(os.path.join(RAW_FOLDER, "trainmap.csv"))
    .assign(sample_id=lambda frame: frame["sample_id"].astype(str))
)

print("Colunas do trainmap:", trainmap_df.columns)
trainmap_df.head()



In [None]:
# Função de preprocessing (traindata wide → long + merge)

def process_traindata(file_path, csv_chunksize=1000):
    """Convert the wide training CSV to long format and write it as Parquet parts.

    The CSV is streamed ``csv_chunksize`` rows at a time. Each piece is melted
    (``cpgsite`` is kept as the identifier; every other column becomes one
    ``sample_id``/``value`` row), left-joined with the module-level
    ``trainmap_df`` on ``sample_id`` and written into ``CLEAN_FOLDER`` in
    slices of at most ``CHUNK_SIZE`` rows.

    Args:
        file_path: Path of the wide CSV; it must contain a ``cpgsite`` column.
        csv_chunksize: Number of CSV rows read per iteration. Defaults to
            1000, the value that used to be hard-coded.

    Returns:
        list[str]: Paths of the Parquet files written, in creation order.
    """
    parquet_files = []
    for i, chunk in enumerate(pd.read_csv(file_path, chunksize=csv_chunksize)):
        # Wide -> long: one row per (cpgsite, sample_id) pair.
        df_long = chunk.melt(id_vars="cpgsite", var_name="sample_id", value_name="value")
        df_long["sample_id"] = df_long["sample_id"].astype(str)

        # Attach the metadata; a left join keeps samples that have no trainmap row.
        df_merged = df_long.merge(trainmap_df, on="sample_id", how="left")

        # Stepping by CHUNK_SIZE never yields an empty slice, so no
        # "skip empty chunk" guard is needed; j keeps the same numbering.
        for j, start in enumerate(range(0, len(df_merged), CHUNK_SIZE)):
            file_name = os.path.join(CLEAN_FOLDER, f"traindata_part_{i:04d}_{j:04d}.parquet")
            df_merged.iloc[start:start + CHUNK_SIZE].to_parquet(file_name, index=False)
            parquet_files.append(file_name)
    return parquet_files


In [None]:
# Run the preprocessing: convert traindata.csv (read from RAW_FOLDER) into Parquet parts

traindata_path = os.path.join(RAW_FOLDER, "traindata.csv")
# process_traindata returns the list of Parquet paths it wrote.
parquet_files = process_traindata(traindata_path)
print(f"Parquet files gerados: {len(parquet_files)}")


In [None]:
# Leitura de todos os Parquet chunks para análise

import glob

# Only pick up the parts written by process_traindata (traindata_part_*),
# not every Parquet file that happens to live in the folder. glob returns
# paths in arbitrary order, so sort them to make the row order reproducible.
all_parquets = sorted(glob.glob(os.path.join(CLEAN_FOLDER, "traindata_part_*.parquet")))
if not all_parquets:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"Nenhum arquivo Parquet encontrado em {CLEAN_FOLDER}")

# NOTE(review): this loads every part into memory at once — confirm the full
# long-format table actually fits in RAM before running on the whole dataset.
df = pd.concat([pd.read_parquet(f) for f in all_parquets], ignore_index=True)
print(df.shape)
df.head()


In [None]:
# PCA + Clustering
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import MiniBatchKMeans

# ===== CONFIGURATION =====
SRC_DIR = r"C:\Users\andr3\Documents\DATA SCIENCE\10794 - Programação avançada com Python\Age Assessment & Disease Risk Prediction\src"
CLEAN_FOLDER = os.path.join(SRC_DIR, "clean")
os.makedirs(CLEAN_FOLDER, exist_ok=True)

TRAIN_FILE = os.path.join(SRC_DIR, "traindata.csv")
TRAINMAP_FILE = os.path.join(SRC_DIR, "trainmap.csv")
OUTPUT_FILE = os.path.join(CLEAN_FOLDER, "train_processed.parquet")

# ===== LOAD TRAINMAP =====
# NOTE(review): the real sample_id column of trainmap.csv is not read here; the
# join key is replaced by the row position. process_traindata (earlier in this
# notebook) melts the same CSV with id_vars="cpgsite" and treats the "train*"
# columns as sample ids. If that layout holds, each CSV row below is a CpG site
# rather than a sample, and the positional merge at the end pairs unrelated
# rows. Confirm the orientation of traindata.csv before trusting the metadata.
trainmap_df = pd.read_csv(TRAINMAP_FILE, usecols=["age", "gender", "sample_type", "disease"])
trainmap_df["sample_id"] = trainmap_df.index.astype(str)

# ===== PARAMETERS =====
CHUNK_SIZE = 10000  # CSV rows per chunk; kept small to limit RAM usage
N_COMPONENTS = 2
N_CLUSTERS = 4

# ===== Initialise PCA and KMeans =====
pca = IncrementalPCA(n_components=N_COMPONENTS)
kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=42, batch_size=500)

# ===== FIRST PASS: fit PCA and KMeans incrementally =====
# Both models are fitted on the same raw "train*" columns. KMeans is trained
# here (not in the second pass) so that every row is later labelled with the
# final centroids; predicting while still training labels early chunks against
# different centroids than late chunks.
print("Treinando PCA incrementalmente...")
for chunk in pd.read_csv(TRAIN_FILE, chunksize=CHUNK_SIZE, usecols=lambda c: c.startswith("train")):
    chunk = chunk.fillna(0).astype("float32")
    pca.partial_fit(chunk)
    kmeans.partial_fit(chunk)
print("✅ PCA treinado!")

# ===== SECOND PASS: project and label every row with the fitted models =====
print("Aplicando PCA e KMeans em blocos...")
results = []
sample_counter = 0

for chunk in pd.read_csv(TRAIN_FILE, chunksize=CHUNK_SIZE, usecols=lambda c: c.startswith("train")):
    chunk_features = chunk.fillna(0).astype("float32")
    n_rows = len(chunk_features)

    pca_result = pca.transform(chunk_features)

    # Keep only the small per-row summary; the wide chunk is dropped when the
    # loop variable is rebound on the next iteration.
    results.append(pd.DataFrame({
        # running row number across chunks, stored as str to match trainmap_df
        "sample_id": np.arange(sample_counter, sample_counter + n_rows).astype(str),
        "PC1": pca_result[:, 0],
        "PC2": pca_result[:, 1],
        "cluster": kmeans.predict(chunk_features),
    }))
    sample_counter += n_rows

# ===== CONCATENATE AND MERGE =====
final_df = pd.concat(results, ignore_index=True)
final_df = final_df.merge(trainmap_df, on="sample_id", how="left")

# ===== SAVE PARQUET =====
final_df.to_parquet(OUTPUT_FILE, index=False)
print("✅ Arquivo processado salvo em:", OUTPUT_FILE)
print(final_df.head())

In [None]:
# Estrutura sugerida para modelo supervisionado

import os
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

# ===== PATH CONFIGURATION =====
SRC_DIR = r"C:\Users\andr3\Documents\DATA SCIENCE\10794 - Programação avançada com Python\Age Assessment & Disease Risk Prediction\src"
CLEAN_FOLDER = os.path.join(SRC_DIR, "clean")
PARQUET_FILE = os.path.join(CLEAN_FOLDER, "train_processed.parquet")
MODEL_PATH = os.path.join(CLEAN_FOLDER, "model_sgd.pkl")
OUTPUT_PARQUET = os.path.join(CLEAN_FOLDER, "train_with_predictions.parquet")

# ===== LOAD DATA =====
df = pd.read_parquet(PARQUET_FILE)

# ===== KEEP VALID RECORDS ONLY =====
# .copy() so the column assignments below write to an independent frame.
df = df.dropna(subset=["disease", "PC1", "PC2"]).copy()
X = df[["PC1", "PC2"]].astype("float32")

# ===== ENCODE TARGET =====
# Labels are encoded from their string form, so le.classes_ is always str.
le = LabelEncoder()
df["target"] = le.fit_transform(df["disease"].astype(str))
y = df["target"].astype("int32")
classes = np.unique(y)

# ===== INITIALISE INCREMENTAL MODEL =====
model = SGDClassifier(loss="log_loss", random_state=42)

# ===== MINI-BATCH TRAINING =====
# A single pass over the rows in file order; partial_fit needs the full class
# list on every call because a batch may not contain every class.
BATCH_SIZE = 1000
for i in range(0, len(X), BATCH_SIZE):
    X_batch = X.iloc[i:i + BATCH_SIZE]
    y_batch = y.iloc[i:i + BATCH_SIZE]
    model.partial_fit(X_batch, y_batch, classes=classes)

# ===== SAVE TRAINED MODEL =====
joblib.dump(model, MODEL_PATH)
print(f"✅ Modelo supervisionado salvo em: {MODEL_PATH}")

# ===== PREDICTIONS =====
# predict returns one label per row of X, and X has the same rows as df.
y_pred = model.predict(X)

# ===== DECODE PREDICTIONS =====
df["predicted_disease"] = le.inverse_transform(y_pred)

# ===== MODEL EVALUATION =====
# The decoded predictions are strings, so the true labels are compared in their
# string form too: a numeric "disease" column would otherwise mix label types
# and make the sklearn metrics fail.
# NOTE: these metrics are computed on the rows the model was trained on.
df_eval = df.assign(disease=df["disease"].astype(str))
print("\n📊 Relatório de Classificação:")
print(classification_report(df_eval["disease"], df_eval["predicted_disease"]))

# ===== CONFUSION MATRIX =====
cm = confusion_matrix(df_eval["disease"], df_eval["predicted_disease"], labels=le.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_, cmap="Blues")
plt.title("Matriz de Confusão - Previsão de Doença")
plt.xlabel("Predito")
plt.ylabel("Real")
plt.tight_layout()
plt.show()

# ===== EXPORT PARQUET WITH PREDICTIONS =====
df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"✅ Dados com previsões salvos em: {OUTPUT_PARQUET}")

In [None]:
# Código com RandomForestClassifier
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

# ===== CONFIGURATION =====
SRC_DIR = r"C:\Users\andr3\Documents\DATA SCIENCE\10794 - Programação avançada com Python\Age Assessment & Disease Risk Prediction\src"
CLEAN_FOLDER = os.path.join(SRC_DIR, "clean")
PARQUET_FILE = os.path.join(CLEAN_FOLDER, "train_processed.parquet")
MODEL_PATH = os.path.join(CLEAN_FOLDER, "model_rf.pkl")
OUTPUT_PARQUET = os.path.join(CLEAN_FOLDER, "train_rf_predictions.parquet")

# ===== LOAD DATA =====
df = pd.read_parquet(PARQUET_FILE)
# .copy() so the column assignments below write to an independent frame.
df = df.dropna(subset=["disease", "PC1", "PC2"]).copy()

# ===== FEATURES AND TARGET =====
X = df[["PC1", "PC2"]]
# Labels are encoded from their string form, so le.classes_ is always str.
le = LabelEncoder()
df["target"] = le.fit_transform(df["disease"].astype(str))
y = df["target"]

# ===== TRAIN MODEL =====
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# ===== SAVE MODEL =====
joblib.dump(model, MODEL_PATH)
print(f"✅ Modelo Random Forest salvo em: {MODEL_PATH}")

# ===== PREDICTIONS =====
y_pred = model.predict(X)
df["predicted_disease"] = le.inverse_transform(y_pred)

# ===== EVALUATION =====
# The decoded predictions are strings, so the true labels are compared in their
# string form too: a numeric "disease" column would otherwise mix label types
# and make the sklearn metrics fail.
# NOTE: these metrics are computed on the rows the model was trained on.
y_true = df["disease"].astype(str)
print("\n📊 Relatório de Classificação:")
print(classification_report(y_true, df["predicted_disease"]))

cm = confusion_matrix(y_true, df["predicted_disease"], labels=le.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_, cmap="Greens")
plt.title("Matriz de Confusão - Random Forest")
plt.xlabel("Predito")
plt.ylabel("Real")
plt.tight_layout()
plt.show()

# ===== EXPORT PARQUET WITH PREDICTIONS =====
df.to_parquet(OUTPUT_PARQUET, index=False)
print(f"✅ Dados com previsões salvos em: {OUTPUT_PARQUET}")

In [None]:
# Comparativo em Código: SGD vs Random Forest
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# ===== LOAD PREDICTIONS =====
SRC_DIR = r"C:\Users\andr3\Documents\DATA SCIENCE\10794 - Programação avançada com Python\Age Assessment & Disease Risk Prediction\src"
CLEAN_FOLDER = os.path.join(SRC_DIR, "clean")

df_sgd = pd.read_parquet(os.path.join(CLEAN_FOLDER, "train_with_predictions.parquet"))
df_rf = pd.read_parquet(os.path.join(CLEAN_FOLDER, "train_rf_predictions.parquet"))

# True labels are compared in string form because "predicted_disease" was
# produced by decoding a LabelEncoder fitted on str(disease).
y_true_sgd = df_sgd["disease"].astype(str)
y_true_rf = df_rf["disease"].astype(str)

# ===== CLASSIFICATION REPORTS =====
print("📊 SGDClassifier:")
print(classification_report(y_true_sgd, df_sgd["predicted_disease"]))

print("\n📊 RandomForestClassifier:")
print(classification_report(y_true_rf, df_rf["predicted_disease"]))

# ===== CONFUSION MATRICES SIDE BY SIDE =====
# One sorted label list shared by both matrices, so that row/column k means the
# same class in both panels, and used as tick labels so the axes are readable.
class_labels = sorted(set(y_true_sgd) | set(y_true_rf))

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

cm_sgd = confusion_matrix(y_true_sgd, df_sgd["predicted_disease"], labels=class_labels)
sns.heatmap(cm_sgd, annot=True, fmt="d", cmap="Blues", ax=axes[0],
            xticklabels=class_labels, yticklabels=class_labels)
axes[0].set_title("SGDClassifier")
axes[0].set_xlabel("Predito")
axes[0].set_ylabel("Real")

cm_rf = confusion_matrix(y_true_rf, df_rf["predicted_disease"], labels=class_labels)
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Greens", ax=axes[1],
            xticklabels=class_labels, yticklabels=class_labels)
axes[1].set_title("RandomForestClassifier")
axes[1].set_xlabel("Predito")
axes[1].set_ylabel("Real")

plt.tight_layout()
plt.show()

In [None]:
# DASHBOARD DE CLUSTERS - AVALIAÇÃO BIOLÓGICA

import os
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# ===== CONFIGURATION =====
SRC_DIR = r"C:\Users\andr3\Documents\DATA SCIENCE\10794 - Programação avançada com Python\Age Assessment & Disease Risk Prediction\src"
CLEAN_FOLDER = os.path.join(SRC_DIR, "clean")
PARQUET_FILE = os.path.join(CLEAN_FOLDER, "train_processed.parquet")

# ===== LOAD DATA =====
if not os.path.exists(PARQUET_FILE):
    raise FileNotFoundError(f"Nenhum arquivo encontrado em {PARQUET_FILE}")

df = pd.read_parquet(PARQUET_FILE)

# ===== APPLY FILTERS MANUALLY =====
# Age range is inclusive on both ends (Series.between default). The three
# category filters default to "every non-null value", which means rows with a
# missing gender, sample_type or disease are excluded by the isin() tests.
idade = (20, 60)
genero = df["gender"].dropna().unique().tolist()
tipo = df["sample_type"].dropna().unique().tolist()
doenca = df["disease"].dropna().unique().tolist()

df_filtrado = df[
    (df["age"].between(idade[0], idade[1])) &
    (df["gender"].isin(genero)) &
    (df["sample_type"].isin(tipo)) &
    (df["disease"].isin(doenca))
]

# ===== INTERACTIVE CHARTS =====
# Plotly Express maps a numeric colour column to a continuous colour bar.
# Cluster ids are categories, so the scatter plots use a string copy of the
# column, with the legend ordered by the numeric cluster id.
df_plot = df_filtrado.assign(cluster=df_filtrado["cluster"].astype(str))
cluster_order = {"cluster": [str(c) for c in sorted(df_filtrado["cluster"].unique())]}

px.scatter(df_plot, x="PC1", y="PC2", color="cluster",
           category_orders=cluster_order,
           hover_data=["sample_id", "age", "gender", "sample_type", "disease"],
           title="Clusters no Espaço PCA").show()

px.scatter(df_plot, x="PC1", y="PC2", color="gender", symbol="cluster",
           category_orders=cluster_order,
           hover_data=["sample_id", "age", "sample_type", "disease"],
           title="Distribuição PCA por Gênero").show()

# Number of rows per (cluster, disease) pair, shown as stacked bars.
doenca_cluster = df_filtrado.groupby(["cluster", "disease"]).size().reset_index(name="n")
px.bar(doenca_cluster, x="cluster", y="n", color="disease",
       title="Distribuição de Doenças por Cluster", barmode="stack").show()

px.box(df_filtrado, x="cluster", y="age", color="cluster",
       title="Distribuição de Idade por Cluster").show()

# ===== CORRELATION HEATMAP =====
# Categorical columns are replaced by their integer category codes so that
# .corr() can include them; the code order carries no meaning of its own.
df_corr = df_filtrado.copy()
for col in ["gender", "sample_type", "disease"]:
    df_corr[col] = df_corr[col].astype("category").cat.codes

corr_matrix = df_corr[["age", "gender", "sample_type", "disease", "cluster"]].corr()
plt.figure(figsize=(6, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlação entre Variáveis")
plt.show()

# ===== COUNT PER CLUSTER =====
# The trailing rename covers pandas versions whose reset_index() names the
# former index column "index" instead of "cluster".
count_df = df_filtrado["cluster"].value_counts().rename("n_amostras").reset_index().rename(columns={"index": "cluster"})
print("📊 Contagem por Cluster")
print(count_df)