In [40]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset

In [2]:
path = "/Users/M283455/VAE_prject/scripts/"
sys.path.insert(0, path)

In [3]:
import VAE_tybalt
from VAE_tybalt import VAE

In [4]:
clincal_file = "../../VAE_prject_data/raw/clinical_data.tsv"
clincal_df = pd.read_table(clincal_file)

In [5]:
# load the complete input file
gene_file = "../../VAE_prject_data/raw/pancan_scaled_zeroone_rnaseq.tsv.gz"
rnaseq_df_test = pd.read_table(gene_file, index_col=0)
print(rnaseq_df_test.shape)
rnaseq_df_test.head(2)
rnaseq_df_test = rnaseq_df_test.drop("TCGA-33-4579-01", axis=0)

(10459, 5000)


In [6]:
# Input used to obtain the embeddings
tcga_tybalt_file_location = "../../VAE_prject_data/raw/rnaseq_df_test.csv"
rnaseq_df = pd.read_csv(tcga_tybalt_file_location)
rnaseq_df.drop(columns=rnaseq_df.columns[0], axis=1, inplace=True)
rnaseq_df = rnaseq_df.dropna()

In [7]:
def find_matching_row_indices(df1, df2):
    # Concatenate the two DataFrames and reset their indices
    combined_df = pd.concat([df1, df2]).reset_index(drop=True)

    # Find the duplicated rows in the combined DataFrame and retrieve their indices
    duplicated_rows = combined_df.duplicated(keep=False)
    matching_indices = combined_df[duplicated_rows].index

    # Separate the indices for the two original DataFrames
    df1_indices = matching_indices[matching_indices < len(df1)]
    df2_indices = matching_indices[matching_indices >= len(df1)] - len(df1)

    return df1_indices, df2_indices


df1_matching_indices, df2_matching_indices = find_matching_row_indices(
    rnaseq_df, rnaseq_df_test
)

In [8]:
all_indices = list(rnaseq_df_test.index)

In [9]:
rnaseq_df_indexes = [all_indices[i] for i in df1_matching_indices]

In [10]:
len(rnaseq_df_indexes)

2092

In [11]:
len(df2_matching_indices)

2092

In [12]:
def find_rows_with_substring(df, column_name, substring_list):
    # Find the rows where the specified column contains any of the substrings
    matching_rows = df[
        df[column_name].apply(
            lambda x: any(substring in x for substring in substring_list)
            if isinstance(x, str)
            else False
        )
    ]

    # Update the column values to the values in the substring list
    matching_rows[column_name] = matching_rows[column_name].apply(
        lambda x: next((substring for substring in substring_list if substring in x), x)
    )

    return matching_rows


column_name = "portion_id"
substring_list = rnaseq_df_indexes

result = find_rows_with_substring(clincal_df, column_name, substring_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_rows[column_name] = matching_rows[column_name].apply(


In [13]:
def keep_first_duplicate(df, column_name):
    unique_rows_df = df.drop_duplicates(subset=column_name, keep="first")
    return unique_rows_df


rnaseq_unique_rows_df = keep_first_duplicate(result, column_name)

In [14]:
rnaseq_unique_rows_df.head(6)

Unnamed: 0,sample_id,ethnicity,year_of_diagnosis,gender,race,organ,percent_tumor_nuclei,age_at_diagnosis,sample_type,analysis_center,vital_status,acronym,portion_id,disease,drug,platform,stage
5,TCGA-2G-AAKD,not hispanic or latino,,male,white,Testis,60.0,18.0,Primary Tumor,UNC,alive,TGCT,TCGA-2G-AAKD-01,Testicular Germ Cell Tumors,Cisplatin,Illumina HiSeq,Stage III
8,TCGA-A2-A0CL,not hispanic or latino,2006.0,female,black or african american,Breast,60.0,37.0,Primary Tumor,UNC,alive,BRCA,TCGA-A2-A0CL-01,Breast invasive carcinoma,Taxol,Illumina HiSeq,Stage IIIA
16,TCGA-A2-A0D2,not hispanic or latino,2008.0,female,white,Breast,80.0,45.0,Primary Tumor,UNC,alive,BRCA,TCGA-A2-A0D2-01,Breast invasive carcinoma,Adriamycin,Illumina HiSeq,Stage IIA
24,TCGA-78-7542,not reported,1993.0,male,white,Lung,80.0,56.0,Primary Tumor,UNC,dead,LUAD,TCGA-78-7542-01,Lung adenocarcinoma,,Illumina HiSeq,Stage IB
29,TCGA-56-7823,not hispanic or latino,2011.0,female,white,Lung,,,Solid Tissue Normal,UNC,alive,LUSC,TCGA-56-7823-11,Lung squamous cell carcinoma,,Illumina HiSeq,Stage IIA
35,TCGA-31-1959,not hispanic or latino,2009.0,female,white,Ovary,75.0,49.0,Primary Tumor,BCGSC,alive,OV,TCGA-31-1959-01,Ovarian serous cystadenocarcinoma,Paciltaxel,Illumina HiSeq,


In [15]:
disease_labels = list(rnaseq_unique_rows_df["disease"])

### Downstream tasks

In [16]:
model = VAE(input_dim=5000, hidden_dim=[100], z_dim=100)

In [31]:
# load model
def load_trained_model(model, model_name):
    model.load_state_dict(
        torch.load("../output/models/models_batc_size_32/" + model_name)
    )
    return model


# How well does the model reconstruct the input RNAseq data
def VAE_latent_z_out(df, model):
    z = model.forward(torch.tensor(df.values, dtype=torch.float32))[4]
    column_names = [str(i) for i in range(100)]

    z = pd.DataFrame(z.detach().numpy(), columns=column_names)

    return z

In [32]:
model = load_trained_model(model, "tcga_rnaseq_df_0p00vae_weights.pth")
z_latent = VAE_latent_z_out(rnaseq_df, model)

In [33]:
labels = list(set(rnaseq_unique_rows_df["disease"]))

In [34]:
X = z_latent
y = rnaseq_unique_rows_df.copy()
y["disease"].replace(labels, list(range(24)), inplace=True)
y = y["disease"]

In [41]:
# Define custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data.iloc[idx].values, dtype=torch.float32)

In [None]:
# Create data loaders for the training and testing sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load the pre-trained VAE model and extract its encoder


# Define the classifier architecture
class Classifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.log_softmax(self.fc3(x), dim=1)
        return x


# Create an instance of the classifier
classifier = Classifier(input_dim=5000, output_dim=24)

# Define the loss function and optimizer
criterion = nn.NLLLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

# Train the classifier using the pre-trained VAE embeddings
for epoch in range(10):
    running_loss = 0.0
    for i, (x, y) in enumerate(train_loader):
        with torch.no_grad():
            _, _, z = vae(x)
        y_pred = classifier(z)
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * x.size(0)
    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch {epoch + 1} loss: {epoch_loss:.4f}")

In [None]:
# Evaluate the classifier on the test set
correct = 0
total = 0
with torch.no_grad():
    for x, y in test_loader:
        _, _, z = vae(x)
        y_pred = classifier(z).argmax(dim=1)
        total += y.size(0)
        correct += (y_pred == y).sum().item()
accuracy = correct / total
print(f"Test accuracy: {accuracy:.4f}")