# Data Preparation

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

Mounted at /content/drive


In [2]:
def data_info(data: pd.DataFrame):
  """
  Prints the information of a pandas DataFrame.
  Args:
  data: A pandas DataFrame.
  Returns:
  None
  """
  print("=== DataFrame Information ===")
  print(f"Shape of the dataframe: {data.shape}")
  print("\n--- Head of the DataFrame ---")
  print(data.head())
  print("\n--- Missing Values (NA counts) ---")
  print(data.isna().sum())
  print(f"Summary of the data: \n{data.describe()}")

In [3]:
file_path1 = "/content/drive/MyDrive/Datasets/TCGA_SKCM/rna_data_processed.csv"
metadata = "/content/drive/MyDrive/Datasets/TCGA_SKCM/RNA_Seq_transcriptom_meta_data.csv"
df = pd.read_csv(file_path1)
metadata = pd.read_csv(metadata)

In [4]:
data_info(df)

=== DataFrame Information ===
Shape of the dataframe: (471, 60662)

--- Head of the DataFrame ---
                     Unnamed: 0  ENSG00000000003.15  ENSG00000000005.6  \
0  TCGA.D3.A3C8.06A.12R.A18S.07              4.3466             0.1138   
1  TCGA.ER.A2NC.06A.11R.A18T.07             15.1229             0.0232   
2  TCGA.EE.A2GO.06A.11R.A18S.07              3.4421             0.0264   
3  TCGA.HR.A2OH.06A.11R.A18U.07              2.4587             0.0124   
4  TCGA.FS.A1ZT.06A.11R.A18U.07             12.5001             0.0372   

   ENSG00000000419.13  ENSG00000000457.14  ENSG00000000460.17  \
0             24.5221              1.7528              1.7758   
1             29.9988              1.1248              1.7931   
2             30.0859              1.5614              1.1442   
3             30.2149              2.4344              2.2350   
4             62.1598              3.7531              2.7662   

   ENSG00000000938.13  ENSG00000000971.16  ENSG00000001036.14  \
0

In [5]:
data_info(metadata)

=== DataFrame Information ===
Shape of the dataframe: (473, 31)

--- Head of the DataFrame ---
   Unnamed: 0                                    id data_format  \
0           1  738d4e33-28a2-42a7-a9b9-c8b44bf3e787         TSV   
1           2  ca2eed30-6062-4beb-b9a8-f7034812857a         TSV   
2           3  8620ba67-3c2b-43c8-a72d-987f52b14037         TSV   
3           4  b00650c2-a3dc-49ac-9d84-785186aa8c4d         TSV   
4           5  a550e9b4-f276-4bb2-903c-8055fc96fa43         TSV   

                          cases access  \
0  TCGA-D3-A3C8-06A-12R-A18S-07   open   
1  TCGA-ER-A2NC-06A-11R-A18T-07   open   
2  TCGA-EE-A2GO-06A-11R-A18S-07   open   
3  TCGA-HR-A2OH-06A-11R-A18U-07   open   
4  TCGA-FS-A1ZT-06A-11R-A18U-07   open   

                                           file_name  \
0  4ff81b7f-2dbc-453f-a68b-fe40315b2ef7.rna_seq.a...   
1  1fd89d3a-5991-4bfb-b27f-66b5f2a289d8.rna_seq.a...   
2  a7bda5b9-9c95-4f90-bab6-10ab15ee2b48.rna_seq.a...   
3  b4fd9abd-cb2a-4d1b-abb

In [6]:
# metadata = metadata.set_index("Sample")
df = df.set_index("Unnamed: 0")

In [7]:
df.shape

(471, 60661)

In [8]:
labels = {
    "Metastatic": 1,
    "Primary Tumor": 0
}

In [9]:
df["Sample_type"] = df["Sample_type"].map(labels)
df.head()

Unnamed: 0_level_0,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,ENSG00000001167.14,...,ENSG00000288662.1,ENSG00000288663.1,ENSG00000288665.1,ENSG00000288667.1,ENSG00000288669.1,ENSG00000288670.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1,Sample_type
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA.D3.A3C8.06A.12R.A18S.07,4.3466,0.1138,24.5221,1.7528,1.7758,16.7941,8.7496,22.6171,4.1525,10.0889,...,0.0,0.0505,0.0,0.3932,0.0,1.2875,0.0,0.0075,0.5283,1
TCGA.ER.A2NC.06A.11R.A18T.07,15.1229,0.0232,29.9988,1.1248,1.7931,1.889,14.3699,28.9755,2.6553,19.4021,...,0.0,0.0691,0.0,0.0,0.0,3.8104,0.0,0.0196,0.5811,1
TCGA.EE.A2GO.06A.11R.A18S.07,3.4421,0.0264,30.0859,1.5614,1.1442,0.5309,1.1821,7.7457,1.3059,5.6264,...,0.0,0.0411,0.0,0.0,0.0098,4.7234,0.0,0.0054,1.1152,1
TCGA.HR.A2OH.06A.11R.A18U.07,2.4587,0.0124,30.2149,2.4344,2.235,9.909,9.0167,21.474,2.3573,8.8175,...,0.0,0.032,0.0,0.0,0.0,1.8586,0.0,0.0133,0.2825,1
TCGA.FS.A1ZT.06A.11R.A18U.07,12.5001,0.0372,62.1598,3.7531,2.7662,2.9557,3.2831,12.4303,3.6412,21.1089,...,0.0,0.1381,0.0,0.0,0.0,4.1642,0.0,0.0228,0.3051,1


In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [11]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Ensure class is defined
class RNASeqDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [12]:
X = df.drop("Sample_type", axis = 1)
y = df["Sample_type"]

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype = torch.float32)

  y_tensor = torch.tensor(y, dtype = torch.float32)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

In [None]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim)  # Bottleneck Layer
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [16]:
import os
import torch
BATCH_SIZE = 32
NUM_WORKERS = os.cpu_count()

dataset_train = RNASeqDataset(X_train, y_train)
dataset_test = RNASeqDataset(X_test, y_test)

ValueError: could not determine the shape of object type 'DataFrame'