In [17]:
import os
import torch
from torch.utils.data import Dataset
import pydicom
import numpy as np
from torchvision import transforms
import pandas as pd
from sklearn.preprocessing import StandardScaler

generate the RNA data set, will put citation here later

In [18]:
df = pd.read_csv('data/GSE103584_R01_NSCLC_RNAseq.txt', sep='\t', index_col=0)

# Check shape and preview
print(df.shape)
print(df.head())

(22126, 130)
             R01-023   R01-024   R01-006   R01-153   R01-031   R01-032  \
1/2-SBSRNA4      NaN       NaN       NaN       NaN       NaN       NaN   
A1BG             NaN  2.528510  1.713994  3.143938  1.795080  2.410910   
A1BG-AS1         NaN       NaN       NaN  0.646213       NaN       NaN   
A1CF             NaN       NaN       NaN       NaN       NaN       NaN   
A2LD1        2.03438  0.436761  1.601030  3.366031  0.994382  2.130685   

              R01-033   R01-034    R01-035   R01-037  ...   R01-136   R01-137  \
1/2-SBSRNA4       NaN       NaN        NaN       NaN  ...       NaN       NaN   
A1BG         2.538406       NaN  10.386501  1.826220  ...  3.534986  7.560916   
A1BG-AS1          NaN       NaN        NaN       NaN  ...  2.408296  3.474290   
A1CF              NaN       NaN        NaN       NaN  ...       NaN       NaN   
A2LD1        0.842759  1.835353   0.662647  0.646078  ...       NaN       NaN   

              R01-138  R01-139  R01-140  R01-141  R01-1

variance filtering, selecting only the ones that vary the most, as i suspect this to be the most important information. 

But also if N/A is a lot in these sample dont want them so i should filter by that as well

I want to have at least 80% of the samples with relevant genes

In [19]:
# First, filter by expression frequency
df.replace('NA', np.nan, inplace=True)

# Calculate % of non-missing (or non-zero) values per gene
expression_counts = df.notna().sum(axis=1)
expression_fraction = expression_counts / df.shape[1]

# Keep genes expressed in at least 80% of samples
df_frequent = df[expression_fraction > 0.8]


print(df_frequent.shape)


(10693, 130)


now need to filter by variance to focus on interesting genes, will use a log as tends to be exponential

In [20]:

df_frequent = df_frequent.astype(float)
df_log = np.log10(df_frequent + 1)
# print(df_log)

# Assumes df is already loaded and numeric with gene names as index
gene_variances = df_log.var(axis=1)  # variance across samples for each gene

# Sort by variance
top_genes = gene_variances.sort_values(ascending=False)

# Select top N high-variance genes (e.g., 1000)
df_top = df_log.loc[top_genes.index[:1000]]

print(df_top.shape)

print("Top 5 gene variances:")
print(top_genes.head())



(1000, 130)
Top 5 gene variances:
TOR1AIP1    8.245657
CHD7        7.297720
ZFYVE26     7.106993
NAA38       5.580402
CASP2       4.665585
dtype: float64


In [21]:
print("Max value:", df_top.max())
print("Min value:", df_top.min())
# print the values of the first 5 genes in df_top
print("Top 5 values:")
print(df_top.head())

Max value: R01-023    8.053078
R01-024    8.996512
R01-006    7.232996
R01-153    7.720986
R01-031    9.113943
             ...   
R01-141    5.147284
R01-142    5.423483
R01-144    4.975018
R01-145    5.227481
R01-146    5.991229
Length: 130, dtype: float64
Min value: R01-023    0.164201
R01-024    0.142637
R01-006    0.123429
R01-153    0.109324
R01-031    0.166936
             ...   
R01-141    0.191111
R01-142    0.228900
R01-144    0.224928
R01-145    0.160699
R01-146    0.236914
Length: 130, dtype: float64
Top 5 values:
           R01-023   R01-024   R01-006   R01-153   R01-031   R01-032  \
TOR1AIP1  7.195900  3.360916  7.232996  7.720986  7.346353  7.155336   
CHD7      1.322126  1.345326  1.833200  1.131132  9.113943  7.705864   
ZFYVE26   6.723456  6.940517  0.827326  1.710891  6.569375  1.277587   
NAA38     8.053078  8.996512  1.275730  1.611778  2.638750  8.597695   
CASP2     0.888649  1.161618  0.637659  0.896804  8.399674  0.971874   

           R01-033   R01-034   R01-

In [22]:
# Standardize df_log so each gene (row) has mean 0 and std 1 (z-score)
df_zscore = df_top.sub(df_top.mean(axis=1), axis=0).div(df_top.std(axis=1), axis=0)
print(df_zscore.head())

           R01-023   R01-024   R01-006   R01-153   R01-031   R01-032  \
TOR1AIP1  1.080242 -0.255279  1.093161  1.263102  1.132637  1.066116   
CHD7     -0.488815 -0.480227 -0.299628 -0.559516  2.395517  1.874282   
ZFYVE26   1.451424  1.532845 -0.760265 -0.428833  1.393627 -0.591369   
NAA38     2.383069  2.782442 -0.485908 -0.343652  0.091084  2.613616   
CASP2    -0.406776 -0.280401 -0.522975 -0.403000  3.070556 -0.368246   

           R01-033   R01-034   R01-035   R01-037  ...   R01-136   R01-137  \
TOR1AIP1 -0.208266  1.149996 -0.909890  1.275782  ... -0.982044 -0.995134   
CHD7     -0.436542 -0.506787  1.974409  2.482991  ... -0.388950 -0.457817   
ZFYVE26   1.350093  1.314315 -0.520850  1.584860  ... -0.719606 -0.765384   
NAA38    -0.312542  2.408834 -0.168357 -0.298869  ... -0.614600 -0.494518   
CASP2    -0.309923 -0.513435  2.567867 -0.529047  ... -0.301494 -0.348947   

           R01-138   R01-139   R01-140   R01-141   R01-142   R01-144  \
TOR1AIP1 -1.024807 -1.085336 -1.

#grab the relevant images and combine them into Xy

In [52]:
#save the standardized data so i can load it in a different notebook
df_zscore.to_csv('data/df_zscore.csv')