In [130]:
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd 
import numpy as np
from input_str import pair_sample

In [131]:
# Cohort to analyse. The value is passed to pair_sample() and is embedded in the
# clinical / metadata / output file names. Other options noted by the author:
# "STAD", "UCEC".
cancer_type = "CRC"
# Which sample of each tumor/normal pair to profile: "tumor" or "normal".
sample_type = "tumor"

464

In [None]:
# Load the paired STR table for the selected cohort and reshape it into a
# (tmp_id x sample) matrix of per-sample allele lengths.
df_str = pair_sample(cancer_type)

# Fail fast on a mistyped sample_type: it is also used in the output file name,
# so silently treating e.g. "tumour" as normal would produce a mislabelled file.
if sample_type not in ("tumor", "normal"):
    raise ValueError(
        f"sample_type must be 'tumor' or 'normal', got {sample_type!r}"
    )

if sample_type == "tumor":
    # STR profile from tumor samples (first 8 columns of df_str).
    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of df_str (avoids SettingWithCopyWarning).
    paired_pt = df_str.iloc[:, :8].copy()
    # NOTE(review): despite the column name this is the SUM of the two alleles,
    # not their mean; kept as-is because the downstream std > 0.5 filter depends
    # on the current scale -- confirm the intended definition.
    paired_pt["mean_allele_length"] = paired_pt["allele_a_t"] + paired_pt["allele_b_t"]
    cluster_data = paired_pt.pivot(index="tmp_id", columns="sample_t", values="mean_allele_length")
else:
    # STR profile from normal samples (columns from position 8 onwards).
    paired_bn = df_str.iloc[:, 8:].copy()
    # NOTE(review): sum of the two alleles, see the note in the tumor branch.
    paired_bn["mean_allele_length"] = paired_bn["allele_a_n"] + paired_bn["allele_b_n"]
    # tmp_id is not part of this column slice, so bring it over from df_str
    # (aligned on the shared row index).
    paired_bn["tmp_id"] = df_str["tmp_id"]
    cluster_data = paired_bn.pivot(index="tmp_id", columns="sample_n", values="mean_allele_length")

In [145]:
# Keep only rows (tmp_id) that are informative enough for clustering.
# Columns of cluster_data are samples, so the per-row counts run across samples.
n_samples = cluster_data.shape[1]
n_missing = cluster_data.isna().sum(axis=1)

# 1) drop rows whose missing-value count reaches 80% of the samples
cluster_data = cluster_data.loc[n_missing < n_samples * 0.8]

# 2) drop rows that barely vary between samples (standard deviation <= 0.5)
row_sd = cluster_data.std(axis=1)
cluster_data = cluster_data.loc[row_sd > 0.5]

cluster_data.shape

In [148]:
# Impute missing values with distance-weighted 5-nearest-neighbour imputation.
# cluster_data is (tmp_id x sample); transpose so each row is one sample.
imp_knn = KNNImputer(n_neighbors=5, weights="distance")
imputed = imp_knn.fit_transform(cluster_data.T)

# Standardise every feature before PCA.
df_scaled = StandardScaler().fit_transform(imputed)

# Fit the PCA once and project in the same call. The previous version called
# fit() and then fit_transform() on the same matrix, which refits the model
# from scratch and doubles the cost without changing the result.
model = PCA(n_components=5, random_state=42)
pca_data = pd.DataFrame(model.fit_transform(df_scaled))

# PCA.fit() returns the estimator itself, so pca_res was always the same object
# as model; keep the alias for the cells that inspect the fitted model.
pca_res = model

In [149]:
# Fraction of the total variance captured by each fitted principal component.
pca_res.explained_variance_ratio_

array([0.04564988, 0.01267099, 0.01026046, 0.00759367, 0.0074275 ,
       0.00678779, 0.00648665, 0.00629227, 0.00599307, 0.00563944])

In [150]:
# Load the per-patient MSI calls and the per-sample metadata for this cohort.
msi_info = pd.read_csv(f"../clinical/{cancer_type}_msi.csv")
meta = pd.read_csv(f"../processed_data/meta/{cancer_type}_meta_filtered.csv")

# Collapse the "msi-l" calls into the "mss" group.
is_msi_low = msi_info["msi_status"].eq("msi-l")
msi_info.loc[is_msi_low, "msi_status"] = "mss"

# Rows of pca_data follow the column order of cluster_data (one row per sample).
pca_data["sample"] = cluster_data.columns
# Patient id = first three dash-separated fields of the sample name
# (presumably TCGA-style barcodes -- confirm).
pca_data["patient"] = pca_data["sample"].map(lambda name: "-".join(name.split("-")[:3]))

# Both merges use the default inner join, so samples without an MSI call or
# without a metadata row are dropped.
pca_data = pca_data.merge(msi_info, on="patient")
pca_data = pca_data.merge(meta, left_on="sample", right_on="name")

In [154]:
# Write the annotated PCA scores to disk. The file name is
# "<cancer_type><sample_type>_pca.csv" (no separator between the two parts).
pca_out_path = f"./pca_data/{cancer_type}{sample_type}_pca.csv"
pca_data.to_csv(pca_out_path, index=False)