In [90]:
import pandas as pd
import numpy as np

# Cohort identifier: it prefixes the input file name here and is embedded in
# the names of the result files written further down in the notebook.
file = 'TCGA-LUAD'
csv_file_path = f'{file}_counts.csv'

# Read the table with its first column as the row index, transpose it so the
# CSV's columns become rows, and coerce every column to a numeric dtype
# (entries that cannot be parsed become NaN).
data_df = (
    pd.read_csv(csv_file_path, index_col=0)
    .T
    .apply(pd.to_numeric, errors='coerce')
)

# Plain NumPy matrix (rows/columns in the same order as data_df) for the model.
data_array = data_df.to_numpy()

In [91]:
# Column labels of the transposed table (i.e. the row labels of the CSV as
# read); reused below as the column labels of the topic-by-feature table.
mutations = data_df.columns

In [92]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract.
n_components = 10

# random_state is fixed so repeated runs yield the same topics.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)

# Fit the model and obtain the per-row topic mixture in one call (equivalent
# to fit() followed by transform() on the same matrix); each row sums to 1.
doc_topic_distributions = lda.fit_transform(data_array)

# lda.components_ holds *unnormalised* pseudo-counts (topic x feature), not
# probabilities. Normalise each topic row to sum to 1 so the exported values
# really are per-topic distributions, comparable across topics and consistent
# with the already-normalised doc_topic_distributions. The raw pseudo-counts
# remain available on lda.components_.
topic_pseudo_counts = lda.components_
topic_word_distributions = topic_pseudo_counts / topic_pseudo_counts.sum(axis=1, keepdims=True)

In [93]:
# Wrap the topic-by-feature matrix in a labelled table: one row per topic
# ("Topic 1" .. "Topic n"), one column per entry of `mutations`.
topic_word_df = pd.DataFrame(
    topic_word_distributions,
    index=[f'Topic {i+1}' for i in range(n_components)],
    columns=mutations,
)

# Persist the table; the cohort name and topic count are part of the file name.
topic_word_df.to_csv(f'topic_mutation_{file}_{n_components}.csv')
#topic_word_df

In [94]:
# Wrap the row-by-topic matrix in a labelled table: rows keep the index of
# data_df, columns are named "Topic 1" .. "Topic n".
doc_topic_df = pd.DataFrame(
    doc_topic_distributions,
    index=data_df.index,
    columns=[f'Topic {i+1}' for i in range(n_components)],
)

# Persist the table; the cohort name and topic count are part of the file name.
doc_topic_df.to_csv(f'topic_sample_{file}_{n_components}.csv')
#doc_topic_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
TCGA-38-7271-01A-11D-2036-08,0.001819,0.001819,0.453861,0.001819,0.001819,0.001819,0.001819,0.001819,0.531589,0.001819
TCGA-78-7166-01A-12D-2063-08,0.000415,0.270296,0.000415,0.235281,0.041159,0.000415,0.000415,0.075068,0.376121,0.000415
TCGA-55-1594-01A-01D-1040-01,0.000625,0.000625,0.428111,0.000625,0.000625,0.093889,0.178466,0.108727,0.187682,0.000625
TCGA-49-6742-01A-11D-1855-08,0.000344,0.000344,0.590706,0.000344,0.165027,0.241862,0.000344,0.000344,0.000344,0.000344
TCGA-44-2661-01A-01D-1105-08,0.002273,0.002273,0.002274,0.002273,0.002273,0.574163,0.002273,0.407650,0.002273,0.002273
...,...,...,...,...,...,...,...,...,...,...
TCGA-97-A4M7-01A-11D-A24P-08,0.000316,0.038871,0.000316,0.331214,0.000316,0.000316,0.000316,0.195547,0.432474,0.000316
TCGA-49-4494-01A-01D-1265-08,0.000498,0.000498,0.000498,0.113780,0.000498,0.000498,0.050204,0.227258,0.605773,0.000498
TCGA-86-7713-01A-11D-2063-08,0.000621,0.000621,0.000621,0.290680,0.318909,0.000621,0.000621,0.347300,0.039383,0.000621
TCGA-62-A46O-01A-11D-A24D-08,0.033506,0.021924,0.155224,0.251323,0.181720,0.000107,0.005286,0.137811,0.212992,0.000107


### Aggregate

In [99]:
import pandas as pd
import os

# Folder that holds the per-cohort CSV tables to be combined.
data_folder = "data"

# os.listdir() returns entries in arbitrary order; sort so the column order of
# the combined table (and every downstream result) is reproducible.
csv_files = sorted(
    os.path.join(data_folder, name)
    for name in os.listdir(data_folder)
    if name.endswith('.csv')
)
print(csv_files)

# Read every table with its first column as the row index, then join them
# side by side in a single concat (outer join on the row index) instead of
# growing a frame inside the loop. The loop variable is deliberately not
# called `file`, so it cannot clobber a notebook-level `file` variable.
cohort_frames = [pd.read_csv(csv_path, index_col=0) for csv_path in csv_files]
combined_count_data = (
    pd.concat(cohort_frames, axis=1) if cohort_frames else pd.DataFrame()
)

# Discard columns that carry no values at all.
combined_count_data = combined_count_data.dropna(how='all', axis=1)

# Transpose so the CSV columns become rows, and coerce everything to numbers.
data_df = combined_count_data.T
data_df = data_df.apply(pd.to_numeric, errors='coerce')

# Rows present in only some of the input tables come out of the outer join as
# NaN, and LatentDirichletAllocation.fit() rejects NaN input. Treat a missing
# entry as a zero count.
# NOTE(review): this also zeroes any cell that failed numeric coercion above;
# confirm that is acceptable for these inputs.
data_df = data_df.fillna(0)
data_array = data_df.values

# Column labels of the transposed table; reused below to label the
# topic-by-feature table.
mutations = data_df.columns

['data/TCGA-BRCA_counts.csv', 'data/TCGA-COAD_counts.csv', 'data/TCGA-LUAD_counts.csv']


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract from the combined table.
n_components = 6

# random_state is fixed so repeated runs yield the same topics.
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)

# Fit the model and obtain the per-row topic mixture in one call (equivalent
# to fit() followed by transform() on the same matrix); each row sums to 1.
doc_topic_distributions = lda.fit_transform(data_array)

# lda.components_ holds *unnormalised* pseudo-counts (topic x feature), not
# probabilities. Normalise each topic row to sum to 1 so the exported values
# really are per-topic distributions, comparable across topics and consistent
# with the already-normalised doc_topic_distributions. The raw pseudo-counts
# remain available on lda.components_.
topic_pseudo_counts = lda.components_
topic_word_distributions = topic_pseudo_counts / topic_pseudo_counts.sum(axis=1, keepdims=True)

In [None]:
# Wrap the topic-by-feature matrix in a labelled table: one row per topic
# ("Topic 1" .. "Topic n"), one column per entry of `mutations`.
topic_word_df = pd.DataFrame(
    topic_word_distributions,
    index=[f'Topic {i+1}' for i in range(n_components)],
    columns=mutations,
)

# Persist the table; the topic count is part of the file name.
topic_word_df.to_csv(f'topic_mutation_combined_{n_components}.csv')
#topic_word_df

In [None]:
# Wrap the row-by-topic matrix in a labelled table: rows keep the index of
# data_df, columns are named "Topic 1" .. "Topic n".
doc_topic_df = pd.DataFrame(
    doc_topic_distributions,
    index=data_df.index,
    columns=[f'Topic {i+1}' for i in range(n_components)],
)

# Persist the table; the topic count is part of the file name.
doc_topic_df.to_csv(f'topic_sample_combined_{n_components}.csv')
#doc_topic_df