In [27]:
import pandas as pd
import numpy as np
import random
from glob import glob
import os
import re

import matplotlib.pyplot as plt

from graphviz import Source
import networkx as nx

from networkx.drawing.nx_agraph import graphviz_layout, to_agraph

In [41]:
# Scratch notes: candidate designs in which clones * cells per clone = 100 cells.
#   10 clones, 10 cells per clone, 3 mutations per clone
#   25 clones,  4 cells per clone
#    5 clones, 20 cells per clone

# Quick check: 100 cells at 20 cells per clone gives 5 clones.
100/20

5.0

In [28]:
# Empirical mutation-probability pool taken from the scRNAseq matrix.
# The matrix is flattened and only entries >= 0.5 are kept; the simulations
# below draw the probabilities of their simulated mutations from this pool.
probabilities = (
    pd.read_csv('../../data/YFV2001_matrix_output/0.05.csv', sep=' ', header=None)
    .stack()
    .loc[lambda values: values >= 0.5]
    .array
)


## Generate pmats with mutations with phylogenetic signal

In [29]:

def generate_pmats_w_phylo_signal(num_rows, num_cols, num_clones, dfs):
    """Simulate a block-diagonal mutation-probability matrix with clonal structure.

    For every clone a ``num_rows x num_cols`` block is generated. Each row of a
    block is randomly assigned one of three patterns: mutated in the first group
    of the clone's cells, mutated in the second group, or mutated in all cells of
    the clone. The blocks are placed on the diagonal of the output matrix; every
    entry outside a block is set to the background value 0.0001. Mutated entries
    are then filled with values drawn (with replacement) from the module-level
    ``probabilities`` array.

    Parameters
    ----------
    num_rows : int
        Number of rows (mutations) generated per clone.
    num_cols : int
        Number of columns (cells) per clone.
    num_clones : int
        Number of clones (diagonal blocks) to generate.
    dfs : list
        List that receives the per-clone template frames ('X' marks a mutated
        entry). It is mutated in place, and any frames it already contains
        are included in the output, as before.

    Returns
    -------
    pandas.DataFrame
        Float matrix with one row per mutation and one column per cell.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when ``dfs`` ends up empty.
    """
    # Split the clone's cells into two groups. The second group takes the
    # remainder so that rows keep exactly ``num_cols`` entries when ``num_cols``
    # is odd (``num_cols // 2`` twice would drop a cell for patterns 1 and 2).
    first_group = num_cols // 2
    second_group = num_cols - first_group

    # 'X' is a placeholder for "mutated"; it is swapped for a sampled probability below.
    row_templates = {
        1: ['X'] * first_group + [0.0001] * second_group,
        2: [0.0001] * first_group + ['X'] * second_group,
        3: ['X'] * num_cols,
    }

    for _ in range(num_clones):
        # Randomly determine the pattern of each row in this clone
        conditions = np.random.choice([1, 2, 3], size=num_rows)

        # Build the clone's frame in one go instead of concatenating row by row.
        # dtype=object keeps the mixed 'X'/float content uniform across columns.
        clone_rows = [row_templates[int(condition)] for condition in conditions]
        dfs.append(pd.DataFrame(clone_rows, dtype=object))

    # Stack the clone blocks vertically, keyed by clone number, and give every
    # row a globally unique id in the second index level.
    out = pd.concat(dfs, keys=range(len(dfs)))
    out.index = pd.MultiIndex.from_arrays([out.index.get_level_values(0),
                                           range(len(out))])

    # Move (clone, cell) into the columns: each clone gets its own column block
    # and the off-block entries are filled with the background value.
    out = out.stack().unstack([0, -1], fill_value=0.0001)
    out = out.set_axis(range(out.shape[1]), axis=1)

    # Replace every placeholder (and any missing entry) by the value at the same
    # position of a matrix sampled from the probability distribution. Working on
    # a NumPy copy avoids relying on DataFrame.replace downcasting object columns.
    values = out.to_numpy(dtype=object, copy=True)
    needs_sampling = (values == 'X') | pd.isna(values)
    sampled = np.random.choice(probabilities, size=out.shape, replace=True)
    values[needs_sampling] = sampled[needs_sampling]

    return pd.DataFrame(values.astype(float), index=out.index, columns=out.columns)

In [18]:
# Generate 10 ground-truth pmats (mutations with phylogenetic signal only)
# NOTE(review): no random seed is set, so every run produces different matrices.

GT_PATH = "../../data/simulations_matrix_output/ground_truth/"

# Create the output directory up front so that to_csv does not fail on a fresh checkout
os.makedirs(GT_PATH, exist_ok=True)

# Simulation design, identical for every pmat (rows and columns per clone)
num_mutations = 3
num_cells = 10
num_clones = 10

for i in range(1,11): # create 10 pmats
    dfs = []
    df_w_phylo = generate_pmats_w_phylo_signal(num_mutations, num_cells, num_clones, dfs)

    # Save to file as <i>.csv: space-separated, no header and no index column
    gt_path = os.path.join(GT_PATH, str(i) + '.csv')
    df_w_phylo.to_csv(gt_path, index=False, sep=' ', header= False)
    

## Add mutations without a phylogenetic signal

In [19]:
def create_non_informative_df(affected_cells, n_positions=1000, n_cells=100):
    """Build a pool of mutations that carry no phylogenetic signal.

    Every row has the background value 0.0001 everywhere except in
    ``affected_cells`` randomly chosen columns, which receive values drawn
    (with replacement) from the module-level ``probabilities`` array.

    Parameters
    ----------
    affected_cells : int
        Number of randomly chosen columns (cells) per row that receive a
        sampled probability.
    n_positions : int, optional
        Number of rows in the pool. Defaults to 1000.
    n_cells : int, optional
        Number of columns (cells). Defaults to 100.

    Returns
    -------
    pandas.DataFrame
        ``n_positions x n_cells`` float frame with integer row/column labels.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``affected_cells`` exceeds ``n_cells``.
    """
    values = np.full((n_positions, n_cells), 0.0001)
    cell_ids = np.arange(n_cells)

    # Mark, row by row, the randomly selected cells that carry the mutation.
    # The selection stays one draw per row so the order of random draws is unchanged.
    is_mutated = np.zeros((n_positions, n_cells), dtype=bool)
    for position in range(n_positions):
        mutated_cells = np.random.choice(cell_ids, affected_cells, replace=False)
        is_mutated[position, mutated_cells] = True

    # Fill the marked entries with the value at the same position of a matrix
    # sampled from the mutation probabilities observed in scRNAseq
    sampled = np.random.choice(probabilities, size=values.shape, replace=True)
    values[is_mutated] = sampled[is_mutated]

    return pd.DataFrame(values, index=range(n_positions), columns=range(n_cells))


In [20]:
def create_pmats(tree1, n_ni_pos, affected_cells, OUTPUT_PATH, experiment, n_reps=1):
    """Append non-informative mutations to a pmat and write every variant to disk.

    For each repetition, each ``n`` in ``n_ni_pos`` and each ``i`` in
    ``affected_cells``, a fresh pool of non-informative mutations is created with
    ``create_non_informative_df(i)``, ``n`` rows are sampled from it and appended
    below ``tree1``. The result is written to
    ``<OUTPUT_PATH>/<experiment>_<i>_cells_<n>_pos_<it>_iter.csv``
    (space-separated, no header, no index).

    Parameters
    ----------
    tree1 : pandas.DataFrame
        Matrix that forms the first rows of every output file.
    n_ni_pos : iterable of int
        Numbers of non-informative rows to append.
    affected_cells : iterable of int
        Numbers of cells affected by each non-informative row.
    OUTPUT_PATH : str
        Output directory; it is created if it does not exist. A trailing
        separator is no longer required.
    experiment : str
        Prefix of the output file names.
    n_reps : int, optional
        Number of repetitions per parameter combination. Defaults to 1.
    """
    if OUTPUT_PATH:
        os.makedirs(OUTPUT_PATH, exist_ok=True)

    for it in range(1, n_reps + 1): # number of reps
        for n in n_ni_pos:
            for i in affected_cells:
                non_info = create_non_informative_df(i)

                # Sample n rows randomly from df with non informative mutations
                non_info_sample = non_info.sample(n)

                # Concatenate tree1 with non_info_sample, with tree1 as the first rows.
                # The row labels are not written to the file, so they are left as they are.
                tree1_w_ni_pos = pd.concat([tree1, non_info_sample])

                # Save to file
                matrix_name = f'{experiment}_{i}_cells_{n}_pos_{it}_iter.csv'
                matrix_path = os.path.join(OUTPUT_PATH, matrix_name)
                tree1_w_ni_pos.to_csv(matrix_path, index=False, sep=' ', header= False)


In [21]:
def extract_numeric_part(filename):
    """Return the text before the first '.' of ``filename`` as an integer.

    Raises ``ValueError`` when that leading part is not a valid integer.
    """
    stem, _, _ = filename.partition('.')
    return int(stem)

In [30]:
# Read the pmats with phylo signal and add mutations w/o phylo signal

OUTPUT_PATH = '../../data/simulations_matrix_output/non_informative/'
n_ni_pos = range(10, 110, 10) # 10, 20, ..., 100 non-informative rows to append
affected_cells = [5, 10, 20, 30] # cells affected by each non-informative row

GT_PATH = "../../data/simulations_matrix_output/ground_truth/"

# Keep only the .csv files BEFORE sorting: the numeric sort key raises a
# ValueError on other directory entries (e.g. hidden files or checkpoint folders).
gt_filenames = [entry for entry in os.listdir(GT_PATH) if entry.endswith(".csv")]

for filename in sorted(gt_filenames, key=extract_numeric_part):
    experiment = filename.split('.')[0]
    print(experiment)
    file_path = os.path.join(GT_PATH, filename)
    tree = pd.read_csv(file_path, header=None, sep=' ')
    create_pmats(tree, n_ni_pos, affected_cells, OUTPUT_PATH, experiment)

1
2
3
4
5
6
7
8
9
10


### Run mt-SCITE

In [31]:
# Collect the name and the number of rows of every simulated pmat

pmat_names = []
shapes = []

pmat_input_path = f'../../data/simulations_matrix_output/non_informative/'
pmats = list(glob(os.path.join(pmat_input_path, '*.csv')))
tree_name = []

# Natural sort: digit runs are compared as integers, every other character as text
for filename in sorted(
    pmats,
    key=lambda var: [
        int(x) if x.isdigit() else x
        for x in re.findall(r'[^0-9]|[0-9]+', var)
    ],
):
    # File name without its extension
    name = os.path.basename(filename).split('-')[0].rsplit('.', 2)[0]
    pmat_names.append(name)

    # Only the number of rows of each matrix is recorded
    df = pd.read_csv(filename, sep=' ', header=None)
    shapes.append(len(df))

# One row per pmat; the extra columns are the '_'-separated fields of the name:
# field 0 -> tree_name, field 1 -> cells_w_ni, field 5 -> iter_directory
pmat_data = pd.DataFrame({'pmat_names': pmat_names, 'len': shapes}).assign(
    tree_name=lambda frame: frame['pmat_names'].str.split('_').str[0],
    cells_w_ni=lambda frame: frame['pmat_names'].str.split('_').str[1],
    iter_directory=lambda frame: frame['pmat_names'].str.split('_').str[5],
)



In [33]:
# Display the pmat metadata table (one row per simulated matrix file)
pmat_data

Unnamed: 0,pmat_names,len,tree_name,cells_w_ni,iter_directory
0,1_5_cells_10_pos_1_iter,40,1,5,1
1,1_5_cells_20_pos_1_iter,50,1,5,1
2,1_5_cells_30_pos_1_iter,60,1,5,1
3,1_5_cells_40_pos_1_iter,70,1,5,1
4,1_5_cells_50_pos_1_iter,80,1,5,1
...,...,...,...,...,...
395,10_30_cells_60_pos_1_iter,90,10,30,1
396,10_30_cells_70_pos_1_iter,100,10,30,1
397,10_30_cells_80_pos_1_iter,110,10,30,1
398,10_30_cells_90_pos_1_iter,120,10,30,1


In [34]:
#testi = pd.read_csv('../../data/simulations_matrix_output/non_informative/9_5_cells_20_pos_1_iter.csv', sep=' ', header=None)
#testi

In [35]:

n_cells = '100'


for pmat in pmat_names:
    SCITE_PATH = '../../../mt-SCITE'
    experiment = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'tree_name'].iloc[0]
    PMAT_PATH = f'../../data/simulations_matrix_output/non_informative/'
    OUTPUT = f'../../../mt-SCITE/mt-SCITE_output/Simulations/non_informative_mutations/{experiment}/'
    #print(PMAT_PATH)

    # Get number of mutations
    n = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'len'].iloc[0].astype(str)
    #print(n)

    # folder was changed to iter_directory Get iter directory for output dir name
    iter_directory = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'iter_directory'].iloc[0]
    #print(iter_directory)

    # Get number of added non-information mutations for output dir name
    cells_w_ni = pmat_data.loc[pmat_data['pmat_names'] == pmat, 'cells_w_ni'].iloc[0]

    try:
        os.makedirs(OUTPUT + '/' + iter_directory + '/' + cells_w_ni + '/') 
        os.makedirs(OUTPUT + '/' + 'stdout' + '/') 
    except FileExistsError :
        pass
    except :
        raise

    ! $SCITE_PATH/mt-SCITE/scite -i $PMAT_PATH/$pmat\.csv -n $n -m $n_cells -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o $OUTPUT/$iter_directory/$cells_w_ni/$pmat 1> $OUTPUT/stdout/$pmat\.stdout.txt

    