# Running mtSCITE 

In [8]:
import pandas as pd 
import numpy as np 
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime


In [9]:
# Create directory if it does not exist
def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) if it does not exist.

    Parameters
    ----------
    directory_path : str or path-like
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``directory_path`` already exists but is not a directory.
    """
    # exist_ok=True makes the call a no-op for an existing directory and avoids
    # the race between a separate "exists?" check and the creation itself.
    os.makedirs(directory_path, exist_ok=True)

In [10]:
# Donors processed by this notebook; 'JH' and 'YFV2003' are currently switched off.
donors = [
    'A1_06',
    'A1_30',
    # 'JH',
    # 'YFV2003',
]

In [11]:
# Metadata table shared by all donors; the first CSV column becomes the index.
# Later cells use the index as sample names and read the 'Donor' and 'clone_id' columns.
meta_data = pd.read_csv(
    '/home/carler/lab/t_cell/meta_data/2023_04_06_A1_06_A1_30_YFV2003_JH_meta_data_invivo_t_cell_mito_project.csv',
    index_col=0,
)


In [12]:
# Get current date
today = datetime.date.today()

# Format the date as "YYYY_M_D" (month and day without zero padding).
# The string is built from the date fields because the "%-m" / "%-d" strftime
# flags are a platform-specific extension and are rejected on some systems.
date_string = f"{today.year}_{today.month}_{today.day}"


# Folder holding the mt-SCITE checkout (the `scite` executable is called from here).
SCITE_PATH = '/home/carler/lab/t_cell/phylogeny/mt-SCITE/'
# Folder with the per-donor input matrices and their sample metadata.
INPUT_PATH = '/home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_pmat_clust_mutations_0/'
# Results go to a folder stamped with today's date, so each day gets its own run folder.
OUTPUT_PATH = f'/home/carler/lab/t_cell/phylogeny/data/output/P20407/{date_string}_mt-scite_2_pos_filter/'
create_directory(OUTPUT_PATH)


In [13]:
# Load, per donor, the sample metadata that accompanies the input matrix.
meta_data_mt_SCITE = {}
for donor in donors:
    samples_meta = pd.read_csv(INPUT_PATH + '0.0556_' + donor + '_samples_meta_data.csv', index_col=0)
    # Keep only the part of each column label before the first '.'
    # (presumably strips suffixes appended to the sample names — TODO confirm).
    samples_meta.columns = [label.split('.')[0] for label in samples_meta.columns]
    meta_data_mt_SCITE[donor] = samples_meta

In [14]:
# Load, per donor, the whitespace-separated matrix (no header row in the file) and
# label it with the columns and index of the matching sample-metadata table.
mcmt_data = {}
for donor in donors:
    mcmt_data[donor] = pd.read_csv(
        INPUT_PATH + '0.0556_' + donor + '.csv',
        # Raw string: '\s' in a plain literal is an invalid escape sequence and
        # triggers a warning on recent Python versions. The pattern itself is unchanged.
        sep=r'\s',
        names=meta_data_mt_SCITE[donor].columns,
        dtype=np.float64,
        engine='python',  # regex separators need the python parsing engine
    )
    mcmt_data[donor].index = meta_data_mt_SCITE[donor].index

## Check to see if any mutations are present in a large number of cells 

Flag positions whose mutation likelihood is over 10% in at least 20% of the cells.

In [15]:
# For every donor, collect the rows (positions) whose value exceeds 0.1 in at
# least 20% of the columns (cells). One Index per donor, in the order of `donors`.
all_positions = []

for donor in donors:
    donor_matrix = mcmt_data[donor]
    frac_cells_above = (donor_matrix > 0.1).sum(axis=1) / len(donor_matrix.columns)
    positions = donor_matrix[frac_cells_above >= 0.2].index
    all_positions.append(positions)

In [16]:
# Report one line per donor. `all_positions` is filled in the same order as
# `donors`, so zipping them keeps the report correct whatever the length of the
# donor list (the previous version hard-coded exactly two entries).
print(' \n'.join(
    f'Number of positions for {donor_name}: {len(donor_positions)}'
    for donor_name, donor_positions in zip(donors, all_positions)
))

Number of positions for A1_06: 0 
Number of positions for A1_30: 0


# Generate mtSCITE commands 

This generates one mtSCITE command for each clone that meets the criteria, in this case clones with at least 10 cells.

In [17]:
# Build one mtSCITE command (and its input files) per clone that is large enough
# and has at least one retained position.
scite_command_total = []
i = 0  # clones with exactly one retained position (padded with a dummy row)
j = 0  # clones with no retained position (skipped, no command generated)
k = 0  # clones with more than one retained position
for donor in donors:
    # Filter the metadata for this donor once instead of re-filtering it for every clone.
    donor_meta = meta_data.loc[meta_data.Donor == donor]
    donor_matrix = mcmt_data[donor]

    # Keep clones with at least 10 cells whose id does not have '-' as second character.
    clone_sizes = donor_meta.groupby('clone_id')['Donor'].count()
    idx_clones = clone_sizes.loc[(clone_sizes >= 10) & (clone_sizes.index.str[1] != '-')].index

    for clone in idx_clones:
        # Columns of the donor matrix that belong to the cells of this clone.
        sample_names = donor_meta.loc[donor_meta['clone_id'] == clone].index
        mcmt_data_current_clone = donor_matrix[np.intersect1d(donor_matrix.columns, sample_names)]

        # Retain positions where more than one cell has a value above 0.5.
        pos_with_mut = mcmt_data_current_clone[(mcmt_data_current_clone > 0.5).sum(axis=1) > 1].index
        mcmt_data_current_clone = mcmt_data_current_clone.loc[pos_with_mut, :]

        if pos_with_mut.size == 0:
            j += 1
            continue
        if pos_with_mut.size == 1:
            i += 1
            # Pad single-position clones with a constant 0.99 row so the matrix has two rows
            # (presumably because mt-SCITE cannot run on a single position — TODO confirm).
            new_row = pd.DataFrame([[0.99] * len(mcmt_data_current_clone.columns)], columns=mcmt_data_current_clone.columns, index=[0])
            mcmt_data_current_clone = pd.concat([mcmt_data_current_clone, new_row], axis=0)
        else:
            k += 1

        # Build each path once so the files written and the command cannot drift apart.
        clone_dir = f'{OUTPUT_PATH}{donor}/0.0556_{clone}/'
        matrix_dir = f'{OUTPUT_PATH}{donor}_matrix_output_clonal/'
        matrix_file = f'{matrix_dir}0.0556_{clone}.csv'
        output_prefix = f'{clone_dir}0.0556_{clone}'

        create_directory(clone_dir)
        #create_directory(f'{OUTPUT_PATH}{donor}/stdout/')
        create_directory(matrix_dir)

        # Bare matrix (no header, no index, space separated) is the mtSCITE input;
        # the labelled copy next to the results keeps the position and sample names.
        mcmt_data_current_clone.to_csv(matrix_file, sep=' ', index=False, header=False)
        mcmt_data_current_clone.to_csv(f'{clone_dir}0.0556_meta_data_{clone}.csv', sep=',')

        # -n receives the number of rows and -m the number of columns of the matrix;
        # the command's stdout is redirected to a text file next to the results.
        scite_command = f'{SCITE_PATH}scite -i  {matrix_file} -n {mcmt_data_current_clone.index.size} -m {mcmt_data_current_clone.columns.size} -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o {output_prefix} 1> {output_prefix}.stdout.txt'
        #print(f"Running SCITE for {donor} clone {clone} with {mcmt_data_current_clone.index.size} number of positions and {mcmt_data_current_clone.columns.size} samples \n specific posistions {mcmt_data_current_clone.index} \n command: {scite_command} ")
        scite_command_total.append(scite_command)



In [18]:
# Summary of how the clones were sorted by the command-generation loop.
clone_summary = [
    f'Number of clones that have no mutations: {j}',
    f'Number of clones that have one mutation: {i}',
    f'Number of clones that have more than one mutation: {k}',
    f'Number of clones that are used for mtSCITE: {len(scite_command_total)}',
]
print('\n'.join(clone_summary))


Number of clones that have no mutations: 10
Number of clones that have one mutation: 16
Number of clones that have more than one mutation: 90
Number of clones that are used for mtSCITE: 106


In [20]:
# Display the full list of generated mtSCITE commands for a visual check.
scite_command_total

['/home/carler/lab/t_cell/phylogeny/mt-SCITE/scite -i  /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt-scite_2_pos_filter/A1_06_matrix_output_clonal/0.0556_A0.csv -n 4 -m 11 -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt-scite_2_pos_filter/A1_06/0.0556_A0/0.0556_A0 1> /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt-scite_2_pos_filter/A1_06/0.0556_A0/0.0556_A0.stdout.txt',
 '/home/carler/lab/t_cell/phylogeny/mt-SCITE/scite -i  /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt-scite_2_pos_filter/A1_06_matrix_output_clonal/0.0556_A1.csv -n 2 -m 20 -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt-scite_2_pos_filter/A1_06/0.0556_A1/0.0556_A1 1> /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt-scite_2_pos_filter/A1_06/0.0556_A1/0.0556_A1.stdout.txt',
 '/home/carler/lab/t_cell/phylog

# Run mtSCITE through command line 

In [21]:
#for scite_command in scite_command_total:
#    !{scite_command}

In [22]:
import subprocess

# Number of mtSCITE processes launched together before waiting for the batch.
MAX_PARALLEL_SCITE = 40

failed_commands = []
# The loop variable is deliberately not called `i`: `i` holds the one-mutation
# clone counter from the command-generation cell and must not be overwritten.
for batch_start in range(0, len(scite_command_total), MAX_PARALLEL_SCITE):
    batch = scite_command_total[batch_start:batch_start + MAX_PARALLEL_SCITE]
    # start one batch of processes; shell=True is required because every
    # command redirects its stdout with `1>`
    processes = [subprocess.Popen(cmd, shell=True) for cmd in batch]
    # wait for all processes of the batch to finish and remember the failures
    for cmd, process in zip(batch, processes):
        if process.wait() != 0:
            failed_commands.append(cmd)

# Report failures instead of dropping the exit codes silently; the run itself
# stays best-effort and is not interrupted.
if failed_commands:
    print(f'{len(failed_commands)} mtSCITE command(s) exited with a non-zero status:')
    for cmd in failed_commands:
        print(cmd)

In [17]:
# Dry run: print every generated command and count them without launching anything.
# NOTE: `j` is reused as the command counter (it is displayed in the next cell),
# which overwrites the no-mutation clone counter set by the command-generation cell.
# Iterating over the list directly gives the same output and count as the former
# batches of 40, without overwriting `i` or building an unused `processes` list.
j = 0
for cmd in scite_command_total:
    j += 1
    print(cmd)

/home/carler/lab/t_cell/phylogeny/mt-SCITE/scite -i  /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_mt-scite/A1_06_matrix_output_clonal/0.0556_A0.csv -n 2 -m 15 -r 1 -l 2000000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_mt-scite/A1_06/0.0556_A0/0.0556_A0 1> /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_mt-scite/A1_06/0.0556_A0/0.0556_A0.stdout.txt
/home/carler/lab/t_cell/phylogeny/mt-SCITE/scite -i  /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_mt-scite/A1_06_matrix_output_clonal/0.0556_A1.csv -n 2 -m 24 -r 1 -l 2000000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_mt-scite/A1_06/0.0556_A1/0.0556_A1 1> /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_mt-scite/A1_06/0.0556_A1/0.0556_A1.stdout.txt
/home/carler/lab/t_cell/phylogeny/mt-SCITE/scite -i  /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_25_

In [18]:
# Number of commands counted by the printing loop.
j

81

# Run for a single donor and a single clone

In [17]:
# Donor and clone processed by the single-clone cells below.
# NOTE(review): 'JH' is commented out of `donors`, so `mcmt_data` is only filled for
# 'A1_06' and 'A1_30' — re-enable the donor (or pick a loaded one) before running this section.
donor_to_use = 'JH'
clone_to_use = 26

In [77]:
# `meta_data` is one table for all donors (sample names in the index, with 'Donor'
# and 'clone_id' columns), so the requested donor and clone are selected by row,
# in the same way as in the command-generation loop. Indexing it with the donor
# name, as if it were a dict of per-donor tables, raises a KeyError.
is_selected_clone = (meta_data.Donor == donor_to_use) & (meta_data['clone_id'] == clone_to_use)
sample_names = meta_data.loc[is_selected_clone].index.values
# Columns of the donor matrix that belong to the cells of the selected clone.
mcmt_data_current_clone = mcmt_data[donor_to_use][np.intersect1d(mcmt_data[donor_to_use].columns, sample_names)]

In [82]:
# Make sure the result, stdout and matrix folders for the selected clone exist.
for _needed_dir in (
    f'../../../mt-SCITE_output/{donor_to_use}/0.0556_{clone_to_use}/',
    f'../../../mt-SCITE_output/{donor_to_use}/stdout/',
    f'../../../data/{donor_to_use}_matrix_output_clonal/',
):
    create_directory(_needed_dir)


In [83]:
# Keep only the positions where more than one cell has a value above 0.5 ...
n_cells_above = (mcmt_data_current_clone > 0.5).sum(axis=1)
pos_with_mut = mcmt_data_current_clone[n_cells_above > 1].index
mcmt_data_current_clone = mcmt_data_current_clone.loc[pos_with_mut, :]
# ... and write the reduced matrix space separated, without header or index.
mcmt_data_current_clone.to_csv(
    f'../../../data/{donor_to_use}_matrix_output_clonal/0.0556_{clone_to_use}.csv',
    sep=' ',
    index=False,
    header=False,
)

In [85]:
# Build the mtSCITE command for the selected clone: -n receives the number of rows
# and -m the number of columns of the filtered matrix, and the command's stdout is
# redirected to a text file in the donor's stdout folder.
scite_command = f'../../../mt-SCITE/scite -i  ../../../data/{donor_to_use}_matrix_output_clonal/0.0556_{clone_to_use}.csv -n {mcmt_data_current_clone.index.size} -m {mcmt_data_current_clone.columns.size} -r 1 -l 200000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o ../../../mt-SCITE_output/{donor_to_use}/0.0556_{clone_to_use}/0.0556_{clone_to_use} 1> ../../../mt-SCITE_output/{donor_to_use}/stdout/0.0556.stdout.txt' 
# Print the command so it can be checked before it is run.
print(scite_command)

In [87]:
# Run the command built in the previous cell through the shell (IPython `!` magic).
! {scite_command}
