# Running mtSCITE 

In [1]:
import pandas as pd 
import numpy as np 
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime


In [2]:
# Create directory if it does not exist
def create_directory(directory_path):
    """Create ``directory_path``, including missing parent directories.

    Calling this on a directory that already exists is a no-op.

    Parameters
    ----------
    directory_path : str or path-like
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``directory_path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which left a
    # window where another process could create the directory in between.
    os.makedirs(directory_path, exist_ok=True)

In [3]:
# Donors processed by this notebook; the commented entries are currently disabled.
donors = [
    'A1_06',
    'A1_30',
    # 'JH',
    # 'YFV2003',
]

In [4]:
# Project-wide metadata table; the first CSV column is used as the row index.
meta_data = pd.read_csv(
    '/home/carler/lab/t_cell/meta_data/2023_04_06_A1_06_A1_30_YFV2003_JH_meta_data_invivo_t_cell_mito_project.csv',
    index_col=0,
)


In [5]:
# Get current date
today = datetime.date.today()

# Date stamp "YYYY_M_D" without zero padding (e.g. "2023_4_27").
# Built from the date fields directly: the "%-m" / "%-d" strftime flags are a
# platform-specific extension and are not supported on every system.
date_string = f"{today.year}_{today.month}_{today.day}"


# INPUT_PATH : folder holding the per-donor input matrices and their metadata.
# OUTPUT_PATH: run folder for this notebook, stamped with today's date.
# SCITE_PATH : folder containing the `scite` executable.
INPUT_PATH = '/home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_pmat_clust_mutations_0/'
OUTPUT_PATH = f'/home/carler/lab/t_cell/phylogeny/data/output/P20407/{date_string}_mt_scite_entire_dataset/'
SCITE_PATH = '/home/carler/lab/t_cell/phylogeny/mt-SCITE/'
create_directory(OUTPUT_PATH)


In [6]:
# Per-donor labelled table read from "<INPUT_PATH>0.0556_<donor>_samples_meta_data.csv".
meta_data_mt_SCITE = {}
for donor in donors:
    donor_meta = pd.read_csv(f'{INPUT_PATH}0.0556_{donor}_samples_meta_data.csv', index_col=0)
    # Keep only the part of each column name before the first '.'
    # (presumably strips suffixes pandas adds to duplicated headers; confirm).
    donor_meta.columns = [column_name.split('.')[0] for column_name in donor_meta.columns]
    meta_data_mt_SCITE[donor] = donor_meta

In [7]:
# Per-donor numeric matrix read from the header-less, whitespace-separated
# "<INPUT_PATH>0.0556_<donor>.csv"; row and column labels are taken from the
# matching table in meta_data_mt_SCITE.
mcmt_data = {}
for donor in donors:
    mcmt_data[donor] = pd.read_csv(
        INPUT_PATH + '0.0556_' + donor + '.csv',
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        # A regex separator requires the python parsing engine.
        sep=r'\s',
        names=meta_data_mt_SCITE[donor].columns,
        dtype=np.float64,
        engine='python',
    )
    mcmt_data[donor].index = meta_data_mt_SCITE[donor].index

## Check to see if any mutations are present in a large number of cells 

Over 10% likelihood in at least 20% of cells

In [17]:
# Look up the `clone_id` value stored for row 'A1_06_D14_p1_A14'.
# NOTE(review): in the visible notebook `meta_for_donor` is first assigned in the
# later "Run imputed TCR" cell, so this cell presumably relied on out-of-order
# execution and would fail on a fresh kernel. Confirm, then move or remove it.
meta_for_donor.loc['A1_06_D14_p1_A14', 'clone_id']

'A61'

In [18]:
# Append the rows of `test_df` below the A1_06 matrix.
# NOTE(review): `test_df` is not defined anywhere in the visible notebook, so
# this looks like a leftover scratch cell. Confirm, then remove it or define the input.
test_concat = pd.concat([mcmt_data['A1_06'], test_df])

In [12]:
# For each donor, collect the row labels whose value exceeds 0.1 in at least
# 20% of the columns. One Index per donor, in the order of `donors`.
all_positions = []

for donor in donors:
    donor_matrix = mcmt_data[donor]
    n_columns = len(donor_matrix.columns)
    # Fraction of columns in which each row is above the 0.1 threshold.
    fraction_above = (donor_matrix > 0.1).sum(axis=1) / n_columns
    positions = donor_matrix[fraction_above >= 0.2].index
    all_positions.append(positions)

In [13]:
# One summary line per donor. Iterating over all donors avoids editing
# hard-coded indices whenever the donor list changes; for the current two
# donors the printed text is identical to the previous hard-coded version.
print(' \n'.join(
    f'Number of positions for {donor_name}: {len(donor_positions)}'
    for donor_name, donor_positions in zip(donors, all_positions)
))

Number of positions for A1_06: 0 
Number of positions for A1_30: 0


# Generate mtSCITE commands 

In [38]:
# Value passed to scite's `-l` option; also part of the run-folder name.
n_repetitions = 20_000_000

In [39]:
# Build one mt-SCITE command line per donor (in the order of `donors`) and
# create the run folder that the command writes into.
scite_command_total = []
for donor in donors:
    donor_matrix = mcmt_data[donor]
    run_dir = f'{OUTPUT_PATH}{donor}_{n_repetitions}/'
    create_directory(run_dir)
    # Shared prefix for scite's `-o` output and the redirected stdout file.
    output_prefix = f'{run_dir}0.0556_{donor}'
    scite_command = (
        f'{SCITE_PATH}scite -i  {INPUT_PATH}0.0556_{donor}.csv'
        f' -n {donor_matrix.index.size} -m {donor_matrix.columns.size}'
        f' -r 1 -l {n_repetitions} -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a'
        f' -o {output_prefix} 1> {output_prefix}.stdout.txt'
    )
    scite_command_total.append(scite_command)



In [41]:
# Display the command string built for the second entry of `donors`.
scite_command_total[1]

'/home/carler/lab/t_cell/phylogeny/mt-SCITE/scite -i  /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_pmat_clust_mutations_0/0.0556_A1_30.csv -n 577 -m 1169 -r 1 -l 20000000 -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a -o /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt_scite_entire_dataset/A1_30_20000000/0.0556_A1_30 1> /home/carler/lab/t_cell/phylogeny/data/output/P20407/2023_4_27_mt_scite_entire_dataset/A1_30_20000000/0.0556_A1_30.stdout.txt'

# Run imputed TCR 

In [36]:
# Append one extra row per clone to each donor matrix (0.99 for the columns
# whose `clone_id` in `meta_data` equals that clone, 0 elsewhere), write the
# result to disk, and build the matching mt-SCITE command.
# NOTE(review): this reassigns `n_repetitions` and `scite_command_total` from
# the earlier "Generate mtSCITE commands" section.
n_repetitions = 1000001
scite_command_total = []
RUN_SCREEN = False
for donor in donors:
    clones = meta_data.loc[mcmt_data[donor].columns].clone_id.unique()
    # Start from an explicit float frame of zeros. Building an empty (object
    # dtype) frame and calling .fillna(0) depends on deprecated downcasting and
    # then assigns 0.99 into columns that are not float.
    clone_impute_df = pd.DataFrame(0.0, index=clones, columns=mcmt_data[donor].columns)
    meta_for_donor = meta_data.loc[clone_impute_df.columns]
    for clone in clones:
        clone_members = meta_for_donor.loc[meta_for_donor['clone_id'] == clone].index
        clone_impute_df.loc[clone, clone_members] = 0.99
    donor_imputed_mcmt_df = pd.concat([mcmt_data[donor], clone_impute_df])

    run_dir = f'{OUTPUT_PATH}{donor}_{n_repetitions}/'
    imputed_dir = f'{run_dir}imputed_input'
    create_directory(run_dir)
    create_directory(imputed_dir)
    # Unlabelled, space-separated matrix used as scite's `-i` input ...
    donor_imputed_mcmt_df.to_csv(f'{imputed_dir}/{donor}_imputed_t_cell.csv', sep=' ', index=False, header=False)
    # ... and the same matrix with its row/column labels kept.
    donor_imputed_mcmt_df.to_csv(f'{imputed_dir}/{donor}_imputed_t_cell_meta_data.csv')

    # Shared prefix for scite's `-o` output and the redirected stdout file.
    output_prefix = f'{run_dir}0.0556_{donor}'
    scite_command = (
        f'{SCITE_PATH}scite -i  {imputed_dir}/{donor}_imputed_t_cell.csv'
        f' -n {donor_imputed_mcmt_df.index.size} -m {donor_imputed_mcmt_df.columns.size}'
        f' -r 1 -l {n_repetitions} -fd 0.0001 -ad 0.0001 -cc 0.0 -s -a'
        f' -o {output_prefix} 1> {output_prefix}.stdout.txt'
    )
    scite_command_total.append(scite_command)

In [40]:
import subprocess

import time


# Start one detached `screen` session per donor and type that donor's command
# into it. The length check is a guard against launching many jobs by accident.
if len(scite_command_total) < 10:
    for i, donor in enumerate(donors):
        session_name = f"{donor}_mt-scite_{n_repetitions}"
        # Argument lists (no shell=True) hand each value to screen verbatim, so
        # quotes or other shell metacharacters in a path cannot break the call.
        subprocess.run(["screen", "-S", session_name, "-d", "-m"], check=True)
        # Pause for a second before sending input to the new session.
        time.sleep(1)
        # The trailing newline makes the session execute the typed command; its
        # "1>" redirection is interpreted by the shell running inside screen.
        subprocess.run(
            ["screen", "-S", session_name, "-X", "stuff", f"{scite_command_total[i]}\n"],
            check=True,
        )
else:
    print(f"Make sure you really want to start {len(scite_command_total)} before you run this..")

In [39]:
# NOTE(review): `scite_command` is the last command *string*, so this returns a
# single character of it (hence the recorded output 'h'), using the leftover
# loop index `i`. Presumably `scite_command_total[i]` was intended. Confirm or
# remove this scratch cell.
scite_command[i]

'h'