In [1]:
import csv
import re
import pandas as pd
import collections
import numpy as np
from scipy import stats
from math import trunc
import threading
import multiprocessing as mp
from Bio import SeqIO
import pysam
import scipy
import math
import os
import subprocess
import shutil

## Single cell data analysis

### 1. Split bam file and Call SNP

In [None]:
output_path = '' # The path of output data
sample = ''

# Split the BAM file by cell barcode

barcode_file = pd.read_table(('dnbc4tools_analyze/' + sample + '/output/filter_matrix/barcodes.tsv.gz'),names = ['barcode']) # ouput file by DNBC4tools
Cellranger_barcode_list = list(barcode_file['barcode'].unique())

bam_file = pysam.AlignmentFile(('dnbc4tools_analyze/' + sample + '/output/anno_decon_sorted.bam'), 'rb') # ouput file by DNBC4tools
header = str(bam_file.header)

chromosome_name = 'Plant__ZeawithTE_8'
start_position = 123000000
end_position = 123380000

bam_file_subset = bam_file.fetch(reference=chromosome_name, start=start_position, end=end_position)


CellBarcode_dict = {}

for read in bam_file_subset:
    try:
        if read.has_tag('DB') and read.has_tag('UR'):
            CellBarcode = read.get_tag('DB')
            if CellBarcode in Cellranger_barcode_list:
                if CellBarcode in CellBarcode_dict:
                    CellBarcode_dict[CellBarcode].append(read)
                else:
                    CellBarcode_dict[CellBarcode] = [read]
            else:
                continue
        else:
            continue
    except:
        continue

cellbarcode_list = []
for cellbarcode, reads in CellBarcode_dict.items():
    cellbarcode_list.append(cellbarcode)
    with open(('output_path' + sample + '/' + f"{cellbarcode}.bam"), "w") as file:
        file.write(header)
        for read in reads:
            file.write(f"{read.tostring()}\n")

print(len(cellbarcode_list))

In [None]:
# Generate mpileup commands for each split BAM file

input_path = '' # The input path of the file containing splited bam files
output_path = '' # The path of output data
sample = ''

input_folder = (input_path + sample)
output_folder = (output_path + sample)

input_files = [f for f in os.listdir(input_folder) if f.endswith('.bam')]

mpileup_command_list = []

for input_file in input_files:
    input_file_path = os.path.join(input_folder, input_file)
    output_file = input_file.replace(".bam", ".mpileup")
    output_file_path = os.path.join(output_folder, output_file)
    
    mpileup_command = f"samtools mpileup --max-depth 0 --output-BP --output-QNAME --reference genome.fa {input_file_path} -o {output_file_path}"
    mpileup_command_list.append(mpileup_command)

with open((output_folder + sample + '_mpileup_command_list.txt'), "w") as file:
    for mpileup_command in mpileup_command_list:
        file.write(mpileup_command+'\n')

### 2. Extract mutation information from mpileup files

In [None]:
# Use raw.py to extract mutation information from the mpileup files and associate mutation data with cell barcodes and UMIs from the BAM file
# Batch copy and modify raw.py, then output Python commands

path = ''
sample = ''

input_folder = (path + sample) # path of the splited bam files
input_files = [f for f in os.listdir(input_folder) if f.endswith('.bam')]
replacement_strings = [filename.replace(".bam", "") for filename in input_files]

source_script = 'raw.py'

python_command_list = []

for replace_with in replacement_strings:
    filename = (path + sample + '/bin/' + f'{replace_with}.py')
    shutil.copyfile(source_script, filename)
    with open((filename), 'r') as file:
        file_content = file.read()
        file_content = file_content.replace('Barcode_Title', replace_with)
    with open(filename, 'w') as file:
        file.write(file_content)

    python_command = f"python {filename}"
    python_command_list.append(python_command)

print(len(python_command_list))

with open((path + sample + '_python_command_list.txt'), "w") as file:
    for python_command in python_command_list:
        file.write('"'+python_command+'"'+'\n')


In [None]:
# Merge mutation information

path = ''
sample = ''

MutReads_folder = (path + sample) # path of the splited _MutReads.txt files
MutReads_files = [f for f in os.listdir(MutReads_folder) if f.endswith('_MutReads.txt')]
print(len(MutReads_files))

MutReads_locals_list = []

for CallSNP_file in MutReads_files:
    file_path = os.path.join(MutReads_folder, CallSNP_file)
    MutReads_locals_list.append(('MutReads_' + CallSNP_file[:-13]))
    locals()['MutReads_' + CallSNP_file[:-13]] = pd.read_table(file_path)

print(len(MutReads_locals_list))
name_1 = MutReads_locals_list[0]
name_2 = MutReads_locals_list[1]

MutReads_files_merge = pd.concat([locals()[name_1],locals()[name_2]])
MutReads_locals_list.remove(name_1)
MutReads_locals_list.remove(name_2)
print(len(MutReads_locals_list))

for i in MutReads_locals_list:
    MutReads_files_merge = pd.concat([MutReads_files_merge,locals()[i]])

MutReads_files_merge = MutReads_files_merge.reset_index(drop = True).drop('Unnamed: 0', axis = 1)
MutReads_files_merge.to_csv((MutReads_folder + '/1-MutReads.txt'), sep = '\t')

# Merge the '_PosReadsDepth.txt' files with the following command:
# cat $(ls *_PosReadsDepth.txt) | awk 'NR == 1 {print; next} {print}' > 2-PosReadsDepth.txt

### 3. Filter mutation information and merge data from all samples

In [None]:
input_path = ''
output_path = ''
sample = ''

MutReads_files = pd.read_table((input_path + sample + '/1-MutReads.txt'))
MutReads_files['alt'] = MutReads_files['alt'].str.lower()

def contains_letter(s):
    return any(c.isalpha() for c in s)

MutReads_files['alt'] = MutReads_files['alt'].apply(lambda x: x if contains_letter(x) else 0)
MutReads_files['alt'] = MutReads_files['alt'].str.replace('[,.]', '', regex=True).fillna(0)

MutReads_files_chr8 = MutReads_files[MutReads_files['chr'] == 'Plant__ZeawithTE_8'].reset_index(drop = True).drop('Unnamed: 0', axis = 1)
MutReads_files_chr8['sample'] = sample
MutReads_files_chr8['TE'] = ''

TE_Target_List = [123008703,123007909,123007809,123013955,123017262,123016209,123016501,123016401,\
    123215928,123220002,123217967,123216708,123218259,123218159,123365006,123365193,\
        123367110,123369932,123368832,123368621,123368732,123014078,123014178,123365295]

def find_nearest_value(row):
    return min(TE_Target_List, key=lambda x: abs(row['pos'] - x))

MutReads_files_chr8['Target'] = MutReads_files_chr8.apply(find_nearest_value, axis=1)
MutReads_files_chr8['Distance'] = MutReads_files_chr8['pos'] - MutReads_files_chr8['Target']

TE_Target_dict = {123008703:'TE_homo_1053188',123007909:'TE_homo_1053188',123007809:'TE_homo_1053188',123013955:'TE_homo_1053200',123017262:'TE_homo_1053202',\
    123016209:'TE_homo_1053202',123016501:'TE_homo_1053202',123016401:'TE_homo_1053202',123215928:'TE_homo_1053283',123220002:'TE_homo_1053284',\
        123217967:'TE_homo_1053284',123216708:'TE_homo_1053284',123218259:'TE_homo_1053284',123218159:'TE_homo_1053284',123365006:'TE_homo_1053355',\
            123365193:'TE_homo_1053355',123367110:'TE_homo_1053358',123369932:'TE_homo_1053359',123368832:'TE_homo_1053359',123368621:'TE_homo_1053359',\
                123368732:'TE_homo_1053359',123014078:'TE_homo_1053200',123014178:'TE_homo_1053200',123365295:'TE_homo_1053355'}

MutReads_files_chr8['TE'] = MutReads_files_chr8['Target'].apply(lambda x: TE_Target_dict[x])

MutReads_files_chr8.to_csv((output_path + sample + '_Mut_Overview.txt'), sep = '\t')

MutReads_files_chr8_1000 = MutReads_files_chr8[(MutReads_files_chr8['Distance'] >= -1000) & (MutReads_files_chr8['Distance'] <= 1000)].reset_index(drop = True)
MutReads_files_chr8_1000.to_csv((output_path + sample + '_Mut_Overview_distance1000.txt'), sep = '\t')

MutReads_files_chr8_1000['CB_UMI'] = MutReads_files_chr8_1000['CellBarcode'] + '_' + MutReads_files_chr8_1000['UMI']
MutReads_files_chr8_1000['pos_UMI'] = MutReads_files_chr8_1000['pos'].apply(str) + '_' + MutReads_files_chr8_1000['UMI']
MutReads_files_chr8_1000['pos_UMI_mut'] = MutReads_files_chr8_1000['pos'].apply(str) + '_' + \
    MutReads_files_chr8_1000['UMI'] + '_' +  MutReads_files_chr8_1000['ref'] + '_' +  MutReads_files_chr8_1000['alt']

CB_UMI_count = MutReads_files_chr8_1000.drop_duplicates('CB_UMI').value_counts('CellBarcode').reset_index()
CB_UMI_count.columns = ['CellBarcode','CB_UMI_count']

pos_UMI_count = MutReads_files_chr8_1000.value_counts('pos_UMI').reset_index()
pos_UMI_count.columns = ['pos_UMI','pos_UMI_count']

pos_MutUMI_count = MutReads_files_chr8_1000[MutReads_files_chr8_1000['alt'] != '0'].value_counts('pos_UMI_mut').reset_index()
pos_MutUMI_count.columns = ['pos_UMI_mut','pos_MutUMI_count']

MutReads_files_chr8_1000 = pd.merge(MutReads_files_chr8_1000,CB_UMI_count,on = 'CellBarcode',how = 'outer')
MutReads_files_chr8_1000 = pd.merge(MutReads_files_chr8_1000,pos_UMI_count,on = 'pos_UMI',how = 'outer')
MutReads_files_chr8_1000 = pd.merge(MutReads_files_chr8_1000,pos_MutUMI_count,on = 'pos_UMI_mut',how = 'outer')

MutReads_files_chr8_1000['pos_MutUMI_freq'] = MutReads_files_chr8_1000['pos_MutUMI_count'] / MutReads_files_chr8_1000['pos_UMI_count']
MutReads_files_chr8_1000 = MutReads_files_chr8_1000.fillna(0)[MutReads_files_chr8_1000['alt'] != '0']

MutReads_files_chr8_1000['pos_CB_UMI_mut'] = MutReads_files_chr8_1000['pos'].apply(str) + '_' + MutReads_files_chr8_1000['CellBarcode'] + '_' + \
    MutReads_files_chr8_1000['UMI'] + '_' +  MutReads_files_chr8_1000['ref'] + '_' +  MutReads_files_chr8_1000['alt']

MutReads_files_chr8_1000.drop('Unnamed: 0', axis = 1).reset_index(drop = True).to_csv((output_path + sample + '_Mut_Overview_distance1000_filter.txt'), sep = '\t')
MutReads_files_chr8_1000.drop('Unnamed: 0', axis = 1).reset_index(drop = True)

In [None]:
path = '' # The path of the _Mut_Overview_distance1000_filter.txt
sample1 = ''
sample2 = ''
sample3 = ''
sample4 = ''

Filter_sample1 = pd.read_table(path + sample1 + '_Mut_Overview_distance1000_filter.txt')
Filter_sample2 = pd.read_table(path + sample2 + '_Mut_Overview_distance1000_filter.txt')
Filter_sample3 = pd.read_table(path + sample3 + '_Mut_Overview_distance1000_filter.txt')
Filter_sample4 = pd.read_table(path + sample4 + '_Mut_Overview_distance1000_filter.txt')

Filter_file_concat = pd.concat([Filter_sample1,Filter_sample2,Filter_sample3,sample4]).reset_index(drop = True).drop('Unnamed: 0', axis = 1)
Filter_file_concat.to_csv('Mut_Overview_distance1000_filter.txt', sep = '\t')
Filter_file_concat

In [None]:
path = ''
sample1 = ''
sample2 = ''
sample3 = ''
sample4 = ''

datAllReads_1 = pd.read_table((path + sample1 + '2-PosReadsDepth.txt'), usecols = ['pos','depth','CellBarcode'])
datAllReads_1['sample'] = sample1
datAllReads_1 = datAllReads_1[datAllReads_1['depth'] != 0]
datAllReads_2 = pd.read_table((path + sample2 + '2-PosReadsDepth.txt'), usecols = ['pos','depth','CellBarcode'])
datAllReads_2['sample'] = sample2
datAllReads_2 = datAllReads_2[datAllReads_2['depth'] != 0]
datAllReads_3 = pd.read_table((path + sample3 + '2-PosReadsDepth.txt'), usecols = ['pos','depth','CellBarcode'])
datAllReads_3['sample'] = sample3
datAllReads_3 = datAllReads_3[datAllReads_3['depth'] != 0]
datAllReads_4 = pd.read_table((path + sample4 + '2-PosReadsDepth.txt'), usecols = ['pos','depth','CellBarcode'])
datAllReads_4['sample'] = sample4
datAllReads_4 = datAllReads_4[datAllReads_4['depth'] != 0]

datAllReads = pd.concat([datAllReads_1,datAllReads_2,datAllReads_3,datAllReads_4]).reset_index(drop = True)
datAllReads.to_csv('PosReadsDepth.txt', sep = '\t')
datAllReads

## Mutation frequency

### 1. Merge mutation data with clusters information

In [3]:
UMAP_leaf = pd.read_table('plt_maize_leaf.txt', usecols = ['seurat_clusters','Row.names','V1','V2','sample']) # output by SingleCell_data.R
UMAP_root = pd.read_table('plt_maize_root.txt', usecols = ['seurat_clusters','Row.names','V1','V2','sample']) # output by SingleCell_data.R

UMAP = pd.concat([UMAP_leaf,UMAP_root]).reset_index(drop = True)
UMAP['CellBarcode'] = UMAP['Row.names'].str.split('_').apply(lambda x: '_'.join(x[0:2]))
UMAP.to_csv(path + '/plt_maize.txt', sep = '\t')

datMut = pd.read_table('Mut_Overview_distance1000_filter.txt', usecols = ['pos','ref','alt','CellBarcode','sample','TE','pos_MutUMI_freq','Distance'])
datMut = datMut[datMut['pos_MutUMI_freq'] == 1].reset_index(drop = True)

datMut = pd.merge(datMut,UMAP[['seurat_clusters','sample','CellBarcode']],on = ['sample','CellBarcode'])
datMut.to_csv('Mut_Overview_distance1000_filter.txt', sep = '\t')
datMut

Unnamed: 0,pos,ref,alt,CellBarcode,sample,TE,Distance,pos_MutUMI_freq,seurat_clusters
0,123013469,A,g,CELL74730_N1,9a-3,TE_homo_1053200,-486,1.0,5
1,123013490,C,t,CELL74730_N1,9a-3,TE_homo_1053200,-465,1.0,5
2,123013516,C,t,CELL74730_N1,9a-3,TE_homo_1053200,-439,1.0,5
3,123013555,A,t,CELL74730_N1,9a-3,TE_homo_1053200,-400,1.0,5
4,123013558,G,c,CELL74730_N1,9a-3,TE_homo_1053200,-397,1.0,5
...,...,...,...,...,...,...,...,...,...
154379,123370618,A,c,CELL70298_N1,12a,TE_homo_1053359,686,1.0,4
154380,123370622,C,t,CELL70298_N1,12a,TE_homo_1053359,690,1.0,4
154381,123370657,A,g,CELL70298_N1,12a,TE_homo_1053359,725,1.0,4
154382,123370660,C,t,CELL70298_N1,12a,TE_homo_1053359,728,1.0,4


### 2. # of cells in which each mutation appears

In [4]:
datMut = pd.read_table('Mut_Overview_distance1000_filter.txt', usecols = ['pos','ref','alt','CellBarcode','sample','TE'])
datMut['ref'] = datMut['ref'].str.upper()
datMut['alt'] = datMut['alt'].str.upper()

datMut['mut_info'] = datMut['pos'].astype(str) + '_' + datMut['ref'] + '_' + datMut['alt']
datMut['mut_info_2'] = datMut['sample'] + '_' + datMut['CellBarcode'] + '_' + datMut['pos'].astype(str) + '_' + datMut['ref'] + '_' + datMut['alt']
print(len(datMut['mut_info'].unique()))
datMut = datMut.drop_duplicates('mut_info_2').reset_index(drop = True)

MutCellCount = datMut.value_counts(['sample','mut_info']).reset_index().sort_values(['sample',0]).reset_index(drop = True)
MutCellCount.columns = ['sample','mut','CellCount']
MutCellCount['mut_type'] = ''
for i in range(len(MutCellCount['mut'])):
    if '+' in MutCellCount['mut'][i] or '-' in MutCellCount['mut'][i]:
        MutCellCount['mut_type'][i] = 'indel'
    else:
        MutCellCount['mut_type'][i] = 'SNP'

MutCellCount.to_csv('SampleMut_CellCount.txt', sep = '\t')
MutCellCount

18097


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellCount['mut_type'][i] = 'SNP'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellCount['mut_type'][i] = 'indel'


Unnamed: 0,sample,mut,CellCount,mut_type
0,12a,123006872_A_G,1,SNP
1,12a,123365314_A_C,1,SNP
2,12a,123365315_T_C,1,SNP
3,12a,123365316_T_A,1,SNP
4,12a,123365317_T_C,1,SNP
...,...,...,...,...
22284,9a-root,123364730_C_A,183,SNP
22285,9a-root,123215286_G_A,241,SNP
22286,9a-root,123220165_C_T,242,SNP
22287,9a-root,123215785_A_G,325,SNP


### 3. Mutation frequency

In [11]:
MutCellCount = pd.read_table('SampleMut_CellCount.txt')

datMut = pd.read_table('Mut_Overview_distance1000_filter.txt', usecols = ['pos','ref','alt','CellBarcode','sample','TE'])

pos_list = list(datMut['pos'].unique())
pos_list = list(map(str, pos_list))

datMut['mut_info'] = datMut['pos'].astype(str) + '_' + datMut['ref'] + '_' + datMut['alt']
datMut['mut_info_2'] = datMut['sample'] + '_' + datMut['CellBarcode'] + '_' + datMut['pos'].astype(str) + '_' + datMut['ref'] + '_' + datMut['alt']
datMut = datMut.drop_duplicates('mut_info_2').reset_index(drop = True)

UMAP = pd.read_table('plt_maize.txt', usecols = ['seurat_clusters','sample','CellBarcode'])
UMAP['seurat_clusters'] = UMAP['seurat_clusters'].astype(str)

datMut = pd.merge(datMut,UMAP,on = ['sample','CellBarcode'])

sample_list = list(datMut['sample'].unique())
cluster_list = list(datMut['seurat_clusters'].unique())
mut_info_list = list(datMut['mut_info'].unique())
TE_list = list(datMut['TE'].unique())

print(sample_list)
print(cluster_list)
print(mut_info_list)
print(TE_list)

datAllReads = pd.read_table('PosReadsDepth.txt', usecols = ['sample','pos','depth','CellBarcode'])
datAllReads['pos'] = datAllReads['pos'].astype(str)
datAllReads = datAllReads[datAllReads['pos'].isin(pos_list)]
datAllReads = pd.merge(datAllReads,UMAP,on = ['sample','CellBarcode'])

MutCellFreq = pd.DataFrame(columns=['sample', 'TE', 'seurat_clusters', 'mut','mut_count','total_count'])

for m in sample_list:
    print(m)
    for n in TE_list:
        SampleAndTE = datMut[(datMut['sample'] == m) & (datMut['TE'] == n)]
        datAllReads_sample = datAllReads[(datAllReads['sample'] == m)]
        print(n)
        for l in cluster_list:
            if l in SampleAndTE['seurat_clusters'].unique():
                #print(l)
                SampleAndTEAndType = SampleAndTE[SampleAndTE['seurat_clusters'] == l]
                datAllReads_sampleAndType = datAllReads_sample[(datAllReads_sample['seurat_clusters'] == l)]
                for o in mut_info_list:
                    if o in SampleAndTEAndType['mut_info'].unique():
                        #print(o)
                        SampleAndTE_subset = SampleAndTEAndType[SampleAndTE['pos'] == int(o.split('_')[0])]
                        mut_info_count = len(SampleAndTE_subset[SampleAndTE_subset['mut_info'] == o])
                        total_count = len(datAllReads_sampleAndType[datAllReads_sampleAndType['pos'] == o.split('_')[0]]['CellBarcode'].unique())
                        MutCellFreq = MutCellFreq.append({'sample':m,'TE':n,'seurat_clusters':l,'mut':o,\
                            'mut_count':mut_info_count,'total_count':total_count}, ignore_index=True)
                    else:
                        continue
            else:
                continue

MutCellFreq['mut'] = MutCellFreq['mut'].str.upper()
MutCellFreq = pd.merge(MutCellFreq,MutCellCount,on = ['sample','mut']).drop('Unnamed: 0', axis = 1).sort_values(['sample']).reset_index(drop = True)

MutCellFreq['mut_type'] = ''

for i in range(len(MutCellFreq['mut'])):
    if '+' in MutCellFreq['mut'][i] or '-' in MutCellFreq['mut'][i]:
        MutCellFreq['mut_type'][i] = 'indel'
    else:
        MutCellFreq['mut_type'][i] = 'SNP'

MutCellFreq.to_csv('SampleClusterTEMut_CellFreq_Overview.txt', sep = '\t')
MutCellFreq

['9a-3', '9a-4', '9a-root', '12a']
['5', '11', '7', '6', '1', '10', '3', '9', '2', '12', '8', '13', '4', '16', '0', '14', '15']
['123013469_A_g', '123013490_C_t', '123013516_C_t', '123013555_A_t', '123013558_G_c', '123370522_C_a', '123370530_C_a', '123370534_C_t', '123370536_A_c', '123370568_A_c', '123370657_A_g', '123370660_C_t', '123370703_C_a', '123370073_A_g', '123370075_G_t', '123215237_T_c', '123215286_G_a', '123215301_G_a', '123364903_T_c', '123220165_C_t', '123220176_C_g', '123220203_C_t', '123215250_A_g', '123215251_A_c', '123215252_A_t', '123219725_T_c', '123215314_C_a', '123215315_G_c', '123215319_A_g', '123216353_T_a', '123364730_C_a', '123364884_A_g', '123364885_C_t', '123366009_A_g', '123366039_C_g', '123366043_T_g', '123364753_C_a', '123364765_A_g', '123364769_T_g', '123018199_G_t', '123016544_C_t', '123016585_C_t', '123016802_T_+1t', '123017264_C_-1a', '123017322_A_t', '123364446_A_c', '123364448_T_c', '123364451_G_t', '123364452_A_c', '123364454_C_g', '123364455_A_c', 

  exec(code_obj, self.user_global_ns, self.user_ns)


9a-3
TE_homo_1053200
5


  SampleAndTE_subset = SampleAndTEAndType[SampleAndTE['pos'] == int(o.split('_')[0])]


11
7
6
1
10
3
9
2
12
8
13
4
0
15
TE_homo_1053359
5
11
7
6
1
10
3
9
2
12
8
13
4
0
TE_homo_1053283
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
15
TE_homo_1053355
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
15
TE_homo_1053284
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
15
TE_homo_1053202
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
15
TE_homo_1053358
5
11
7
6
1
10
3
9
2
8
4
0
14
TE_homo_1053188
5
11
7
6
1
10
3
9
2
12
8
13
4
0
14
9a-4
TE_homo_1053200
5
11
7
6
1
10
3
9
2
12
8
4
14
TE_homo_1053359
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
TE_homo_1053283
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
TE_homo_1053355
5
11
7
6
1
10
3
9
2
8
13
4
16
0
14
15
TE_homo_1053284
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
15
TE_homo_1053202
5
11
7
6
1
10
3
9
2
12
8
13
4
16
0
14
TE_homo_1053358
5
11
7
6
1
10
3
9
2
8
13
4
16
0
TE_homo_1053188
5
11
7
1
10
3
9
2
12
8
13
4
16
9a-root
TE_homo_1053200
5
7
6
1
10
3
9
2
8
4
0
TE_homo_1053359
5
7
6
1
10
3
9
2
8
4
0
TE_homo_1053283
5
7
6
1
10
3
9
2
8
4
0
TE_homo_1053355
5
7
6
1
10
3
9
2
8
4
0
TE_homo_1053284

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellFreq['mut_type'][i] = 'SNP'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellFreq['mut_type'][i] = 'indel'


Unnamed: 0,sample,TE,seurat_clusters,mut,mut_count,total_count,CellCount,mut_type
0,12a,TE_homo_1053355,15,123364769_T_G,6,8,211,SNP
1,12a,TE_homo_1053355,6,123365597_G_T,2,219,7,SNP
2,12a,TE_homo_1053355,10,123365597_G_T,1,132,7,SNP
3,12a,TE_homo_1053355,2,123365597_G_T,2,290,7,SNP
4,12a,TE_homo_1053355,8,123365597_G_T,1,251,7,SNP
...,...,...,...,...,...,...,...,...
41528,9a-root,TE_homo_1053359,4,123370502_C_A,1,13,1,SNP
41529,9a-root,TE_homo_1053359,4,123370494_C_A,1,13,1,SNP
41530,9a-root,TE_homo_1053359,4,123370471_C_G,1,9,1,SNP
41531,9a-root,TE_homo_1053359,4,123369556_A_+1A,1,20,1,indel


In [11]:
UMAP = pd.read_table('plt_maize.txt', usecols = ['seurat_clusters','sample','CellBarcode'])
UMAP['seurat_clusters'] = UMAP['seurat_clusters'].astype(str)

sample_cluster_Celltype = UMAP.copy()	
sample_cluster_Celltype['sample_cluster'] = sample_cluster_Celltype['sample'] + '_' + sample_cluster_Celltype['seurat_clusters']
sample_cluster_Celltype = sample_cluster_Celltype.drop_duplicates('sample_cluster').reset_index(drop = True).drop(['CellBarcode','sample_cluster'], axis = 1)
sample_cluster_Celltype['cell_type'] = ''
sample_cluster_Celltype['layer'] = ''

for i in range(len(sample_cluster_Celltype['cell_type'])):
    if sample_cluster_Celltype['sample'][i] != 'seedling2_root':
        if sample_cluster_Celltype['seurat_clusters'][i] == '0':
            sample_cluster_Celltype['cell_type'][i] = 'mesophyll'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '1':
            sample_cluster_Celltype['cell_type'][i] = 'mesophyll'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '2':
            sample_cluster_Celltype['cell_type'][i] = 'mesophyll'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '3':
            sample_cluster_Celltype['cell_type'][i] = 'epidermal'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '4':
            sample_cluster_Celltype['cell_type'][i] = 'mesophyll'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '5':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '6':
            sample_cluster_Celltype['cell_type'][i] = 'epidermal'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '7':
            sample_cluster_Celltype['cell_type'][i] = 'epidermal'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '8':
            sample_cluster_Celltype['cell_type'][i] = 'epidermal'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '9':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '10':
            sample_cluster_Celltype['cell_type'][i] = 'epidermal'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '11':
            sample_cluster_Celltype['cell_type'][i] = 'epidermal'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '12':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '13':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '14':
            sample_cluster_Celltype['cell_type'][i] = 'mesophyll'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '15':
            sample_cluster_Celltype['cell_type'][i] = 'mesophyll'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '16':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
    if sample_cluster_Celltype['sample'][i] == 'seedling2_root':
        if sample_cluster_Celltype['seurat_clusters'][i] == '0':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '1':
            sample_cluster_Celltype['cell_type'][i] = 'cortex'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '2':
            sample_cluster_Celltype['cell_type'][i] = 'root_cap'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '3':
            sample_cluster_Celltype['cell_type'][i] = 'cortex'
            sample_cluster_Celltype['layer'][i] = 'L2'
        if sample_cluster_Celltype['seurat_clusters'][i] == '4':
            sample_cluster_Celltype['cell_type'][i] = 'epidermis'
            sample_cluster_Celltype['layer'][i] = 'L1'
        if sample_cluster_Celltype['seurat_clusters'][i] == '5':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '6':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '7':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '8':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '9':
            sample_cluster_Celltype['cell_type'][i] = 'vascular'
            sample_cluster_Celltype['layer'][i] = 'L3'
        if sample_cluster_Celltype['seurat_clusters'][i] == '10':
            sample_cluster_Celltype['cell_type'][i] = 'epidermis'
            sample_cluster_Celltype['layer'][i] = 'L1'

sample_cluster_Celltype.to_csv('Clusters_CellType.txt', sep = '\t')

MutCellFreq = pd.read_table('SampleClusterTEMut_CellFreq_Overview.txt')
MutCellFreq = MutCellFreq[((MutCellFreq['mut_type'] == 'indel')&(MutCellFreq['CellCount'] > 1))|\
    ((MutCellFreq['mut_type'] == 'SNP')&(MutCellFreq['CellCount'] >= 5))].reset_index(drop = True)


MutCellFreq = MutCellFreq.reset_index(drop = True).drop('Unnamed: 0', axis = 1)
MutCellFreq['seurat_clusters'] = MutCellFreq['seurat_clusters'].astype(str)

MutCellFreq = pd.merge(MutCellFreq, sample_cluster_Celltype, on = ['sample','seurat_clusters'])

MutCellFreq['TE_mut'] = MutCellFreq['TE'] + '_' + MutCellFreq['mut']
MutCellFreq['sample_CellType'] = MutCellFreq['sample'] + '_' + MutCellFreq['cell_type']
MutCellFreq['sample_CellType_TE_mut'] = MutCellFreq['sample_CellType'] + '_' + MutCellFreq['TE_mut']

sample_CellType_TE_mut_list = list(MutCellFreq['sample_CellType_TE_mut'].unique())
len(sample_CellType_TE_mut_list)

MutCellFreq_CellType = pd.DataFrame(columns=['sample','cell_type','TE','mut','mut_count','total_count','freq'])

for m in sample_CellType_TE_mut_list:
    MutCellFreq_subset = MutCellFreq[MutCellFreq['sample_CellType_TE_mut'] == m].reset_index(drop = True)
    sample = MutCellFreq_subset['sample'][0]
    cell_type = MutCellFreq_subset['cell_type'][0]
    TE = MutCellFreq_subset['TE'][0]
    mut = MutCellFreq_subset['mut'][0]
    mut_count = sum(MutCellFreq_subset['mut_count'])
    total_count = sum(MutCellFreq_subset['total_count'])
    freq = mut_count / total_count
    MutCellFreq_CellType = MutCellFreq_CellType.append({'sample':sample,'cell_type':cell_type,'TE':TE,'mut':mut,
                                                  'mut_count':mut_count,'total_count':total_count,'freq':freq}, ignore_index=True)

MutCellFreq_CellType.to_csv(('SampleClusterTEMut_CellFreq_Overview_CellTypeMerge_SNP>=5_Indel>1.txt'), sep = '\t')

MutCellFreq_CellType['TE_mut'] = MutCellFreq_CellType['TE'] + '_' + MutCellFreq_CellType['mut']
MutCellFreq_CellType['sample_CellType'] = MutCellFreq_CellType['sample'] + '_' + MutCellFreq_CellType['cell_type']

sample_CellType_list = list(MutCellFreq_CellType['sample_CellType'].unique())
sample_CellType_list_locals = []

for i in sample_CellType_list:
    locals()['MutCellFreq_CellType_' + i] = MutCellFreq_CellType[MutCellFreq_CellType['sample_CellType'] == i][['TE_mut','freq']]
    locals()['MutCellFreq_CellType_' + i].columns = ['mut',i]
    sample_CellType_list_locals.append(('MutCellFreq_CellType_' + i))

name_1 = sample_CellType_list_locals[0]
name_2 = sample_CellType_list_locals[1]

MutCellFreq_CellType_merge = pd.merge(locals()[name_1],locals()[name_2], on = 'mut', how = 'outer')

sample_CellType_list_locals.remove(name_1)
sample_CellType_list_locals.remove(name_2)

for i in sample_CellType_list_locals:
    MutCellFreq_CellType_merge = pd.merge(MutCellFreq_CellType_merge,locals()[i], on = 'mut', how = 'outer')

mut_info_list = list(MutCellFreq_CellType_merge['mut'].unique())
pos_list = [item.split('_')[3] for item in mut_info_list]
pos_list = list(map(str, pos_list))

datAllReads = pd.read_table('PosReadsDepth.txt', usecols = ['sample','pos','depth','CellBarcode'])
datAllReads['pos'] = datAllReads['pos'].astype(str)
datAllReads = datAllReads[datAllReads['pos'].isin(pos_list)].reset_index(drop = True)
datAllReads = pd.merge(datAllReads,UMAP,on = ['sample','CellBarcode'])
datAllReads = pd.merge(datAllReads,sample_cluster_Celltype,on = ['sample','seurat_clusters'])

column_names = MutCellFreq_CellType_merge.columns.tolist()
column_names.remove('mut')
print(column_names)

for m in column_names:
    sample = m.split('_')[0]
    Cell_type = '_'.join(m.split('_')[1:])
    datAllReads_sample_type = datAllReads[(datAllReads['sample'] == sample)&(datAllReads['cell_type'] == Cell_type)]
    print(m + '   ' + sample + '   ' + Cell_type)
    for n in range(len(MutCellFreq_CellType_merge[m])):
        site = MutCellFreq_CellType_merge['mut'][n].split('_')[3]
        if pd.isna(MutCellFreq_CellType_merge[m][n]):
            if site in datAllReads_sample_type['pos'].unique():
                MutCellFreq_CellType_merge[m][n] = 0

MutCellFreq_CellType_merge['mut_type'] = ''

for i in range(len(MutCellFreq_CellType_merge['mut'])):
    if '+' in MutCellFreq_CellType_merge['mut'][i] or '-' in MutCellFreq_CellType_merge['mut'][i]:
        MutCellFreq_CellType_merge['mut_type'][i] = 'indel'
    else:
        MutCellFreq_CellType_merge['mut_type'][i] = 'SNP'

MutCellFreq_CellType_merge = MutCellFreq_CellType_merge.sort_values('mut').reset_index(drop = True)
MutCellFreq_CellType_merge.to_csv('SampleCellTypeTEMut_CellFreq_SNP>=5_Indel>1.txt', sep = '\t', na_rep = 'NA')
MutCellFreq_CellType_merge

  exec(code_obj, self.user_global_ns, self.user_ns)


['12a_vascular', '12a_mesophyll', '12a_epidermal', '9a-3_epidermal', '9a-3_vascular', '9a-3_mesophyll', '9a-4_epidermal', '9a-4_mesophyll', '9a-4_vascular', '9a-root_vascular', '9a-root_cortex', '9a-root_epidermis', '9a-root_root_cap']
12a_vascular   12a   vascular


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellFreq_CellType_merge[m][n] = 0


12a_mesophyll   12a   mesophyll
12a_epidermal   12a   epidermal
9a-3_epidermal   9a-3   epidermal
9a-3_vascular   9a-3   vascular
9a-3_mesophyll   9a-3   mesophyll
9a-4_epidermal   9a-4   epidermal
9a-4_mesophyll   9a-4   mesophyll
9a-4_vascular   9a-4   vascular
9a-root_vascular   9a-root   vascular
9a-root_cortex   9a-root   cortex
9a-root_epidermis   9a-root   epidermis
9a-root_root_cap   9a-root   root_cap


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellFreq_CellType_merge['mut_type'][i] = 'SNP'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MutCellFreq_CellType_merge['mut_type'][i] = 'indel'


Unnamed: 0,mut,12a_vascular,12a_mesophyll,12a_epidermal,9a-3_epidermal,9a-3_vascular,9a-3_mesophyll,9a-4_epidermal,9a-4_mesophyll,9a-4_vascular,9a-root_vascular,9a-root_cortex,9a-root_epidermis,9a-root_root_cap,mut_type
0,TE_homo_1053188_123007180_A_+1A,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.333333,0.5,0.000000,0.000000,0.000000,0.000000,indel
1,TE_homo_1053188_123007272_T_C,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.054545,0.015873,0.034483,0.000000,SNP
2,TE_homo_1053188_123007277_T_C,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.044776,0.015873,0.034483,0.000000,SNP
3,TE_homo_1053188_123007586_A_C,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.027778,0.010870,0.029412,0.000000,SNP
4,TE_homo_1053188_123007638_T_A,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.028571,0.032967,0.000000,0.020833,SNP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1578,TE_homo_1053359_123370825_C_+4AACG,0.00,0.071429,0.038462,0.000000,0.000000,0.00,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,indel
1579,TE_homo_1053359_123370825_C_+4AAGG,0.30,0.260870,0.255102,0.428571,0.333333,0.25,0.200000,1.000000,,0.500000,0.500000,0.000000,0.166667,indel
1580,TE_homo_1053359_123370829_G_+4AAGG,0.50,0.250000,0.184211,0.000000,0.000000,0.00,0.142857,0.000000,,0.100000,0.000000,0.000000,0.166667,indel
1581,TE_homo_1053359_123370830_A_+4AGGA,0.00,0.000000,0.025641,0.000000,0.000000,0.00,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,indel
