In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import scipy.stats
import numpy as np
import os

In [2]:
def _parse_vcf_info(info):
    '''Turn one VCF INFO string ("K1=V1;K2=V2;FLAG") into a dict.'''
    fields = {}
    if not isinstance(info, str):
        # An empty INFO cell is read by pandas as NaN; nothing to parse.
        return fields
    for pair in info.split(';'):
        # Skip the "." missing-value placeholder and empty chunks from a trailing ';'.
        if not pair or pair == '.':
            continue
        # partition() keeps any further '=' inside the value instead of truncating it.
        key, sep, value = pair.partition('=')
        # A key without '=' is a flag; record its presence as True.
        fields[key] = value if sep else True
    return fields


def vcf2dataframe(vcfpath, parse_info=False, parse_annot=False):
    '''
    Read a VCF file into a DataFrame whose columns match the #CHROM header line.

    Parameters
    ----------
    vcfpath : str
        Path to a plain-text VCF file.
    parse_info : bool, default False
        If True, split the INFO column into one extra column per INFO key.
        Values are kept as strings; valueless flag keys are stored as True,
        and keys absent from a record are left as NaN.
    parse_annot : bool, default False
        If True, additionally split the ANN INFO entry (snpEff annotation) into
        the 16 standard sub-field columns and drop the raw ANN column.
        Only the first 15 '|' separators are split, so anything after them
        stays in the last column. Forces parse_info=True.

    Returns
    -------
    pandas.DataFrame
        One row per variant record. An empty frame (header columns only) is
        returned when the file contains no records.

    Raises
    ------
    ValueError
        If the file has no line starting with "#CHROM".
    KeyError
        If parse_annot=True but no record carries an ANN entry.
    '''
    if not parse_info and parse_annot:
        parse_info = True
        print('[WARNING] When parse_annot=True, parse_info will be forced set to True!')

    # Scan only as far as the column header, and make sure the handle is closed.
    vcfdfheader = None
    with open(vcfpath) as handle:
        for line in handle:
            if line.startswith('#CHROM'):
                vcfdfheader = line.lstrip('#').split()
                break
    if vcfdfheader is None:
        raise ValueError('No #CHROM header line found in VCF file: {}'.format(vcfpath))

    vcfdf = pd.read_table(vcfpath, comment='#', header=None, names=vcfdfheader)
    if len(vcfdf) == 0:
        return vcfdf

    if parse_info:
        vcfinfo = pd.DataFrame([_parse_vcf_info(info) for info in vcfdf['INFO']])
        vcfdf = pd.concat([vcfdf, vcfinfo], axis=1)
        if parse_annot: # parse annotation from snpEff
            ann_cols = 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'.split(' | ')
            n_ann = len(ann_cols)
            ann_rows = []
            for ann in vcfdf['ANN']:
                # Records without ANN (NaN) or with fewer sub-fields are padded with None
                # so every row has exactly one value per annotation column.
                parts = ann.split('|', n_ann - 1) if isinstance(ann, str) else []
                ann_rows.append(parts + [None] * (n_ann - len(parts)))
            vcfann = pd.DataFrame(ann_rows, columns=ann_cols)
            vcfdf = pd.concat([vcfdf.drop('ANN', axis=1), vcfann], axis=1)
    return vcfdf

In [17]:
# Build one [vsname, sn, count] row per *.lofreq.vcf file, where count is the
# number of records with 0.05 < AF < 0.5 and DP >= 20 (0 for files with no records).
stat = []
for filepath in tqdm(glob('/jdfssz1/ST_HEALTH/P20Z10200N0206/renzirui/Projects/GIZ_SouthernChinaBat_clean/ViralSpec_Phylo/All_Rhinolophus_virus/8.lofreq_vcfs/*/*.lofreq.vcf')):
    # The parent directory name is the virus name; the file-name prefix is the sample number.
    path_parts = filepath.split('/')
    vsname = path_parts[-2]
    sn = path_parts[-1].split('.')[0]

    vcfdf = vcf2dataframe(filepath, parse_info=True)
    if len(vcfdf) == 0:
        stat.append([vsname, sn, 0])
        continue

    # INFO values come back as strings, so convert once before comparing.
    allele_freq = vcfdf['AF'].astype(float)
    depth = vcfdf['DP'].astype(float)
    vcfdf_isnv = vcfdf[(allele_freq > 0.05) & (allele_freq < 0.5) & (depth >= 20)]
    stat.append([vsname, sn, len(vcfdf_isnv)])

100%|██████████| 381/381 [00:02<00:00, 139.92it/s]


In [29]:
# Tabulate the per-file rows collected in `stat`, one row per (vsname, sn) VCF file.
isnv_stat = pd.DataFrame(
    data=stat,
    columns=['vsname', 'sn', 'num_iSNV'],
)
isnv_stat

Unnamed: 0,vsname,sn,num_iSNV
0,AdV-2,R2109032528,0
1,AdV-2,R2109032536,16
2,AdV-2,R2109032408,0
3,AdV-2,R2109032545,4
4,AdV-2,R2107024465,0
...,...,...,...
376,PicoV-19,R2012010374,2
377,PicoV-19,R2107024456,0
378,PicoV-19,R2107024465,3
379,ParV-6,R2109032397,0


In [26]:
# Load the virus quantification table and derive `vsname` as the part of
# `qseqid` before the first '/', so it can be joined with the iSNV statistics.
requant_info = pd.read_excel(
    '/jdfssz1/ST_HEALTH/P20Z10200N0206/renzirui/Projects/GIZ_SouthernChinaBat_clean/VirusQuantification/GIZ_SouthernChinaBats_VirusQuant_new_rpm0.5_20231027.xlsx'
)
requant_info['vsname'] = requant_info['qseqid'].map(lambda qseqid: qseqid.partition('/')[0])
requant_info

Unnamed: 0,Sample_name,sn,Host_species,Sample_type,Province,City,Site,Longitude,Latitude,Collection_time,...,coverage,meandepth,meanbaseq,meanmapq,qlen,nr_vote_vfamily,nr_sscinames,nr_pident,RPM,vsname
0,HD13634,R2012010288,Rousettus leschenaultii,Anal swab,Guangdong,Huizhou,Site6_HZ,114.801199,23.181273,2013-06-19,...,100.0000,151.59100,35.3,43.5,6149,Astroviridae,Mamastrovirus sp.,74.8,223.243766,AstV-43
1,HD13634,R2012010288,Rousettus leschenaultii,Anal swab,Guangdong,Huizhou,Site6_HZ,114.801199,23.181273,2013-06-19,...,100.0000,59.90700,35.3,43.7,3830,Spinareoviridae,Cangyuan orthoreovirus,99.3,55.441333,SpinV-2
2,HD13635,R2012010289,Rousettus leschenaultii,Anal swab,Guangdong,Huizhou,Site6_HZ,114.801199,23.181273,2013-06-19,...,92.7143,13.48740,35.1,43.8,6149,Astroviridae,Mamastrovirus sp.,74.8,39.360453,AstV-43
3,HD13635,R2012010289,Rousettus leschenaultii,Anal swab,Guangdong,Huizhou,Site6_HZ,114.801199,23.181273,2013-06-19,...,44.1514,2.46919,35.0,43.9,3830,Spinareoviridae,Cangyuan orthoreovirus,99.3,4.514461,SpinV-2
4,HL06,R2012010339,Pipistrellus abramus,Anal swab,Hainan,Qiongzhong,Site12_QZ,109.677432,18.997514,2008-07-03,...,79.5383,98.12580,34.4,43.9,6715,Astroviridae,Mamastrovirus 16,68.2,131.110654,AstV-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592,SZ147,R2109032591,Rhinolophus affinis,Anal swab,Guangdong,Shenzhen,Site8_SZ,114.519444,22.530278,2020-09-21,...,100.0000,58.80090,35.9,41.8,6588,Astroviridae,Bat astrovirus Ha/Guangxi/LS11/2007,78.9,30.742973,AstV-18
593,CX08,R2109032613,Rhinolophus sinicus,Anal swab,Hainan,Qiongzhong,Site11_QZ,110.067971,18.800233,2008-07-05,...,100.0000,235.61600,35.7,43.4,7654,Picornaviridae,Bat picornavirus,68.3,155.975098,PicoV-7
594,XL26,R2109032618,Hipposideros pomona,Anal swab,Hainan,Qiongzhong,Site13_QZ,109.751167,18.928437,2008-07-06,...,100.0000,62.40010,35.3,42.6,30706,Adenoviridae,Bat mastadenovirus WIV13,60.5,200.802161,AdV-5
595,XL26,R2109032618,Hipposideros pomona,Anal swab,Hainan,Qiongzhong,Site13_QZ,109.751167,18.928437,2008-07-06,...,100.0000,88.20990,35.3,42.9,6035,Astroviridae,Bat astrovirus Ha/Guangxi/LS11/2007,78.0,54.945421,AstV-19


In [43]:
# Attach quantification metadata to every iSNV row; a left join keeps rows
# that have no match in requant_info (their metadata columns are then NaN).
isnv_stat_mergeinfo = isnv_stat.merge(requant_info, how='left', on=['sn', 'vsname'])

# No depth filter is applied at the moment, so this is the same object as the merged
# frame. Optional filter kept for reference:
#isnv_stat_filtered = isnv_stat_mergeinfo[isnv_stat_mergeinfo['meandepth'] >= 50]
isnv_stat_filtered = isnv_stat_mergeinfo

# Normalise the iSNV count by qlen, expressed per 1000 units of qlen.
isnv_stat_filtered['num_iSNV_perkb'] = (
    isnv_stat_filtered['num_iSNV'] / isnv_stat_filtered['qlen']
) * 1000

# Export the selected columns to Excel.
isnv_stat_filtered[
    [
        'vsname', 'sn', 'nr_vote_vfamily', 'Host_species', 'Sample_name',
        'Site', 'qlen', 'meandepth', 'num_iSNV', 'num_iSNV_perkb',
    ]
].to_excel('rhinolophusbat_virus_isnv_stat.xlsx', index=False)