Skip to content

Commit

Permalink
1.21(2024.1)
Browse files Browse the repository at this point in the history
  • Loading branch information
YongxinLiu committed Jan 16, 2024
1 parent ed8f1b3 commit 3e3cd08
Showing 1 changed file with 25 additions and 12 deletions.
37 changes: 25 additions & 12 deletions script/format_dbcan2list.pl
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@
sub usage {
die(
qq!
Usage: format_dbcan2list.pl -i temp/dbcan2/gene_diamond.f6 -o temp/dbcan2/gene.list -h header num
Usage: format_dbcan2list.pl -i temp/dbcan3/gene_diamond.f6 -o temp/dbcan3/gene.list -h header num
Function: Format diamond blastx result into gene and target list, gene may have multiply targets.
Command: -i inpute file name (Must)
-o output file name (Must)
-d database file name
-e evaule, default 1e-30, recommend 1e-109
-h header line number, default 0
Author: Liu Yong-Xin, liuyongxin_bio\@163.com, QQ:42789409
Version: v1.01
Update: 2021/1/18
Notes: v1.00 2021/1/4 创立脚本
v1.01 2021/1/18 修改竖线后多个酶学编号的问题(|2.4.1.22|2.4.1.38|2.4.1.90)
v1.02 2021/7/7 新增相似度在第3列
v1.21 2024/1/16解决结果丢失第一列结果的bug,新增了Evalue,及按Evalue筛选的参数
\n!
)
}
Expand All @@ -32,14 +34,15 @@ sub usage {
#Get the parameter and provide the usage.
###############################################################################
my %opts;
getopts( 'i:o:d:h:', \%opts );
getopts( 'i:o:d:e:h:', \%opts );
&usage unless ( exists $opts{i} && exists $opts{o} );
my $start_time=time;
#print strftime("Start time is %Y-%m-%d %H:%M:%S\n", localtime(time));
#print "Input file is $opts{i}\nOutput file is $opts{o}\n";
#print "Database file is $opts{d}\n" if defined($opts{d});
# 调置参数的初始值,可以添加更多参数的默认值
$opts{h}=0 unless defined($opts{h});
$opts{e}=1e-30 unless defined($opts{e});

###############################################################################
#读入的数据或注释文件,用于与输入文件比较或注释(可选),提供三种方式
Expand Down Expand Up @@ -86,11 +89,18 @@ sub usage {
#k119_59_1 VTP76200.1|CBM48|GH13_9| 100.0 120 0 0 1 120 496 615 7.4e-70 266.2
#k141_2660488_1 BAA24360.1|CBM38|GH32|2.4.1.- 50.2 295 129 7 2 286 693 979 4.7e-70
#k141_3197835_1 AAA37297.1|GT7|2.4.1.22|2.4.1.38|2.4.1.90 62.1 103 31 2 1 102 160
#New data
#gene target similarity length mismatch gap qstart qend start end evalue bitscore
#k119_656_1 APZ95558.1|GT2 46.9 98 52 0 7 104 33 130 4.16e-24 100
#k119_265_1 ALR08738.2|GT83 48.9 139 67 4 8 144 348 484 6.72e-30 118
#k119_1032_1 AIE96983.1|GT7 33.0 182 111 4 1 173 973 1152 2.26e-17 85.1

open OUTPUT,">$opts{o}";
#gene target
#k119_57_1 CBM50
#k119_59_1 CBM48
#k119_59_1 GH13_9
#Name CAZy Identity Evaule
#k119_139_1 GT3 99.2
#k119_901_1 CBM32 100


#my %count;
## h参数用于去除有文件头的行
Expand All @@ -104,17 +114,18 @@ sub usage {

# 使用存储结果,防止输出重复结果
my %database;
print OUTPUT "Name\tCAZy\tIdentity\n";
print OUTPUT "Name\tCAZy\tIdentity\tEvalue\n";
while (<INPUT>) {
chomp;
my @tmp=split/\t/;
my @tmp1=split('\|',$tmp[1]);
# print $#tmp1,"\n";
if ($tmp[1]=~/\|$/) {
$i=$#tmp1;
}else{
$i=$#tmp1-1;
}
# 新版结果没有|,跳过此步
# if ($tmp[1]=~/\|$/) {
$i=$#tmp1;
# }else{
# $i=$#tmp1-1;
# }
foreach (1..$i) {
# 跳过酶学编号的结果
if ($tmp1[$_]=~/^\d/) {
Expand All @@ -124,7 +135,9 @@ sub usage {
$tmp1[$_]=~s/_.*//;
if (!$database{$tmp[0]}{$tmp1[$_]}) {
#print "$tmp[0]\t$tmp1[$_]\n";
print OUTPUT "$tmp[0]\t$tmp1[$_]\t$tmp[2]\n";
if ($tmp[10] < $opts{e}) {
print OUTPUT "$tmp[0]\t$tmp1[$_]\t$tmp[2]\t$tmp[10]\n";
}
}else{
next;
}
Expand Down

0 comments on commit 3e3cd08

Please sign in to comment.