In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read data.xlsx into a DataFrame, print its row count, and preview the first rows.
row_data = pd.read_excel('data.xlsx')
print(f'Количество найденных различий: {len(row_data)}')
display(row_data.head())

Количество найденных различий: 154853


Unnamed: 0,CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT
0,NZ_CP084696.2,72,snp,T,C,C:114 T:0,,,,,,,,
1,NZ_CP084696.2,77,snp,T,A,A:102 T:0,,,,,,,,
2,NZ_CP084696.2,94,snp,C,G,G:98 C:0,,,,,,,,
3,NZ_CP084696.2,103,snp,C,G,G:90 C:0,,,,,,,,
4,NZ_CP084696.2,109,complex,CGGAAA,AGGAGG,AGGAGG:64 CGGAAA:0,,,,,,,,


In [3]:
# reference (REF) - RCAM1026 genome from GenBank, annotated with Prokka; alternative (ALT) - RLV3841 laboratory genome

In [4]:
# Keep only rows whose PRODUCT is present and is not the literal
# "hypothetical protein".
# The copy is taken AFTER filtering so that `data` is an independent frame:
# copying first and then slicing leaves `data` as a filtered slice, and the
# later `data['EFFECT'] = ...` assignment can then raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = row_data.loc[
    row_data['PRODUCT'].notna() & row_data['PRODUCT'].ne("hypothetical protein")
].copy()

In [5]:
# Effect labels to normalise. An EFFECT value is replaced from the first
# occurrence of a label through the end of the string by the bare label,
# which strips whatever follows the label in the cell.
# Each label is listed exactly once (the original dict literal repeated the
# 'stop_gained' key, where the second entry silently overwrote the first).
# The tuple order is the order of the keys in `to_replace`.
EFFECT_LABELS = (
    'synonymous_variant',
    'missense_variant',
    'frameshift_variant',
    'conservative_inframe_insertion',
    'conservative_inframe_deletion',
    'disruptive_inframe_insertion',
    'disruptive_inframe_deletion',
    'intragenic_variant',
    'splice_region_variant',
    'non_coding_transcript_variant',
    'stop_gained',
    'stop_lost',
    'start_lost',
    'initiator_codon_variant',
    'gene_fusion',
    'intergenic_region',
)

# Map "<label>(.)*" -> "<label>" for use as a regex replacement table.
to_replace = {rf'{label}(.)*': label for label in EFFECT_LABELS}

data['EFFECT'] = data['EFFECT'].replace(to_replace, regex=True)

In [6]:
# Drop every row whose EFFECT is exactly one of the labels below, then
# renumber the index from 0. Rows with a missing EFFECT are kept, as before.
excluded_effects = [
    "synonymous_variant",
    "intragenic_variant",
    "splice_region_variant",
    "initiator_codon_variant",
    "non_coding_transcript_variant",
    "stop_retained_variant&splice_region_variant",
    "intergenic_region",
]
data = data[~data['EFFECT'].isin(excluded_effects)].reset_index(drop=True)

In [7]:
# Print the sorted distinct EFFECT values that remain in `data`.
print(f'Значимые эффекты: {np.sort(data.EFFECT.unique())}')

Значимые эффекты: ['conservative_inframe_deletion' 'conservative_inframe_insertion'
 'disruptive_inframe_deletion' 'disruptive_inframe_insertion'
 'frameshift_variant' 'gene_fusion' 'missense_variant' 'start_lost'
 'stop_gained' 'stop_lost']


In [8]:
# Number of remaining rows per PRODUCT value, most frequent first.
data['PRODUCT'].value_counts()

Adaptive-response sensory-kinase SasA                              247
Sensor histidine kinase RcsC                                       228
HTH-type transcriptional regulator DmlR                            192
2-methoxy-6-polyprenyl-1,4-benzoquinol methylase, mitochondrial    112
HTH-type transcriptional regulator HdfR                            104
                                                                  ... 
SsrA-binding protein                                                 1
Aquaporin Z                                                          1
30S ribosomal protein S13                                            1
30S ribosomal protein S5                                             1
Hypoxanthine phosphoribosyltransferase                               1
Name: PRODUCT, Length: 1666, dtype: int64

In [9]:
# pd.set_option('display.max_rows', None)

# with open("1026vs3841_names.txt", "w") as output:
#     output.write(str(data.PRODUCT.value_counts().index.tolist()))

In [10]:
# pd.set_option('display.max_rows', None)

# with open("1026vs3841_locus_tag.txt", "w") as output:
#     output.write(str(data.LOCUS_TAG.value_counts().index.tolist()))

In [11]:
# pd.set_option('display.max_rows', None)

# with open("1026vs3841_gene.txt", "w") as output:
#     output.write(str(data.GENE.value_counts().index.tolist()))

In [12]:
# Load the tab-separated annotation table, discard the 'Gene Name' and
# 'Species' columns, and rename 'ID' to 'GENE' so it shares a column name
# with `data`.
names = (
    pd.read_csv('tr_FC6EC39BABDE1678791658553.txt', sep="\t", header=0)
    .drop(columns=['Gene Name', 'Species'])
    .rename(columns={'ID': 'GENE'})
)
display(names.head())

Unnamed: 0,GENE,GOTERM_BP_DIRECT,GOTERM_CC_DIRECT,GOTERM_MF_DIRECT,UP_KW_LIGAND
0,hisA,"GO:0000105~histidine biosynthetic process,","GO:0005737~cytoplasm,",GO:0003949~1-(5-phosphoribosyl)-5-[(5-phosphor...,
1,dxr,"GO:0016114~terpenoid biosynthetic process,GO:0...",,"GO:0016853~isomerase activity,GO:0030604~1-deo...","KW-0521~NADP,"
2,rsmI,GO:0000453~enzyme-directed rRNA 2'-O-methylati...,"GO:0005737~cytoplasm,","GO:0008168~methyltransferase activity,GO:00706...","KW-0949~S-adenosyl-L-methionine,"
3,rsmH,"GO:0032259~methylation,GO:0070475~rRNA base me...","GO:0005737~cytoplasm,","GO:0008168~methyltransferase activity,GO:00714...",
4,rsmG,"GO:0032259~methylation,","GO:0005737~cytoplasm,","GO:0008168~methyltransferase activity,GO:00700...","KW-0949~S-adenosyl-L-methionine,"


In [13]:
# Attach the columns of `names` to each row of `data`, matching on GENE.
# A left join keeps every row of `data`; rows with no matching GENE in
# `names` get NaN in the added columns.
# NOTE: this cell reassigns `data` from itself, so running it a second time
# without re-running the cells above merges `names` in again.
data = data.merge(names, how="left", on="GENE")
# Fixed typo: the original called `disaplay`, which raises NameError.
display(data)

Unnamed: 0,CHROM,POS,TYPE,REF,ALT,EVIDENCE,FTYPE,STRAND,NT_POS,AA_POS,EFFECT,LOCUS_TAG,GENE,PRODUCT,GOTERM_BP_DIRECT,GOTERM_CC_DIRECT,GOTERM_MF_DIRECT,UP_KW_LIGAND
0,NZ_CP084696.2,1369,complex,AGCG,TGCC,TGCC:134 AGCG:0,CDS,+,129/600,43/199,missense_variant,KDNLPMIB_00002,yceF,7-methyl-GTP pyrophosphatase,,,,
1,NZ_CP084696.2,1434,mnp,AT,GC,GC:124 AT:0,CDS,+,194/600,65/199,missense_variant,KDNLPMIB_00002,yceF,7-methyl-GTP pyrophosphatase,,,,
2,NZ_CP084696.2,1449,snp,C,G,G:132 C:0,CDS,+,209/600,70/199,missense_variant,KDNLPMIB_00002,yceF,7-methyl-GTP pyrophosphatase,,,,
3,NZ_CP084696.2,1509,snp,A,C,C:112 A:0,CDS,+,269/600,90/199,missense_variant,KDNLPMIB_00002,yceF,7-methyl-GTP pyrophosphatase,,,,
4,NZ_CP084696.2,1514,complex,ATGA,CTGG,CTGG:104 ATGA:0,CDS,+,274/600,92/199,missense_variant,KDNLPMIB_00002,yceF,7-methyl-GTP pyrophosphatase,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19537,NZ_CP084696.2,4920682,complex,TTTTTC,GTCTTG,GTCTTG:82 TTTTTC:0,CDS,-,734/1023,243/340,missense_variant,KDNLPMIB_04814,hemE,Uroporphyrinogen decarboxylase,GO:0006782~protoporphyrinogen IX biosynthetic ...,"GO:0005737~cytoplasm,",GO:0004853~uroporphyrinogen decarboxylase acti...,
19538,NZ_CP084696.2,4920739,snp,T,C,C:90 T:0,CDS,-,677/1023,226/340,missense_variant,KDNLPMIB_04814,hemE,Uroporphyrinogen decarboxylase,GO:0006782~protoporphyrinogen IX biosynthetic ...,"GO:0005737~cytoplasm,",GO:0004853~uroporphyrinogen decarboxylase acti...,
19539,NZ_CP084696.2,4920924,complex,CGG,GGC,GGC:90 CGG:0,CDS,-,492/1023,164/340,missense_variant,KDNLPMIB_04814,hemE,Uroporphyrinogen decarboxylase,GO:0006782~protoporphyrinogen IX biosynthetic ...,"GO:0005737~cytoplasm,",GO:0004853~uroporphyrinogen decarboxylase acti...,
19540,NZ_CP084696.2,4920931,snp,C,T,T:92 C:0,CDS,-,485/1023,162/340,missense_variant,KDNLPMIB_04814,hemE,Uroporphyrinogen decarboxylase,GO:0006782~protoporphyrinogen IX biosynthetic ...,"GO:0005737~cytoplasm,",GO:0004853~uroporphyrinogen decarboxylase acti...,
