In [1]:
import pandas as pd
import seaborn as sns
import glob
from numpy import genfromtxt
# from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import numpy as np
import json
from collections import OrderedDict
import os
import re
import logging
import multiprocessing
from functools import partial
from datetime import datetime
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html

# Create map from gene ID to cluster ID

In [3]:
### panta input directory (prokka)
# pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkatest/'
# pantain_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokka/'
# pantain_dir = '/data/hoan/amromics/kover2_paper/data/SaPatric/prokkaMore/'
pantain_dir = '/data/hoan/amromics/kover2_paper/data/KpPatric/prokkaMore/'
### panta output directory
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaEcoli1936aligntest/'
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaEcoli1936align_v4/'
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaEcoli1936align_v9/'
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaSaPatric/'
pantaout_dir = '/data/hoan/amromics/prediction/output/pantaKpPatric/'

In [4]:
with open(pantaout_dir + 'annotated_clusters.json', 'r') as JSON:
    json_dict = json.load(JSON)
# data = json.loads('/data/hoan/amromics/prediction/output/pantaEcoli1936aligntest/clusters.json')[0]

In [5]:
gene2clusterdict = {}
for key in json_dict:
    if len(json_dict[key])==0:
        gene2clusterdict[key] = key
    for gene in json_dict[key]['gene_id']:
        gene2clusterdict[gene] = key

# Find all AMR genes

In [6]:
from pangraph.utils import parse_gff_AMRgene_finder

In [7]:
amr_gene = []
for data_dir in glob.glob(pantain_dir + '*.gff'):
    # print(data_dir)
    in_fh = open(data_dir)
    sample_id = data_dir.split('/')[-1][:-4]
    amr_gene += parse_gff_AMRgene_finder(in_fh, sample_id)
    in_fh.close()

In [8]:
amr_gene[:3], len(amr_gene)

(['573.12900-573.12900.con.0001-ALJABHLN_00114',
  '573.12900-573.12900.con.0001-ALJABHLN_00163',
  '573.12900-573.12900.con.0001-ALJABHLN_00174'],
 244260)

In [9]:
#### Map genes back to cluster IDs
amr_clusterID = [gene2clusterdict[gene] for gene in amr_gene]
amr_clusterID = list(set(amr_clusterID))

In [10]:
len(amr_clusterID), amr_clusterID[0:4]

(452, ['tmrB', 'vanR_M_1_03052', 'cmlA6', 'cat'])

# Find core genes

In [11]:
pa_matrix = pd.read_csv(pantaout_dir+'gene_presence_absence.Rtab', sep='\t', index_col=0).T

In [12]:
pa_matrix.head(2)

Gene,groups_0,groups_1,groups_2,yehX,groups_4,umuC_1,intA_2,feaR,traI,groups_9,...,groups_42028,groups_42029,groups_42030,groups_42031,groups_42032,groups_42033,groups_42034,groups_42035,groups_42036,groups_42037
1284787.3,1,1,2,14,1,3,2,2,1,0,...,0,0,0,0,0,0,0,0,0,0
1284788.3,3,1,0,15,2,3,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
n_samples = pa_matrix.shape[0]
n_genes = pa_matrix.shape[1]

In [14]:
colsum = pa_matrix.sum()
# core_gene_cluster = [colsum.index[idx] for idx in range(n_genes) if colsum[idx] >= 0.9999*n_samples] # E. coli
core_gene_cluster = [colsum.index[idx] for idx in range(n_genes) if colsum[idx] >= 0.9999*n_samples] # K. p.

In [15]:
core_gene_cluster[:4], len(core_gene_cluster), n_genes

(['groups_0', 'groups_1', 'groups_2', 'yehX'], 2791, 42038)

# Compute label encoder

## Compute K-mer of AMR clusters

In [16]:
from pangraph.utils import binary_label
from sklearn.feature_selection import mutual_info_classif, chi2

In [17]:
# Read prepresentative sequence
from Bio import SeqIO
genecluster2representativeseq = {}
with open(pantaout_dir+'representative_clusters_prot.fasta') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        name, sequence = record.id, str(record.seq)
        genecluster2representativeseq[name] = sequence
        # print(name,'----', sequence)

In [18]:
with open(pantaout_dir + 'samples.json', 'r') as JSON:
    sample_dict = json.load(JSON)
sample2integerindex = {}
for idx in range(len(sample_dict)):
    sample2integerindex[sample_dict[idx]['id']] = idx
n_samples = len(sample_dict)

In [19]:
computed_gene_cluster = amr_clusterID;

In [20]:
# amr_mat = None;
ksize = 10; # k = 10 for protein, 20 for DNA
kmer_list = [];
# amr_mat = np.zeros((n_samples, len(computed_gene_cluster)))
pairdata = []
for idx in range(len(computed_gene_cluster)):
    alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.faa.aln.gz'
    # alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.fna.aln.gz'
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            seqraw = sequence.replace('-', '')
            n_kmers = len(seqraw) - ksize + 1
            for i in range(n_kmers):
                kmer = seqraw[i:i + ksize] 
                # kmer = computed_gene_cluster[idx] + seqraw[i:i + ksize] # compute unique kmer for eaach cluster
                kmer_list.append(kmer)
                pairdata.append((sample2integerindex[sample_id], kmer))

In [21]:
unique_kmer = list(set(kmer_list))

In [22]:
# AMR genes: (74198447, 169115)
len(kmer_list), len(unique_kmer), unique_kmer[0:3]

(189899438, 484013, ['DFVAGVGVGK', 'DPELCNEVLN', 'WADTAVAVTY'])

In [23]:
kmer2index = {}
for i in range(len(unique_kmer)):
    kmer2index[unique_kmer[i]] = i

In [24]:
kmer_matrix = np.zeros((n_samples, len(unique_kmer)))

In [25]:
# ct = 0
for u, v in pairdata:
    # kmer_matrix[u, kmer2index[v]] += 1
    kmer_matrix[u, kmer2index[v]] = 1

In [26]:
kmer_matrix.shape

(2334, 484013)

In [27]:
# selector = VarianceThreshold(threshold=0.05)
selector = VarianceThreshold(threshold=0.01)
kmer_matrix_VT = selector.fit_transform(kmer_matrix)

In [28]:
# (1653, 59580)
kmer_matrix_VT.shape

(2334, 91532)

In [29]:
np.save(pantaout_dir + 'KmerEncoderAMRGenesSubmission.npy', kmer_matrix_VT) # save numpy array

In [30]:
kmerindex_final = np.array(unique_kmer)[selector.get_support()==True]

In [31]:
np.save(pantaout_dir + 'KmerEncoderAMRGenesSubmission_index.npy', kmerindex_final) # save numpy array

### Get metadata

In [32]:
# ## Ecoli
# metadata_panta = pd.read_csv("data/Ecoli1936metafiles/metadata_final.csv")

In [33]:
# # pa_matrix = pd.read_csv('/data/hoan/amromics/prediction/output/pantaKpPatric/gene_presence_absence.Rtab', sep='\t', index_col=0).T
# pa_matrix = pd.read_csv(pantaout_dir+'gene_presence_absence.Rtab', sep='\t', index_col=0).T

In [34]:
# pa_matrix.index[0:10]

In [35]:
# ### Check the difference between metadata and matrix
# diffindex = set(pa_matrix.index).difference(set(metadata.index))
# # diffindex
# newindex = [val + '0' if val in diffindex else val for val in pa_matrix.index]
# pa_matrix.index = newindex
# len(set(pa_matrix.index).difference(set(metadata.index)))

In [36]:
# ### Export refined metadata
# metadata_panta = metadata.loc[list(pa_matrix.index)]
# newcolumn = [item.replace("_", "@") for item in metadata_panta.columns]
# metadata_panta.columns = newcolumn
# metadata_panta.to_csv("/data/hoan/amromics/prediction/data/Kpmetadata_final.csv", index=False)

In [37]:
# metadata_panta

## Compute label encoder for AMR clusters

In [38]:
computed_gene_cluster = amr_clusterID;

In [39]:
with open(pantaout_dir + 'samples.json', 'r') as JSON:
    sample_dict = json.load(JSON)
sample2integerindex = {}
for idx in range(len(sample_dict)):
    sample2integerindex[sample_dict[idx]['id']] = idx
n_samples = len(sample_dict)

In [40]:
amr_mat = None;
start_idx = [0];
pass_gene_cluster = [];
for idx in range(len(computed_gene_cluster)):
    alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.faa.aln.gz'
    codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-', 'X']
    le = preprocessing.LabelEncoder()
    le.fit(codes)
    mat = None; index = 0; index_set = []
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            if index == 0:
                mat = np.zeros((n_samples, len(sequence)))
            index += 1
            mat[sample2integerindex[sample_id],:] = 1 + le.transform([*sequence])
            index_set.append(sample2integerindex[sample_id])
            # print(record.id)
    if idx==0:
        pass_gene_cluster.append(computed_gene_cluster[idx])
        start_idx += [start_idx[-1] + mat.shape[1]]
        amr_mat = mat
    else:
        # ## Run feature selection
        # variant_thres = 0.05
        variant_thres = 0
        vs = True
        if len(index_set) >= int(n_samples*0.01):
            try:
                sel = VarianceThreshold(variant_thres)
                sel.fit(mat[index_set,:])
            except ValueError:
                vs = False
            if vs:
                mat = mat[:, sel.variances_>variant_thres]
                if mat.shape[0] > 0:
                    pass_gene_cluster.append(computed_gene_cluster[idx])
                    start_idx += [start_idx[-1] + mat.shape[1]]
                    amr_mat = np.append(amr_mat, mat, axis=1)
end_idx = [start_idx[idx]-1 for idx in range(1, len(start_idx))]
start_idx = start_idx[:-1]

In [41]:
amr_mat.shape

(2334, 85070)

In [42]:
metadata_panta = pd.read_csv("/data/hoan/amromics/prediction/data/Kpmetadata_final.csv")
# metadata_panta = pd.read_csv("data/Ecoli1936metafiles/metadata_final.csv")
mutual_mat = []
for idx in range(2, metadata_panta.shape[1]):
    y_class = metadata_panta.iloc[:,idx].values
    print(metadata_panta.columns[idx])
    y, nonenan_index = binary_label(y_class) # v6
    pa_matrix_new = amr_mat[nonenan_index, ]
    y_new = y[nonenan_index].astype(int)
    scores, pvalue = chi2(pa_matrix_new, y_new)
    mutual_mat.append(scores)
mutual_mat = np.array(mutual_mat)
mutual_mat_mean = mutual_mat.mean(axis=0)

amikacin
amoxicillin@clavulanic@acid
ampicillin@sulbactam
aztreonam
cefazolin
cefepime
cefoxitin
ceftazidime
ceftriaxone
cefuroxime@sodium
ciprofloxacin
ertapenem
gentamicin
imipenem
levofloxacin
meropenem
nitrofurantoin
ofloxacin
piperacillin@tazobactam
tetracycline
ticarcillin@clavulanic@acid
tobramycin
trimethoprim
trimethoprim@sulfamethoxazole


In [43]:
top_features = np.argsort(mutual_mat_mean)[::-1][:20000]
kmer_matrix_VT_top_features = amr_mat[:,top_features]
kmer_matrix_VT_top_features.shape

(2334, 20000)

In [44]:
# np.save(pantaout_dir + 'amrlabelencodermat_top10kgenes_v9.npy', amr_mat) # save numpy array
# np.save(pantaout_dir + 'KpAMRGeneLabelEncoderMat.npy', amr_mat) # save numpy array
outdata_name = 'KpAMRGeneLabelEncoderMatTop20k'
np.save(pantaout_dir + outdata_name + '.npy', kmer_matrix_VT_top_features) # save numpy array
# outdata_name = 'genes_fold_' + str(fold_idx)
# np.save(pantaout_dir + outdata_name + '.npy', amr_mat) # save numpy array

In [45]:
amrgene_annotation = pd.DataFrame({'gene': pass_gene_cluster, 'start_index': start_idx, 'end_index': end_idx})
amrgene_annotation.to_csv(pantaout_dir + outdata_name + '_geneindex.csv', index=None)

## Compute label encoder for core gene clusters

In [46]:
computed_gene_cluster = core_gene_cluster;

In [47]:
with open(pantaout_dir + 'samples.json', 'r') as JSON:
    sample_dict = json.load(JSON)
sample2integerindex = {}
for idx in range(len(sample_dict)):
    sample2integerindex[sample_dict[idx]['id']] = idx
n_samples = len(sample_dict)

In [48]:
computed_gene_cluster[:5]

['groups_0', 'groups_1', 'groups_2', 'yehX', 'groups_4']

In [None]:
amr_mat = None;
start_idx = [0];
pass_gene_cluster = [];
for idx in range(len(computed_gene_cluster)):
    # print(idx, end=',')
    alignment_dir = pantaout_dir + 'clusters/' + computed_gene_cluster[idx] +'/'+computed_gene_cluster[idx]+'.faa.aln.gz'
    codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
             'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-', 'X']
    # ['-' 'A' 'D' 'E' 'F' 'G' 'H' 'I' 'K' 'L' 'M' 'N' 'P' 'Q' 'R' 'S' 'T' 'V' 'W' 'X' 'Y']
    le = preprocessing.LabelEncoder()
    le.fit(codes)
    mat = None; index = 0; index_set = []
    with gzip.open(alignment_dir, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            name, sequence = record.id, str(record.seq)
            sample_id = name.split('-')[0]
            if index == 0:
                mat = np.zeros((n_samples, len(sequence)))
            index += 1
            # if idx >= 2:
            #     print(np.unique(np.array([*sequence])))
            mat[sample2integerindex[sample_id],:] = 1 + le.transform([*sequence])
            index_set.append(sample2integerindex[sample_id])
            # print(record.id)
    if idx==0:
        pass_gene_cluster.append(computed_gene_cluster[idx])
        start_idx += [start_idx[-1] + mat.shape[1]]
        amr_mat = mat
    else:
        # ## Run feature selection
        # variant_thres = 0.05
        variant_thres = 0
        vs = True
        if len(index_set) >= int(n_samples*0.01):
            try:
                sel = VarianceThreshold(variant_thres)
                sel.fit(mat[index_set,:])
            except ValueError:
                vs = False
            if vs:
                mat = mat[:, sel.variances_>variant_thres]
                if mat.shape[0] > 0:
                    pass_gene_cluster.append(computed_gene_cluster[idx])
                    start_idx += [start_idx[-1] + mat.shape[1]]
                    amr_mat = np.append(amr_mat, mat, axis=1)
end_idx = [start_idx[idx] for idx in range(1, len(start_idx))]
start_idx = start_idx[:-1]

In [None]:
amr_mat.shape

In [None]:
# pantaout_dir = '/data/hoan/amromics/prediction/output/pantaKpPatric/'
# # snp_data = 'KpPatricCoreGeneLabelEncoderMatTop2k.npy'
# # snp_data = 'KpAMRGeneLabelEncoderMatTop20k.npy'
# snp_data = 'KpPatricCoreGeneLabelEncoderMatTop2k.npy'
# amr_mat = np.load(pantaout_dir + snp_data) # pantaDifferSite

In [None]:
amr_mat.shape

In [None]:
## Delete this
# end_idx_old = end_idx
# end_idx = [end_idx_old[i] + 1 for i in range(len(end_idx_old))]

In [None]:
amrgene_annotation = pd.DataFrame({'gene': pass_gene_cluster, 'start_index': start_idx, 'end_index': end_idx})
# amrgene_annotation.to_csv(pantaout_dir + outdata_name + '_geneindex.csv', index=None)

In [None]:
np.save(pantaout_dir + 'SNPsCoreGeneFullSubmission.npy', amr_mat) # save numpy array
# amr_mat = np.load(pantaout_dir + 'SNPsCoreGeneFullSubmission.npy') 

In [None]:
amrgene_annotation.to_pickle(pantaout_dir + 'SNPsCoreGeneFullSubmission_metadata.pkl')
# amrgene_annotation = pd.read_pickle(pantaout_dir + 'SNPsCoreGeneFullSubmission_metadata.pkl')

In [None]:
amr_mat[:,amrgene_annotation.iloc[-1,1]:amrgene_annotation.iloc[-1,2]].shape

In [None]:
amrgene_annotation.head(3)

In [None]:
snpheader = []
for idx in range(len(amrgene_annotation.index)):
    start_idx = amrgene_annotation.iloc[idx, 1]
    end_idx = amrgene_annotation.iloc[idx, 2]
    for i in range(start_idx, end_idx):
        snpheader.append(amrgene_annotation.iloc[idx, 0] + '@'+str(i))

In [None]:
len(snpheader), snpheader[:3]

In [None]:
metadata_panta = pd.read_csv("/data/hoan/amromics/prediction/data/Kpmetadata_final.csv")
# metadata_panta = pd.read_csv("data/Ecoli1936metafiles/metadata_final.csv")
mutual_mat = []
for idx in range(2, metadata_panta.shape[1]):
    y_class = metadata_panta.iloc[:,idx].values
    print(metadata_panta.columns[idx])
    y, nonenan_index = binary_label(y_class) # v6
    pa_matrix_new = amr_mat[nonenan_index, ]
    y_new = y[nonenan_index].astype(int)
    scores, pvalue = chi2(pa_matrix_new, y_new)
    mutual_mat.append(scores)
mutual_mat = np.array(mutual_mat)
mutual_mat_mean = mutual_mat.mean(axis=0)

In [None]:
top_features = np.argsort(mutual_mat_mean)[::-1][:10000]
kmer_matrix_VT_top_features = amr_mat[:,top_features]
kmer_matrix_VT_top_features.shape

In [None]:
snp_features_name = np.array(snpheader)[top_features]

In [None]:
np.save(pantaout_dir + 'SNPsCoreGeneTop10KSubmission.npy', kmer_matrix_VT_top_features) # save numpy array

In [None]:
np.save(pantaout_dir + 'SNPsCoreGeneTop10KSubmission_index.npy', snp_features_name) # save numpy array