### Make a copy of this notebook if you want to run your own analysis

#### Please run panta first to generate the gene clusters and the other output files needed for this analysis

In [1]:
# Optional: install pygraphviz into the environment of the current Jupyter kernel.
# import sys
# !{sys.executable} -m pip install pygraphviz

In [2]:
import math
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [3]:
# Directory holding the panta output files read below (samples.tsv, gene_info.tsv,
# gene_position.tsv). File names are appended by string concatenation, so the
# trailing slash is required. Previously used locations are kept commented out:
# data_dir = "/home/vanhoan310/server/amromics/panta/examples/test/output/"
# data_dir = "data/genome_graph_test/"
# you can take the data in the directory: data/genome_graph_test
# data_dir = "/data/hoan/amromics/data/ncbi/Kp_true/SAMN04158282/output/"
data_dir = "../panta/examples/test/output_Kp30plus1/"

In [4]:
# Load the sample table; it is read without a header row, so columns get integer labels.
sample_info = pd.read_csv(
    data_dir + "samples.tsv",
    sep='\t',
    header=None,
)

In [5]:
# Preview the first rows of the sample table.
sample_info.head(n=5)

Unnamed: 0,0,1
0,GCF_002113865.1_ASM211386v1_genomic,1
1,contigs,0


In [6]:
# Load the gene table (read without a header row) and name its three columns.
gene_info = pd.read_csv(
    data_dir + "gene_info.tsv",
    sep='\t',
    header=None,
)
gene_info.columns = ['GeneName', 'SampleID', 'clusterID']

In [7]:
## Optionally restrict the analysis to a subset of samples (disabled by default).
# selected_samples = [0, 1, 9]
# gene_info = gene_info.loc[gene_info['SampleID'].isin(selected_samples)]
gene_info.head(n=2)

Unnamed: 0,GeneName,SampleID,clusterID
0,0_5557@1590@-1,0,0
1,1_5400@1530@1,1,0


In [8]:
# Load the per-contig gene listing (read without a header row) and name its three columns.
gene_position = pd.read_csv(
    data_dir + 'gene_position.tsv',
    sep='\t',
    header=None,
)
gene_position.columns = ['SampleID', 'ContigName', 'GeneSequence']

In [9]:
# Optional filter on selected_samples (disabled by default), followed by a preview.
# gene_position = gene_position.loc[gene_position['SampleID'].isin(selected_samples)]
gene_position.head(n=3)

Unnamed: 0,SampleID,ContigName,GeneSequence
0,1,NZ_CP020901.1,1_1@117@1;1_2@483@1;1_3@558@-1;1_4@309@-1;1_5@...
1,1,NZ_CP020902.1,1_5033@216@1;1_5034@1245@-1;1_5035@507@-1;1_50...
2,1,NZ_CP020903.1,1_5386@597@1;1_5387@276@1;1_5388@300@1;1_5389@...


In [10]:
def compute_number_nucleotides(gene_contigs = None):
    """Return the summed length field of the gene records of one contig.

    Parameters
    ----------
    gene_contigs : iterable of str, optional
        Gene records whose second-to-last "@"-separated field is an integer
        length (e.g. "1_1@117@1" contributes 117). ``None`` (the default) or
        an empty iterable yields 0.

    Returns
    -------
    int
        Sum of the length fields of all records.

    Raises
    ------
    IndexError
        If a record has fewer than two "@"-separated fields.
    ValueError
        If a length field cannot be converted to ``int``.
    """
    if gene_contigs is None:
        # Calling with the default argument used to fail with
        # "'NoneType' object is not iterable"; treat it as "no genes".
        return 0
    # The length is indexed from the end of the record, so an "@" inside the
    # leading gene identifier does not shift the field that is read.
    return sum(int(gene.split("@")[-2]) for gene in gene_contigs)

In [11]:
# Split the gene listing of the first row into individual records and total their lengths.
gene = gene_position.iloc[0, 2].split(";")
gc = gene[0].split("@")  # "@"-separated fields of the first record
compute_number_nucleotides(gene_contigs=gene)

4747266

In [12]:
from collections import Counter

# Number of rows (one per contig) for each value of the first column (SampleID).
dict(Counter(gene_position.iloc[:, 0].tolist()))

{1: 5, 0: 142}

In [13]:
# Order the rows by the character length of their GeneSequence string, longest first.
gene_position.sort_values(
    by="GeneSequence",
    key=lambda sequences: sequences.str.len(),
    ascending=False,
    inplace=True,
)

In [14]:
# Preview the two rows that now come first after sorting.
gene_position.head(n=2)

Unnamed: 0,SampleID,ContigName,GeneSequence
0,1,NZ_CP020901.1,1_1@117@1;1_2@483@1;1_3@558@-1;1_4@309@-1;1_5@...
5,0,NODE_1_length_1228121_cov_190.580240,0_1@1197@-1;0_2@798@1;0_3@1500@-1;0_4@537@1;0_...


In [15]:
# Rows belonging to sample 18 (empty when that sample ID is not present).
gene_position[gene_position["SampleID"] == 18]

Unnamed: 0,SampleID,ContigName,GeneSequence


In [16]:
print("List of all samples")
# Distinct values of the first column (SampleID), computed once and reused.
unique_sample_ids = np.unique(gene_position.iloc[:, 0])
n_samples = len(unique_sample_ids)
unique_sample_ids

List of all samples


array([0, 1])

# Run here

In [17]:
%load_ext autoreload
%autoreload 2
from pangraph import PanGraph

In [18]:
# Create the PanGraph object from the three tables loaded above.
# NOTE(review): min_contig_len is assigned here but not passed to PanGraph in this cell - confirm it is still needed.
min_contig_len = 100
pangraph = PanGraph(
    sample_info=sample_info,
    gene_info=gene_info,
    gene_position=gene_position,
)

In [19]:
# Build the graph H with construct_graph's default arguments.
H = pangraph.construct_graph()
# Alternative call with explicit arguments, kept for reference:
# H = pangraph.construct_graph(method = "graph_alignment", sample_id_ref = None,  min_nucleotides = 20)

Set minimum on number of nucleotides =  200 NUMBER OF COMPUTED CONTIGS: 71


In [20]:
# (pangraph.n_clusters, number of edges in H)
# NOTE(review): n_clusters is presumably the node count of H - confirm against H.number_of_nodes().
(pangraph.n_clusters, H.number_of_edges())

(11501, 5809)

In [21]:
# Gene records of the first row after sorting, i.e. the longest GeneSequence string.
gene_contigs = gene_position["GeneSequence"].iloc[0].split(";")

In [22]:
def count_dups(nums):
    """Run-length encode ``nums`` by collapsing runs of equal adjacent items.

    Parameters
    ----------
    nums : sequence
        Indexable, sliceable sequence; each item is compared with ``==``
        against its immediate predecessor.

    Returns
    -------
    tuple of (list, list)
        ``(element, freque)`` where ``element[k]`` is the value of the k-th
        run and ``freque[k]`` is its length. Both lists are empty for empty
        input, so the result can always be unpacked into two names.
    """
    element = []
    freque = []
    # len() instead of truthiness so that array-like inputs are accepted too.
    if len(nums) == 0:
        # Previously a single list was returned here, which broke
        # ``elem, freq = count_dups(...)`` for empty input.
        return element, freque
    running_count = 1
    previous = nums[0]
    # Walking item by item (rather than by index) also covers one-element
    # input, which used to fail because the loop index was never bound.
    for current in nums[1:]:
        if previous == current:
            running_count += 1
        else:
            element.append(previous)
            freque.append(running_count)
            running_count = 1
        previous = current
    # Flush the final run.
    element.append(previous)
    freque.append(running_count)
    return element, freque

In [23]:
# For every gene in gene_contigs, look up its cluster and record the contig name
# of each gene in that cluster whose SampleID is 0.
list_contig = []
for gene in gene_contigs:
    cluster_id = pangraph.gene2cluster_dict[gene]
    df = gene_info.loc[gene_info.iloc[:, 2] == cluster_id]
    sample0_genes = df.loc[df.iloc[:, 1] == 0].iloc[:, 0]
    list_contig.extend(
        gene_position.iloc[pangraph.gene2contigs_dict[gene_name], 1]
        for gene_name in sample0_genes
    )

In [24]:
# Collapse consecutive repeats in list_contig: elem holds the contig name of each run, freq its length.
elem, freq = count_dups(list_contig)

In [25]:
# df_array = pd.DataFrame({'contig': elem, 'freq': freq})

In [26]:
# Label each run as "<contig>:<run length>" and connect every run to the one that follows it.
node_labels = [contig + ":" + str(count) for contig, count in zip(elem, freq)]
edge_list = [[source, target] for source, target in zip(node_labels, node_labels[1:])]

In [27]:
# Directed graph built directly from the [source, target] pairs in edge_list.
true_graph = nx.DiGraph(edge_list)

In [28]:
# Write the graph as GML under cytoscape_out/. The folder is created first because
# write_gml opens the target path for writing and fails if the directory is missing.
os.makedirs('cytoscape_out', exist_ok=True)
nx.write_gml(true_graph,'cytoscape_out/true_graph.gml')