In [6]:
import pandas as pd
from data_preprocessing import  separate_tf_genes, prepare_for_inference
from tf_utils import map_tf_ids
import db as db
from utils_network import *
from inference import *

In [7]:
config_file = "/data_nfs/og86asub/alternet-project/alternet/configs/MAGNet_NF.yaml"
import yaml

with open(config_file, 'r') as f:
    config = yaml.safe_load(f)


# Load necessary data and annotations

In [8]:
biomart_path = '/data_nfs/og86asub/alternet-project/alternet/data/biomart.txt'
tf_list_path = '/data_nfs/og86asub/alternet-project/alternet/data/allTFs_hg38.txt'

biomart = pd.read_csv(biomart_path, sep='\t')
tf_list = pd.read_csv(tf_list_path, sep='\t', header = None)
tf_list = map_tf_ids(tf_list, biomart)

# create TF Database
tf_database = db.create_transcipt_annotation_database(tf_list=tf_list, appris_path= config['appris'], digger_path=config['digger'])


  digger = pd.read_csv(digger_path)


In [9]:
## LOAD gene and transcript data
transcript_data = pd.read_csv(config['transcript_data'], index_col=0)

## GET the gene ids associated with transcripts
gene_ids = transcript_data['gene_id'].unique()




In [13]:
transcript_data

Unnamed: 0,transcript_id,gene_id,C00039,C00055,C00074,C00085,C00105,C00120,C00132,C00158,...,P01626,P01627,P01628,P01629,P01630,P01631,P01634,P01635,P01639,P01640
0,ENST00000473530,ENSG00000244682,2.276150,2.040930,1.564230,2.629880,3.224940,0.081621,3.577190,3.275500,...,2.249030,0.754787,0.000000,1.594620,0.000000,0.031552,1.032080,2.658050,1.475990,0.012955
1,ENST00000467903,ENSG00000244682,2.167410,2.788660,3.829120,3.177140,4.211690,0.176553,6.959880,3.485350,...,4.777670,2.296700,0.256739,7.109500,0.226315,0.453683,1.436180,2.298990,3.807010,0.155549
2,ENST00000508651,ENSG00000244682,1.950990,2.569730,3.989870,3.134920,5.018950,0.110165,4.027630,1.973540,...,4.473280,1.117980,0.185469,3.784620,0.146539,0.085379,1.271050,1.629250,3.130670,0.000000
3,ENST00000626647,ENSG00000280938,0.049179,0.019637,0.017334,0.489663,0.000000,0.105393,0.517216,0.029199,...,0.494979,3.469800,6.911700,1.803350,1.116030,0.850385,0.507154,0.000000,1.293200,1.458430
4,ENST00000633377,ENSG00000282181,3.231750,1.134710,3.935840,0.548367,0.007815,0.319113,3.020600,1.936820,...,2.497850,0.987592,0.007706,0.507429,0.257754,0.544622,0.007459,1.090220,0.713997,1.031000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39310,ENST00000569097,ENSG00000197081,0.821758,0.377072,0.413925,0.907076,0.476571,0.098923,0.395161,0.379031,...,0.957061,0.418827,0.047140,0.320178,0.066159,0.132223,0.285653,0.631274,0.421508,0.062954
39311,ENST00000602699,ENSG00000225366,0.050290,0.014330,0.035171,0.041823,0.037060,0.000000,0.065104,0.016308,...,0.044029,0.000000,0.000000,0.025864,0.024941,0.048181,0.038492,0.051400,0.017281,0.014494
39312,ENST00000594408,ENSG00000182986,2.056060,1.068780,0.909087,1.211280,1.235250,0.022664,1.194140,1.571050,...,1.833670,0.982506,0.017389,0.592679,0.000000,0.043223,0.683917,1.783930,1.038960,0.000000
39313,ENST00000510091,ENSG00000205482,0.397195,0.139399,0.137387,0.238294,0.456423,0.055239,0.127514,0.330749,...,1.001040,0.285062,0.049527,0.056683,0.000000,0.000000,0.040746,0.233459,0.148919,0.000000


In [16]:
gene_data = transcript_data.groupby('gene_id').sum()

In [18]:
gene_data = gene_data.drop(columns={'transcript_id'})

In [19]:
gene_data.index.name = 'gene_id'
gene_data = gene_data.reset_index()
gene_data = gene_data.iloc[0:40, ]

In [20]:

## Subset the samples of interest
sample_attributes = pd.read_csv(config['sample_attributes'])
sample_attributes = sample_attributes.loc[:, ['sample_name', 'etiology']]
samples = sample_attributes[sample_attributes['etiology'] == config['tissue']]
samples = samples['sample_name'].tolist()


## Get unified gene and transcript table
gene_data = gene_data.loc[:, ['gene_id'] + samples ]
transcript_data = transcript_data.loc[:,['gene_id', 'transcript_id'] + samples]


In [21]:
tf_iso_counts, _ = separate_tf_genes(data=transcript_data, tf_list=tf_list, data_column='transcript_id', biomart_column='Transcript stable ID')

In [22]:
tf_gene_counts, target_gene_counts = separate_tf_genes(data=gene_data, tf_list=tf_list, data_column='gene_id', biomart_column='Gene stable ID')

In [23]:
data_canonical, data_asware, target_gene_list = prepare_for_inference(tf_iso_counts, tf_gene_counts, target_gene_counts, transcript_column='transcript_id', gene_column='gene_id')

Preparing data for inference: Separation into canonical and as-aware gene expression data


In [24]:
tf_iso_counts


Unnamed: 0,gene_id,transcript_id,C00039,C00055,C00074,C00085,C00105,C00120,C00132,C00158,...,P01600,P01603,P01610,P01616,P01621,P01622,P01623,P01626,P01628,P01635
69,ENSG00000275746,ENST00000621843,0.000000,1.921400,4.830320,4.321360,11.776600,0.000000,11.941900,0.057060,...,3.274980,8.922970,12.219200,3.440300,0.035091,13.307500,12.556100,0.000000,0.381808,3.594360
71,ENSG00000280680,ENST00000629127,0.488933,0.006129,0.106360,0.269342,0.481044,0.208270,0.284457,0.230953,...,0.258495,0.109312,0.000000,0.169916,0.193431,0.142123,0.291337,0.569886,0.167812,0.301905
141,ENSG00000196458,ENST00000360187,1.556050,0.509873,0.559587,0.980809,0.613056,0.481815,0.643990,0.815159,...,0.672292,0.689361,0.727975,0.791609,0.408060,0.416713,0.931220,0.906321,0.456414,0.628137
142,ENSG00000196458,ENST00000392321,4.305740,2.353420,1.963320,3.603160,2.979200,1.108800,2.318270,2.618990,...,2.885390,2.497590,2.936280,3.926700,0.836353,2.798880,3.036580,3.238990,0.998708,5.220130
143,ENSG00000196458,ENST00000381215,1.360270,0.523519,0.718419,0.709772,0.507990,0.160236,1.083130,0.368735,...,0.433854,0.336460,0.596197,0.566359,0.175297,0.343090,0.530078,0.233906,0.000000,0.563263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39234,ENSG00000282854,ENST00000634513,2.734940,2.158400,1.809580,1.739980,1.284910,0.136230,2.717190,1.536370,...,1.867850,1.626540,1.368840,1.454700,0.559354,1.852220,3.412960,3.710800,0.338195,1.194180
39235,ENSG00000282854,ENST00000634561,2.416900,2.097120,3.968760,1.306100,1.482140,0.064622,1.248550,1.534340,...,0.584410,2.874060,4.622200,0.932272,0.070226,3.783470,2.474310,2.291280,0.030406,1.310060
39236,ENSG00000282854,ENST00000635351,0.637849,0.098004,0.702740,0.536145,0.600501,0.413672,0.542548,0.522089,...,0.858770,0.219566,0.360110,0.286954,0.063341,0.107373,0.438501,0.000000,0.000000,0.338502
39237,ENSG00000282899,ENST00000634683,0.015705,0.018329,0.048345,0.009561,0.085228,0.021931,0.050436,0.031760,...,0.050812,0.013758,0.011409,0.023511,0.052863,0.020997,0.038926,0.049232,0.006934,0.011964


In [25]:
data_asware

Unnamed: 0_level_0,C00039,C00055,C00074,C00085,C00105,C00120,C00132,C00158,C00168,C00186,...,P01600,P01603,P01610,P01616,P01621,P01622,P01623,P01626,P01628,P01635
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENST00000621843,0.000000,1.921400,4.830320,4.321360,11.776600,0.000000,11.941900,0.057060,3.526590,0.170855,...,3.274980,8.922970,12.219200,3.440300,0.035091,13.307500,12.556100,0.000000,0.381808,3.594360
ENST00000629127,0.488933,0.006129,0.106360,0.269342,0.481044,0.208270,0.284457,0.230953,0.408554,0.173717,...,0.258495,0.109312,0.000000,0.169916,0.193431,0.142123,0.291337,0.569886,0.167812,0.301905
ENST00000360187,1.556050,0.509873,0.559587,0.980809,0.613056,0.481815,0.643990,0.815159,0.685427,0.802393,...,0.672292,0.689361,0.727975,0.791609,0.408060,0.416713,0.931220,0.906321,0.456414,0.628137
ENST00000392321,4.305740,2.353420,1.963320,3.603160,2.979200,1.108800,2.318270,2.618990,1.960630,0.120433,...,2.885390,2.497590,2.936280,3.926700,0.836353,2.798880,3.036580,3.238990,0.998708,5.220130
ENST00000381215,1.360270,0.523519,0.718419,0.709772,0.507990,0.160236,1.083130,0.368735,0.229089,0.018763,...,0.433854,0.336460,0.596197,0.566359,0.175297,0.343090,0.530078,0.233906,0.000000,0.563263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000003436,28.490930,18.440540,15.969181,21.885337,22.476400,10.506746,27.181098,20.534197,12.225895,10.116812,...,15.251027,9.683091,13.895921,14.239894,18.224117,11.555473,14.553964,21.890835,16.586082,19.743486
ENSG00000003509,21.012959,18.172372,22.750115,16.056193,18.383836,15.862320,19.250591,17.897856,18.642779,8.065930,...,20.107785,11.898181,17.237519,17.251907,8.558163,13.976136,13.146203,17.406819,8.288946,15.996116
ENSG00000003756,54.936501,26.129274,38.328458,45.693790,42.495645,28.709693,37.254217,35.839398,28.977764,17.107432,...,43.215013,23.500684,34.498117,34.216680,21.748461,20.232026,32.258981,51.464930,19.335575,42.815704
ENSG00000003987,0.673885,0.513828,0.733176,0.730261,0.811276,0.212408,0.698341,0.470540,0.636159,0.271111,...,0.540007,0.357516,0.366972,0.680045,0.206822,0.313873,0.557605,0.970646,0.398043,0.640866


In [26]:
# get isoform categories
isoform_categories = isoform_categorization(tf_iso_counts, tf_gene_counts, threshold_dominance=80)


            gene_id     C00039     C00055     C00074    C00085     C00105  \
9   ENSG00000001167   7.682690   4.104230   4.853400  4.317800   5.404930   
12  ENSG00000001497  15.656040   7.594953  11.370840  9.514930   8.113490   
19  ENSG00000002330   6.494686  11.965629  16.796471  5.533693  13.560036   

       C00120     C00132     C00158    C00168  ...     P01603     P01610  \
9    0.896633   3.439820   4.188700   2.40911  ...   3.054800   4.432720   
12   6.973793  10.199250   9.995520  12.91687  ...   6.606053   8.879420   
19  36.562488  10.916904  11.033318  18.67241  ...  28.632750  62.501836   

      P01616     P01621     P01622     P01623     P01626     P01628    P01635  \
9   4.571550   0.865780   3.645890   2.541630   4.436620   0.818385  6.867520   
12  7.067700   7.164017   5.129949  11.200990  12.328330   6.447379  8.749470   
19  1.875371  65.369523  46.345562  12.156581  12.816146  64.315560  4.524684   

    sum_gene_expression  
9            598.263609  
12       

In [27]:
tf_gene_counts

Unnamed: 0,gene_id,C00039,C00055,C00074,C00085,C00105,C00120,C00132,C00158,C00168,...,P01600,P01603,P01610,P01616,P01621,P01622,P01623,P01626,P01628,P01635
9,ENSG00000001167,7.68269,4.10423,4.8534,4.3178,5.40493,0.896633,3.43982,4.1887,2.40911,...,4.1206,3.0548,4.43272,4.57155,0.86578,3.64589,2.54163,4.43662,0.818385,6.86752
12,ENSG00000001497,15.65604,7.594953,11.37084,9.51493,8.11349,6.973793,10.19925,9.99552,12.91687,...,10.26528,6.606053,8.87942,7.0677,7.164017,5.129949,11.20099,12.32833,6.447379,8.74947
19,ENSG00000002330,6.494686,11.965629,16.796471,5.533693,13.560036,36.562488,10.916904,11.033318,18.67241,...,4.760836,28.63275,62.501836,1.875371,65.369523,46.345562,12.156581,12.816146,64.31556,4.524684


In [28]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
isoform_categories

Unnamed: 0,transcript_id,gene_id,sum_transcript_expression,sum_gene_expression,percentage,max_percentage,min_percentage,std_percentage,isoform_category
0,ENST00000394532,ENSG00000002330,168.174,4108.186,4.094,86.225,2.39,40.867,non-dominant
1,ENST00000309032,ENSG00000002330,3542.295,4108.186,86.225,86.225,2.39,40.867,dominant
2,ENST00000492141,ENSG00000002330,98.202,4108.186,2.39,86.225,2.39,40.867,non-dominant
3,ENST00000544271,ENSG00000002330,299.516,4108.186,7.291,86.225,2.39,40.867,non-dominant
4,ENST00000374807,ENSG00000001497,309.502,1639.394,18.879,34.838,12.95,10.784,balanced
5,ENST00000374811,ENSG00000001497,546.445,1639.394,33.332,34.838,12.95,10.784,balanced
6,ENST00000484069,ENSG00000001497,212.309,1639.394,12.95,34.838,12.95,10.784,balanced
7,ENST00000469091,ENSG00000001497,571.137,1639.394,34.838,34.838,12.95,10.784,balanced
8,ENST00000341376,ENSG00000001167,598.264,598.264,100.0,100.0,100.0,,dominant


In [29]:
as_aware_grn, canonical_grn = inference(config, 2, tf_list=tf_list, data_canonical=data_canonical, data_asware=data_asware, target_gene_list=target_gene_list, aggregate=True)


Starting inference ...
Computing network
Computing network
Computing network
Computing network
Inference complete
Aggregate results
Going through edges of each df ...


100%|██████████| 38846/38846 [00:00<00:00, 41915.22it/s]
100%|██████████| 38197/38197 [00:00<00:00, 41793.96it/s]
100%|██████████| 58624/58624 [00:00<00:00, 1977345.58it/s]


Going through edges of each df ...


100%|██████████| 111/111 [00:00<00:00, 34491.61it/s]
100%|██████████| 111/111 [00:00<00:00, 39278.47it/s]
100%|██████████| 111/111 [00:00<00:00, 1293243.73it/s]


In [30]:
canonical_grn

Unnamed: 0,source,target,frequency,mean_importance,median_importance
0,ENSG00000001167,ENSG00000003056,2,173.096,178.614
1,ENSG00000001167,ENSG00000001629,2,166.671,168.557
2,ENSG00000001167,ENSG00000001561,2,132.995,137.291
3,ENSG00000001167,ENSG00000001460,2,125.436,128.146
4,ENSG00000001167,ENSG00000003393,2,131.500,141.749
...,...,...,...,...,...
106,ENSG00000002330,ENSG00000001036,2,11.414,12.249
107,ENSG00000001497,ENSG00000000460,2,8.448,8.633
108,ENSG00000002330,ENSG00000000460,2,7.225,8.388
109,ENSG00000001497,ENSG00000000005,2,17.364,26.507


In [31]:
#filter aggregate
as_aware_grn = filter_aggregated(as_aware_grn, threshold_frequency=1, threshold_importance=0.3)
canonical_grn = filter_aggregated(canonical_grn, threshold_frequency=1, threshold_importance=0.3)

net_AS = add_edge_key(as_aware_grn, biomart, source_column = 'source')
net_canonical = add_edge_key(canonical_grn, biomart, type='canonical', source_column='source')

# Categorize edges into gene-unique, isoform-unique, common
common_edges = get_common_edges(net_canonical, net_AS)
gene_unique, isoform_unique = get_diff(net_canonical, net_AS)

print('Number of edges in each category')
print('Number of edges in gene-exclusive: ', len(gene_unique))
print('Number of edges in isoform-exclusive: ', len(isoform_unique))
print('Number of edges in common interactions: ', len(common_edges))


Number of edges in each category
Number of edges in gene-exclusive:  25
Number of edges in isoform-exclusive:  17577
Number of edges in common interactions:  19


In [64]:
gene_transcript_data_source = common_edges.loc[:, ['source_gene', 'source_transcript']]


In [55]:
common_edges['source_descriptor'] = common_edges['source_gene']+':'+common_edges['source_transcript']
common_edges['target_descriptor'] = common_edges['target_gene']+':'+common_edges['target_gene']

In [67]:
common_edges

Unnamed: 0,source,target,frequency,mean_importance,median_importance,source_transcript,source_gene,source_transcript_name,source_gene_name,target_gene,target_gene_name,type,edge_key,source_descriptor,target_descriptor
0,ENST00000544271,ENSG00000002822,2,11.935,14.636,ENST00000544271,ENSG00000002330,BAD-206,BAD,ENSG00000002822,MAD1L1,AS,ENSG00000002330_ENSG00000002822,ENSG00000002330:ENST00000544271,ENSG00000002822:ENSG00000002822
1,ENST00000341376,ENSG00000001629,2,0.796,1.211,ENST00000341376,ENSG00000001167,NFYA-201,NFYA,ENSG00000001629,ANKIB1,AS,ENSG00000001167_ENSG00000001629,ENSG00000001167:ENST00000341376,ENSG00000001629:ENSG00000001629
2,ENST00000341376,ENSG00000003393,2,0.657,0.918,ENST00000341376,ENSG00000001167,NFYA-201,NFYA,ENSG00000003393,ALS2,AS,ENSG00000001167_ENSG00000003393,ENSG00000001167:ENST00000341376,ENSG00000003393:ENSG00000003393
3,ENST00000341376,ENSG00000003989,2,0.494,0.872,ENST00000341376,ENSG00000001167,NFYA-201,NFYA,ENSG00000003989,SLC7A2,AS,ENSG00000001167_ENSG00000003989,ENSG00000001167:ENST00000341376,ENSG00000003989:ENSG00000003989
4,ENST00000484069,ENSG00000003756,2,1.507,2.256,ENST00000484069,ENSG00000001497,LAS1L-205,LAS1L,ENSG00000003756,RBM5,AS,ENSG00000001497_ENSG00000003756,ENSG00000001497:ENST00000484069,ENSG00000003756:ENSG00000003756
5,ENST00000374811,ENSG00000003756,2,0.59,0.621,ENST00000374811,ENSG00000001497,LAS1L-203,LAS1L,ENSG00000003756,RBM5,AS,ENSG00000001497_ENSG00000003756,ENSG00000001497:ENST00000374811,ENSG00000003756:ENSG00000003756
6,ENST00000341376,ENSG00000003756,2,1.109,1.679,ENST00000341376,ENSG00000001167,NFYA-201,NFYA,ENSG00000003756,RBM5,AS,ENSG00000001167_ENSG00000003756,ENSG00000001167:ENST00000341376,ENSG00000003756:ENSG00000003756
7,ENST00000544271,ENSG00000003509,1,0.15,0.15,ENST00000544271,ENSG00000002330,BAD-206,BAD,ENSG00000003509,NDUFAF7,AS,ENSG00000002330_ENSG00000003509,ENSG00000002330:ENST00000544271,ENSG00000003509:ENSG00000003509
8,ENST00000341376,ENSG00000003056,2,1.418,2.807,ENST00000341376,ENSG00000001167,NFYA-201,NFYA,ENSG00000003056,M6PR,AS,ENSG00000001167_ENSG00000003056,ENSG00000001167:ENST00000341376,ENSG00000003056:ENSG00000003056
9,ENST00000341376,ENSG00000003402,1,0.254,0.254,ENST00000341376,ENSG00000001167,NFYA-201,NFYA,ENSG00000003402,CFLAR,AS,ENSG00000001167_ENSG00000003402,ENSG00000001167:ENST00000341376,ENSG00000003402:ENSG00000003402


In [None]:
gene_transcript_map = defaultdict(set)

# Iterate through each pair of (gene, transcript) from the DataFrame using zip.
for gene, transcript in zip(gene_transcript_data_source['source_gene'], gene_transcript_data_source['source_transcript']):
    # Add the transcript to the set associated with the current gene.
    # The set automatically handles duplicate transcripts, ensuring each
    # transcript is listed only once for a given gene.
    gene_transcript_map[gene].add(transcript)


# Convert the sets of transcripts into sorted lists to match the desired output format.
# A dictionary comprehension is used for a clean and concise conversion.
node_data = {
    gene: sorted(list(transcripts))
    for gene, transcripts in gene_transcript_map.items()
}



ENSG00000002330
ENST00000544271
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000001497
ENST00000484069
ENSG00000001497
ENST00000374811
ENSG00000001167
ENST00000341376
ENSG00000002330
ENST00000544271
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000002330
ENST00000394532
ENSG00000001167
ENST00000341376
ENSG00000001167
ENST00000341376
ENSG00000001497
ENST00000677056
ENSG00000001167
ENST00000341376
ENSG00000002330
ENST00000394532


In [66]:
gene_transcript_map

defaultdict(set,
            {'ENSG00000002330': {'ENST00000394532', 'ENST00000544271'},
             'ENSG00000001167': {'ENST00000341376'},
             'ENSG00000001497': {'ENST00000374811',
              'ENST00000484069',
              'ENST00000677056'}})

In [None]:
import graphviz

s = graphviz.Digraph(
    'gene_transcripts_expanded',
    filename='gene_transcripts_expanded.gv',
    graph_attr={
        'splines': 'spline',  # Added splines attribute
        'rankdir': 'neato'
    }
)
# Set global node and edge attributes for a consistent look.
s.attr('node', shape='record', style='rounded, filled', height='0.1')
s.attr('edge', color='grey', arrowhead='vee')

# Assume common_edges is your data source
# For this example, let's use a simple dictionary

for gene, transcripts in node_data.items():
    # Create a list of formatted transcript strings with ports
    transcript_ports = [f"<{t}>{t}" for t in transcripts]

    # Join the gene and the formatted transcripts for the node label
    label = f"{{<{gene}>{gene}|{'|'.join(transcript_ports)}}}"

    # Use the new formatted string to create the node
    s.node(gene, label)

for idx, row in common_edges.iterrows():

    # Add some edges to demonstrate the 'ortho' splines
    s.edge(row['source_descriptor'], row['target_descriptor'])

# Render the graph
s.render(view=True)



'gene_transcripts_expanded.gv.pdf'

Error: no "view" rule for type "application/pdf" passed its test case
       (for more information, add "--debug=1" on the command line)


In [None]:
s.view()

'gene_transcripts_expanded.gv.pdf'

Error: no "view" rule for type "application/pdf" passed its test case
       (for more information, add "--debug=1" on the command line)


In [None]:


# Create a new directed graph with an increased number of nodes.
# The 'splines' attribute with 'ortho' makes the edges straight.
s = graphviz.Digraph(
    'gene_transcripts_expanded',
    filename='gene_transcripts_expanded.gv',
    graph_attr={'splines': 'ortho', 'rankdir': 'neato'}
)

# Set global node and edge attributes for a consistent look.
s.attr('node', shape='record', style='rounded, filled', height='0.1')
s.attr('edge', color='grey', arrowhead='vee')
s.node('A_T3', 'Transcript MYC-003 | <here> variant | coding | intron')
s.view()
# --- Define the nodes for the genes and their transcripts (20 nodes total) ---

# Define the 'Gene_A' node. This is the top-level entity.
s.node('Gene_A', 'Gene MYC', style='filled', fillcolor='#c5e1a5')

# Define the 'Gene_B' node. Another top-level gene.
s.node('Gene_B', 'Gene TP53', style='filled', fillcolor='#c5e1a5')

# Define the 'Gene_C' node. A third gene.
s.node('Gene_C', 'Gene BCL2', style='filled', fillcolor='#c5e1a5')

# Define the 'Gene_D' node. A fourth gene.
s.node('Gene_D', 'Gene STAT3', style='filled', fillcolor='#c5e1a5')

# Define transcripts for Gene_A (4 transcripts).
s.node('A_T1', 'Transcript MYC-001')
s.node('A_T2', 'Transcript MYC-002')
s.node('A_T3', 'Transcript MYC-003 | <here> variant | coding | intron')
s.node('A_T4', 'Transcript MYC-004')

# Define transcripts for Gene_B (5 transcripts).
s.node('B_T1', 'Transcript TP53-001')
s.node('B_T2', 'Transcript TP53-002')
s.node('B_T3', 'Transcript TP53-003')
s.node('B_T4', 'Transcript TP53-004')
s.node('B_T5', 'Transcript TP53-005')

# Define transcripts for Gene_C (4 transcripts).
s.node('C_T1', 'Transcript BCL2-001')
s.node('C_T2', 'Transcript BCL2-002')
s.node('C_T3', 'Transcript BCL2-003')
s.node('C_T4', 'Transcript BCL2-004')

# Define transcripts for Gene_D (4 transcripts).
s.node('D_T1', 'Transcript STAT3-001')
s.node('D_T2', 'Transcript STAT3-002')
s.node('D_T3', 'Transcript STAT3-003')
s.node('D_T4', 'Transcript STAT3-004')

# --- Define the edges to show the relationships ---

# Edges from Gene_A to its transcripts.
s.edge('Gene_A', 'A_T1')
s.edge('Gene_A', 'A_T2')
s.edge('Gene_A', 'A_T3')
s.edge('Gene_A', 'A_T4')

# Edges from Gene_B to its transcripts.
s.edge('Gene_B', 'B_T1')
s.edge('Gene_B', 'B_T2')
s.edge('Gene_B', 'B_T3')
s.edge('Gene_B', 'B_T4')
s.edge('Gene_B', 'B_T5')

# Edges from Gene_C to its transcripts.
s.edge('Gene_C', 'C_T1')
s.edge('Gene_C', 'C_T2')
s.edge('Gene_C', 'C_T3')
s.edge('Gene_C', 'C_T4')

# Edges from Gene_D to its transcripts.
s.edge('Gene_D', 'D_T1')
s.edge('Gene_D', 'D_T2')
s.edge('Gene_D', 'D_T3')
s.edge('Gene_D', 'D_T4')

# Add specific relationships like alternative splicing and regulatory interactions.
s.edge('A_T2', 'A_T3:here', label='alternative splicing')
s.edge('B_T1', 'C_T2', label='regulatory interaction')
s.edge('C_T3', 'D_T1', label='regulatory interaction')
s.edge('D_T2', 'A_T4', label='regulatory interaction')

# Render the graph to a file and open it for viewing.
s.view()


ModuleNotFoundError: No module named 'graphviz'

In [None]:
plausibility_filtered = plausibility_filtering(config, isoform_unique, isoform_categories, tf_database)
