In [None]:
import json
import os
import shutil

import matplotlib.pyplot as plt
import pandas as pd

from topology import (
    ensure_dir,
    get_transcripts_and_sequences,
    align_protein_sequences,
    run_deeptmhmm,
    generate_isoform_mapping,
    create_membrane_topology_objects
)
from expression import (
    get_expression_data,
    plot_expression
)

## Variables

# Output directory
out_dir = "./files"
out_dir_for_plots = "./app/files_for_plots"

# Proteins we want to show
proteins = ["HER2", "CD20"]

# Email passed to the alignment step.
# NOTE(review): a personal address is committed here — consider loading it from
# configuration or the environment before sharing the notebook.
email = "s242830@dtu.dk"

# Ensembl gene ID for each protein
proteins_ids = {
    "HER2": "ENSG00000141736",
    "CD20": "ENSG00000156738"
}

# Fail fast: every requested protein needs a gene ID, otherwise the loop below
# would only raise after the earlier proteins had already been processed.
unknown_proteins = [name for name in proteins if name not in proteins_ids]
if unknown_proteins:
    raise KeyError(f"No Ensembl gene ID configured for: {', '.join(unknown_proteins)}")

# Running the analysis for each protein
for protein in proteins:
    print(f"Processing {protein}...")

    # All per-protein artefacts are written below this directory.
    protein_dir = f"{out_dir}/{protein}"
    ensure_dir(protein_dir)

    get_transcripts_and_sequences(proteins_ids[protein], protein_dir)
    align_protein_sequences(email, protein_dir)
    run_deeptmhmm(protein_dir)
    mapping = generate_isoform_mapping(protein_dir)

    # Use a context manager so the JSON file is flushed and closed before
    # anything downstream (e.g. the copy cell) reads it.
    mapping_path = f"{protein_dir}/transcript_to_isoform_mapping.json"
    with open(mapping_path, "w") as mapping_file:
        json.dump(mapping, mapping_file)

    create_membrane_topology_objects(mapping, protein_dir)

    # NOTE: the expression-data step below is currently disabled. As written it
    # is hard-coded to HER2; it would need to use the loop's protein directory
    # before being re-enabled.
    # with open(out_dir + "/HER2/isoforms.fasta") as f:
    #     lines = f.readlines()
    #     headers = [line.strip()[1:] for line in lines if line.startswith(">")]
    #     all_ids = []
    #     for h in headers:
    #         all_ids.extend(h.split("|"))

    # get_expression_data(all_ids, out_dir + "/HER2")

  re.sub(r"^[^[]+[[]([^]]*)[]].*$", r"\1", query, flags=re.DOTALL))


Processing HER2...
Fetching transcripts for gene with Ensembl ID: ENSG00000141736...
Fetching protein sequence for Ensembl IDs: ENST00000584601 ENST00000584014 ENST00000578199 ENST00000445658 ENST00000584450 ENST00000269571 ENST00000578502 ENST00000578709 ENST00000582818 ENST00000580074 ENST00000863095 ENST00000863096 ENST00000863097 ENST00000863098 ENST00000863099 ENST00000863100 ENST00000863101 ENST00000863102 ENST00000863103 ENST00000938923 ENST00000938924 ENST00000938925 ENST00000959774 ENST00000959775
Saved transcript_to_isoform_mapping.csv to ./files/HER2
Saved sequences_data.json to ./files/HER2
Processing CD20...
Fetching transcripts for gene with Ensembl ID: ENSG00000156738...
Fetching protein sequence for Ensembl IDs: ENST00000532491 ENST00000532073 ENST00000345732 ENST00000534668 ENST00000528313 ENST00000533306 ENST00000674194 ENST00000389939 ENST00000904593 ENST00000904594 ENST00000966396
Saved transcript_to_isoform_mapping.csv to ./files/CD20
Saved sequences_data.json to .

In [2]:
# Pushing relevant files to the app for plotting

# Files from each protein's output directory that are copied to the app.
files_for_plots = [
    "transcript_to_isoform_mapping.json",
    "membrane_topology.csv",
    "sequences_data.json",
]

# Copy with shutil instead of shelling out to `cp`: no shell string building,
# works on any platform, and failures are no longer silently ignored.
missing_files = []
for protein in proteins:
    ensure_dir(out_dir_for_plots + "/" + protein)
    for file_name in files_for_plots:
        source_path = f"{out_dir}/{protein}/{file_name}"
        target_path = f"{out_dir_for_plots}/{protein}/{file_name}"
        if not os.path.isfile(source_path):
            # Keep copying whatever exists; report everything missing at the end.
            missing_files.append(source_path)
            continue
        shutil.copy(source_path, target_path)

if missing_files:
    raise FileNotFoundError(
        "Could not copy files for plotting; missing: " + ", ".join(missing_files)
    )