<a href="https://colab.research.google.com/github/bicks1/hughesintern/blob/main/gff_gene_individual_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**File Name**:

```
gff_gene_individual_v5.ipynb
```

**Description**:

```
This program is a part of a series of programs for information extraction and mining of gene annotations in GFF3 files.

Using this script, transcript information is extracted for individual genes.

Feature/Type (column 3) definitions: http://www.sequenceontology.org/browser/obob.cgi

Biotype (attribute in column 9) definitions: https://www.gencodegenes.org/pages/biotypes.html

```

**Authors**:

```
Sophia Bick, Chun Liang
```


### [Step 1]: Install Python modules and mount the Google Drive that contains the GFF3 files

In [None]:
!pip install gffutils

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
hg38gff = "/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/Homo_sapiens.GRCh38.109.chr.gff3"

In [None]:
import gffutils

In [None]:
import pandas as pd

In [None]:
database = "/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/test.db"

In [None]:
# The following command connects to the database (test.db) previously created
# FeatureDB methods allow interaction with the database
db = gffutils.FeatureDB(database, keep_order=False)

### [Step 2]: Get an overview of transcript information for each type of gene (gene, ncRNA_gene, pseudogene) in the whole GFF3 file

In [None]:
# It takes a long time (about 8 mins) to build the database that contains parent (genes) and child (transcripts) relationships
# The following command imports the file into a local sqlite3 file-based database ("test.db")

#db = gffutils.create_db(hg38gff, dbfn='test.db', force=True, keep_order=False, merge_strategy='create_unique', sort_attribute_values=False)

#don't run this anymore

In [None]:
# Tally the transcript-level children (mRNA, transcript, lnc_RNA, etc.) of every feature typed "gene".
# Q: does db.features_of_type("gene") also return ncRNA_gene and pseudogene features?
# A: No. Every wanted feature type has to be listed explicitly in a tuple,
#    e.g. ("gene", "ncRNA_gene", "pseudogene") for all three.
num_for_gene = 0
for gene in db.features_of_type(("gene")):
  for c in db.children(
      gene,
      featuretype=(
          'mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript',
          'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript',
          'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA',
          'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA',
      ),
  ):
    num_for_gene = num_for_gene + 1
print(num_for_gene)

172078


In [None]:
# Tally the transcript-level children (mRNA, transcript, lnc_RNA, etc.) of every "ncRNA_gene" feature.
num_for_ncRNA_gene = 0
for ncRNA_gene in db.features_of_type("ncRNA_gene"):
  for c in db.children(
      ncRNA_gene,
      featuretype=(
          'mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript',
          'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript',
          'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA',
          'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA',
      ),
  ):
    num_for_ncRNA_gene = num_for_ncRNA_gene + 1
print(num_for_ncRNA_gene)

63976


In [None]:
# Tally the transcript-level children (mRNA, transcript, lnc_RNA, etc.) of every "pseudogene" feature.
num_for_pseudogene = 0
for pseudogene in db.features_of_type("pseudogene"):
  for c in db.children(
      pseudogene,
      featuretype=(
          'mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript',
          'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript',
          'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA',
          'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA',
      ),
  ):
    num_for_pseudogene = num_for_pseudogene + 1
print(num_for_pseudogene)

16684


In [None]:
# Tally the transcript-level children (mRNA, transcript, lnc_RNA, etc.) across all three
# gene feature types at once ("gene", "ncRNA_gene", "pseudogene").
num_for_all = 0
for feature in db.features_of_type(("gene", "ncRNA_gene", "pseudogene")):
  for c in db.children(
      feature,
      featuretype=(
          'mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript',
          'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript',
          'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA',
          'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA',
      ),
  ):
    num_for_all = num_for_all + 1
print(num_for_all)

252738


### trans_stats function
Transcript information for all genes in the file, returned in dictionaries

For **all genes in the file** information returned includes:

The number of transcripts and the transcript that is the
* longest
* shortest
* lowest number of exons
* highest number of exons

In [None]:
def trans_stats(db, gene_cat):
  """Summarise transcript lengths and exon counts for every gene of a category.

  :param db: FeatureDB object from gffutils that allows connection to the built sqlite3 file-based database
  :param gene_cat: one of the gene types [gene, ncRNA_gene, pseudogene]
      (passed straight to db.features_of_type)
  :return: (gene_trans_d, gene_exon_d), both keyed by the gene's gene_id
      gene_trans_d value: "<# transcripts>-<shortest ID>-<shortest length>-<longest ID>-<longest length>"
      gene_exon_d value:  "<ID(s) with fewest exons>-<fewest>-<ID(s) with most exons>-<most>"

  Transcripts tied on exon count are all listed, joined with "-". For the
  length extremes the first transcript encountered wins a tie.
  A gene without any child transcript gets "None" labels and keeps the
  placeholder bounds (minimum 9999999999, maximum 0), matching trans_stats_csv.
  """
  # feature types treated as "transcripts" (direct children of a gene)
  transcript_types = ('mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript', 'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript', 'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA', 'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA')
  no_min = 9999999999  # placeholder minimum reported when there is nothing to measure

  gene_trans_d = {}
  gene_exon_d = {}

  for gene in db.features_of_type(gene_cat):  # for ea featuretype gene
    key = gene["gene_id"][0]  # key is gene's gene_id (internally 1 item list)

    # one (transcript_id, length in nt, exon count) record per child transcript
    records = []
    for c in db.children(gene, featuretype=transcript_types):
      exon_count = sum(1 for _ in db.children(c, featuretype="exon"))
      records.append((c["transcript_id"][0], c.end - c.start + 1, exon_count))

    if records:
      # min()/max() return the first extreme they meet, like the strict </> scan they replace
      trans_min_label, trans_min, _ = min(records, key=lambda r: r[1])
      trans_max_label, trans_max, _ = max(records, key=lambda r: r[1])
      exon_min = min(r[2] for r in records)
      exon_max = max(r[2] for r in records)
      # collecting ties after the extremes are known avoids a stray leading "-"
      # when the first transcript has zero exons
      exon_min_label = "-".join(r[0] for r in records if r[2] == exon_min)
      exon_max_label = "-".join(r[0] for r in records if r[2] == exon_max)
    else:
      trans_min, trans_max = no_min, 0
      exon_min, exon_max = no_min, 0
      trans_min_label = trans_max_label = exon_min_label = exon_max_label = "None"

    gene_trans_d[key] = "{0}-{1}-{2}-{3}-{4}".format(len(records), trans_min_label, trans_min, trans_max_label, trans_max)
    gene_exon_d[key] = "{0}-{1}-{2}-{3}".format(exon_min_label, exon_min, exon_max_label, exon_max)

  return gene_trans_d, gene_exon_d

In [None]:
g_trans_d, g_exon_d = trans_stats(db, "gene")

In [None]:
p_trans_d, p_exon_d = trans_stats(db, "pseudogene")

In [None]:
nc_trans_d, nc_exon_d = trans_stats(db, "ncRNA_gene")

#### Writing files to view the transcript and exon dictionaries for the three gene categories tested here

(printing them to the console would produce too much output)

In [None]:
# Dump both summary dictionaries for the "gene" category to gene_dict.txt.
# The `with` block closes the file even if one of the writes raises.
with open("gene_dict.txt", "w") as fh:
  fh.write('Transcription statistics dictionary\n')
  fh.write('\n')
  fh.write(str(g_trans_d))
  fh.write('\n')
  # sanity line: both dictionaries should have one entry per "gene" feature in the database
  fh.write("Length of transcription dictionary {0} and exon dictionary {1} == count of desired features in file {2}".format(str(len(g_trans_d)), str(len(g_exon_d)), str(db.count_features_of_type("gene"))))
  fh.write('\n')
  fh.write(str(g_exon_d))

In [None]:
# Dump both summary dictionaries for the "pseudogene" category to pseudogene_dict.txt.
# The `with` block closes the file even if one of the writes raises.
with open("pseudogene_dict.txt", "w") as fh:
  fh.write('Transcription statistics dictionary\n')
  fh.write('\n')
  fh.write(str(p_trans_d))
  fh.write('\n')
  # sanity line: both dictionaries should have one entry per "pseudogene" feature in the database
  fh.write("Length of transcription dictionary {} and exon dictionary {} == count of desired features in file {}".format(str(len(p_trans_d)), str(len(p_exon_d)), str(db.count_features_of_type("pseudogene"))))
  fh.write('\n')
  fh.write(str(p_exon_d))


In [None]:
# Dump both summary dictionaries for the "ncRNA_gene" category to ncRNA_gene_dict.txt.
# The `with` block closes the file even if one of the writes raises.
with open("ncRNA_gene_dict.txt", "w") as fh:
  fh.write('Transcription statistics dictionary\n')
  fh.write('\n')
  fh.write(str(nc_trans_d))
  fh.write('\n')
  # sanity line: both dictionaries should have one entry per "ncRNA_gene" feature in the database
  fh.write("Length of transcription dictionary {} and exon dictionary {} == count of desired features in file {}".format(str(len(nc_trans_d)), str(len(nc_exon_d)), str(db.count_features_of_type("ncRNA_gene"))))
  fh.write('\n')
  fh.write(str(nc_exon_d))

In [None]:
! ls -al

total 2076860
drwxr-xr-x 1 root root       4096 Jun 29 19:51 .
drwxr-xr-x 1 root root       4096 Jun 29 19:36 ..
drwxr-xr-x 4 root root       4096 Jun 28 13:37 .config
drwx------ 6 root root       4096 Jun 29 19:42 drive
-rw-r--r-- 1 root root    3098096 Jun 29 19:51 gene_dict.txt
-rw-r--r-- 1 root root    3360241 Jun 29 19:51 ncRNA_gene_dict.txt
-rw-r--r-- 1 root root    1876662 Jun 29 19:51 pseudogene_dict.txt
drwxr-xr-x 1 root root       4096 Jun 28 13:38 sample_data
-rw-r--r-- 1 root root 2118213632 Jun 29 19:49 test.db


In [None]:
from google.colab import files
files.download('gene_dict.txt')
files.download('pseudogene_dict.txt')
files.download('ncRNA_gene_dict.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

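Note: some ncRNA_gene features have no child transcript in the GFF file. Their dictionary entries report 0 transcripts and keep the initial placeholder minimum (9999999999) instead of a real transcript length.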



---



### indiv_gene function
Query one gene for transcript info

Querying **one individual gene**, information returned includes:

* Counts of each type of transcript
* longest transcript
* shortest transcript
* average transcript length

For each transcript
* the number of exons
* exonic length of transcript
* average number of exons per transcript


In [None]:
# you MUST know the gene's ID= attribute in your GFF file to query
def indiv_gene(db, gene_ID):
  """Print a transcript/exon report for one gene and return its summary strings.

  :param db: FeatureDB object from gffutils that allows connection to the built sqlite3 file-based database
  :param gene_ID: the gene's ID= attribute in the GFF file, e.g. 'gene:ENSG00000187634'
  :return: (gene_trans_d, gene_exon_d), each holding a single entry keyed by gene_ID
      gene_trans_d value: "<# transcripts>-<shortest ID>-<shortest length>-<longest ID>-<longest length>"
      gene_exon_d value:  "<ID(s) with fewest exons>-<fewest>-<ID(s) with most exons>-<most>"

  Printed report: total and per-type transcript counts, shortest/longest
  transcript, average transcript length, then exon count and exonic length
  per transcript and the average exon count.
  A gene with no child transcripts no longer raises ZeroDivisionError: a short
  notice is printed and the returned entries carry "None" labels with the
  placeholder bounds (minimum 9999999999, maximum 0).
  """
  # feature types treated as "transcripts" (direct children of a gene)
  transcript_types = ('mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript', 'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript', 'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA', 'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA')
  no_min = 9999999999  # placeholder minimum reported when there is nothing to measure

  gene_trans_d = {}
  gene_exon_d = {}
  count_d = {}     # transcript featuretype -> how many transcripts of that type
  exon_print = {}  # transcript_id -> per-transcript line of the report
  records = []     # (transcript_id, featuretype, length in nt, exon count)

  gene = db[gene_ID]

  # each transcript/child of gene
  for c in db.children(gene, featuretype=transcript_types):
    tid = c["transcript_id"][0]
    category = c.featuretype
    count_d[category] = count_d.get(category, 0) + 1

    exon_count = sum(1 for _ in db.children(c, featuretype="exon"))
    exonic_bp = db.children_bp(c, child_featuretype="exon")
    exon_print[tid] = "category: {0} | exons: {1} | exonic_length: {2}".format(category, exon_count, exonic_bp)
    records.append((tid, category, c.end - c.start + 1, exon_count))

  counter = len(records)
  if counter == 0:
    # nothing to average or compare; report it instead of dividing by zero
    gene_trans_d[gene_ID] = "{0}-{1}-{2}-{3}-{4}".format(counter, "None", no_min, "None", 0)
    gene_exon_d[gene_ID] = "{0}-{1}-{2}-{3}".format("None", no_min, "None", 0)
    print("The total number of transcripts:", counter)
    print("No transcripts found for", gene_ID)
    return gene_trans_d, gene_exon_d

  # min()/max() return the first extreme they meet, like the strict </> scan they replace
  trans_min_label, trans_min_type, trans_min, _ = min(records, key=lambda r: r[2])
  trans_max_label, trans_max_type, trans_max, _ = max(records, key=lambda r: r[2])
  exon_min = min(r[3] for r in records)
  exon_max = max(r[3] for r in records)
  # every transcript tied for the extreme is listed, in query order
  exon_min_label = "-".join(r[0] for r in records if r[3] == exon_min)
  exon_max_label = "-".join(r[0] for r in records if r[3] == exon_max)

  average = sum(r[2] for r in records) / counter
  average_exon = sum(r[3] for r in records) / counter

  gene_trans_d[gene_ID] = "{0}-{1}-{2}-{3}-{4}".format(counter, trans_min_label, trans_min, trans_max_label, trans_max)
  gene_exon_d[gene_ID] = "{0}-{1}-{2}-{3}".format(exon_min_label, exon_min, exon_max_label, exon_max)

  print("The total number of transcripts:", counter)
  print("Transcript count breakdown:")
  for k, v in count_d.items():
    print(k, v)
  print("The shortest transcript:", trans_min_label, "with length", trans_min, "| category:", trans_min_type)
  print("The longest transcript:", trans_max_label, "with length", trans_max, "| category:", trans_max_type)
  print("The average transcript length:", average)

  print("\nExon count per transcript")
  for k, v in exon_print.items():
    print(k, v)
  print("Average number of exons per transcript:", average_exon)

  return gene_trans_d, gene_exon_d

In [None]:
t, d = indiv_gene(db, 'gene:ENSG00000187634')

The total number of transcripts: 15
Transcript count breakdown:
mRNA 11
lnc_RNA 3
transcript 1
The shortest transcript: ENST00000466827 with length 700 | category: lnc_RNA
The longest transcript: ENST00000616016 with length 20652 | category: mRNA
The average transcript length: 12418.333333333334

Exon count per transcript
ENST00000341065 category: mRNA | exons: 12 | exonic_length: 2191
ENST00000342066 category: mRNA | exons: 14 | exonic_length: 2557
ENST00000437963 category: mRNA | exons: 5 | exonic_length: 387
ENST00000455979 category: mRNA | exons: 7 | exonic_length: 1731
ENST00000464948 category: lnc_RNA | exons: 2 | exonic_length: 657
ENST00000466827 category: lnc_RNA | exons: 2 | exonic_length: 542
ENST00000474461 category: lnc_RNA | exons: 4 | exonic_length: 862
ENST00000478729 category: transcript | exons: 3 | exonic_length: 318
ENST00000616016 category: mRNA | exons: 14 | exonic_length: 3465
ENST00000616125 category: mRNA | exons: 11 | exonic_length: 1722
ENST00000617307 catego

In [None]:
print(t)  # transcription stats dictionary
print(d)  # exon stats dictionary, NOTE: two transcripts shown means both have the lowest or highest exon count

{'gene:ENSG00000187634': '15-ENST00000466827-700-ENST00000616016-20652'}
{'gene:ENSG00000187634': 'ENST00000464948-ENST00000466827-2-ENST00000342066-ENST00000616016-ENST00000618323-14'}


In [None]:
indiv_gene(db, "gene:ENSG00000134644")

The total number of transcripts: 23
Transcript count breakdown:
mRNA 18
lnc_RNA 3
transcript 2
The shortest transcript: ENST00000498627 with length 1493 | category: lnc_RNA
The longest transcript: ENST00000426105 with length 134212 | category: mRNA
The average transcript length: 66275.52173913043

Exon count per transcript
ENST00000257075 category: mRNA | exons: 22 | exonic_length: 5360
ENST00000373741 category: mRNA | exons: 22 | exonic_length: 4242
ENST00000373742 category: mRNA | exons: 20 | exonic_length: 3515
ENST00000373747 category: mRNA | exons: 22 | exonic_length: 5375
ENST00000424085 category: mRNA | exons: 18 | exonic_length: 4631
ENST00000426105 category: mRNA | exons: 22 | exonic_length: 5385
ENST00000440538 category: mRNA | exons: 22 | exonic_length: 3936
ENST00000471894 category: lnc_RNA | exons: 4 | exonic_length: 812
ENST00000480602 category: mRNA | exons: 7 | exonic_length: 906
ENST00000490546 category: transcript | exons: 5 | exonic_length: 707
ENST00000498419 catego

({'gene:ENSG00000134644': '23-ENST00000498627-1493-ENST00000426105-134212'},
 {'gene:ENSG00000134644': 'ENST00000498627-ENST00000531867-2-ENST00000257075-ENST00000373741-ENST00000373747-ENST00000426105-ENST00000440538-22'})



---



### trans_stats_csv (all genes in the GFF3 file)

Information returned in **2 dictionaries** includes:

* number of transcripts
* longest transcript
* shortest transcript
* transcript with lowest number of exons
* transcript with highest number of exons

**Outputted CSV** includes:
* transcript length
* number of exons and introns per transcript
* positions of all exons, introns, and genic features (5'UTR, CDS, 3'UTR) per transcript




In [None]:
def trans_stats_csv(db, gene_cat, filename, output="string"):
  """Write one CSV row per transcript and return per-gene summary dictionaries.

  :param db: FeatureDB object from gffutils connected to the built sqlite3 database
  :param gene_cat: one of the gene types [gene, ncRNA_gene, pseudogene], or a tuple of them
  :param filename: desired filename of CSV file outputted (opened in 'w' mode)
  :param output: determine how "x_positions" columns are formatted; "string"
      joins the entries with ";", any other value (e.g. "list") passes the
      Python list to the csv writer unchanged
  :return: (gene_trans, gene_exon), both keyed by the gene's gene_id
      gene_trans value: "<# transcripts>-<shortest ID>-<shortest length>-<longest ID>-<longest length>"
      gene_exon value:  "<ID(s) with fewest exons>-<fewest>-<ID(s) with most exons>-<most>"
      Genes without transcripts get "None" labels and the placeholder bounds
      (minimum 9999999999, maximum 0).

  Position entries have the form "<label>:<start>-<end>:<length>" where length
  is abs(end - start) + 1. When the gene's strand is "-", start and end are
  swapped and introns/UTRs/CDSs are numbered from the high-coordinate end;
  any other strand value is reported in plain coordinate order.
  Exons are labelled with their "rank" attribute. A warning is printed for
  any feature whose stored end lies before its start.
  """
  import csv
  import re

  #########################
  #### CHANGE transcript_types if want to change type of transcripts analyzed
  transcript_types = ('mRNA', 'lnc_RNA', 'transcript', 'pseudogenic_transcript', 'ncRNA', 'snRNA', 'miRNA', 'unconfirmed_transcript', 'snoRNA', 'V_gene_segment', 'J_gene_segment', 'scRNA', 'rRNA', 'D_gene_segment', 'C_gene_segment', 'tRNA')
  no_min = 9999999999  # placeholder minimum reported when there is nothing to measure
  fields = ["chromosome", "strand", "gene_name", "gene_id", "gene_start", "gene_end", "transcript_name", "transcript_id", "transcript_length", "exon_number", "intron_number", "exon_positions", "intron_positions", "genic_positions"]

  def label_number(entry, which=0):
    # sort key: the which-th run of digits in a position entry.
    # "Exon3:..." -> run 0 is 3; "5UTR2:..." -> run 0 is the 5 of "5UTR", run 1 is 2
    return int(re.findall(r'\d+', entry)[which])

  def position_entry(feature, label, what, transcript_id, reverse):
    # "<label>:<start>-<end>:<length>", start/end swapped when reverse is True
    raw_len = feature.end - feature.start + 1
    if raw_len <= 0:  # checked on the signed span; the reported length below can never be <= 0
      print("WARNING: Length {0} of {1} in transcript {2} is negative or zero".format(raw_len, what, transcript_id))
    first, last = (feature.end, feature.start) if reverse else (feature.start, feature.end)
    return "{0}:{1}-{2}:{3}".format(label, first, last, abs(feature.end - feature.start) + 1)

  def numbered_entries(features, prefix, what, transcript_id, reverse):
    # number features 1..n in query order, or n..1 when reverse is True
    features = list(features)
    total = len(features)
    entries = []
    for i, feature in enumerate(features, 1):
      number = total - i + 1 if reverse else i
      entries.append(position_entry(feature, prefix + str(number), what, transcript_id, reverse))
    return entries

  gene_trans = {}
  gene_exon = {}

  # write csv file for the gene category specified
  with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(fields)  # write header of csv
    for gene in db.features_of_type(gene_cat):  # for ea featuretype gene
      key = gene["gene_id"][0]  # key is gene's gene_id (internally 1 item list)
      reverse = gene.strand == "-"
      gene_start, gene_end = (gene.end, gene.start) if reverse else (gene.start, gene.end)

      # if gene or transcript does not have a name, is "None" in CSV
      try:
        gene_name = gene["Name"][0]
      except KeyError:
        gene_name = "None"

      records = []  # (transcript_id, length, exon count) for the per-gene summary

      # each transcript/child of gene
      for c in db.children(gene, featuretype=transcript_types):
        tid = c["transcript_id"][0]
        raw_length = c.end - c.start + 1
        if raw_length <= 0:
          print("WARNING: Length {0} of {1} is negative or zero".format(raw_length, tid))
        length = abs(c.end - c.start) + 1  # length of transcript (nt)

        # exons, labelled and ordered by their rank attribute
        exons = list(db.children(c, featuretype="exon"))
        exon_count = len(exons)
        exon_positions_l = [position_entry(exon, "Exon" + str(exon["rank"][0]), "an exon", tid, reverse) for exon in exons]
        exon_positions_l.sort(key=label_number)

        # introns are the gaps between consecutive exons; the exons are queried
        # again, ordered by start, because interfeatures pairs each feature
        # with the one that precedes it
        intron_positions_l = []
        exons_by_start = db.children(c, featuretype="exon", order_by="start")
        for i, intron in enumerate(db.interfeatures(exons_by_start, "intron", numeric_sort=True), 1):
          number = exon_count - i if reverse else i
          intron_positions_l.append(position_entry(intron, "Intron" + str(number), "an intron", tid, reverse))
        intron_count = len(intron_positions_l)
        intron_positions_l.sort(key=label_number)

        # genic features, always listed in the order 5UTR, CDS, 3UTR
        genic_positions_l_5 = numbered_entries(db.children(c, featuretype="five_prime_UTR"), "5UTR", "a 5' UTR", tid, reverse)
        genic_positions_l_c = numbered_entries(db.children(c, featuretype="CDS"), "CDS", "a CDS", tid, reverse)
        genic_positions_l_3 = numbered_entries(db.children(c, featuretype="three_prime_UTR"), "3UTR", "a 3' UTR", tid, reverse)
        genic_positions_l_5.sort(key=lambda entry: label_number(entry, 1))
        genic_positions_l_c.sort(key=label_number)
        genic_positions_l_3.sort(key=lambda entry: label_number(entry, 1))
        genic_positions_l = genic_positions_l_5 + genic_positions_l_c + genic_positions_l_3

        if output == "string":
          exon_positions_l = ';'.join(exon_positions_l)
          intron_positions_l = ';'.join(intron_positions_l)
          genic_positions_l = ';'.join(genic_positions_l)

        try:
          transcript_name = c["Name"][0]
        except KeyError:
          transcript_name = "None"

        # write new CSV entry for ea transcript of a gene
        writer.writerow([gene.seqid, gene.strand, gene_name, key, gene_start, gene_end, transcript_name, tid, length, exon_count, intron_count, exon_positions_l, intron_positions_l, genic_positions_l])
        records.append((tid, length, exon_count))

      # overall stats on transcripts and exons for this gene
      if records:
        # min()/max() return the first extreme they meet, like the strict </> scan they replace
        trans_min_label, trans_min, _ = min(records, key=lambda r: r[1])
        trans_max_label, trans_max, _ = max(records, key=lambda r: r[1])
        exon_min = min(r[2] for r in records)
        exon_max = max(r[2] for r in records)
        # every transcript tied for the extreme is listed, in query order
        exon_min_label = "-".join(r[0] for r in records if r[2] == exon_min)
        exon_max_label = "-".join(r[0] for r in records if r[2] == exon_max)
      else:
        trans_min, trans_max = no_min, 0
        exon_min, exon_max = no_min, 0
        trans_max_label = trans_min_label = exon_max_label = exon_min_label = "None"

      gene_trans[key] = "{0}-{1}-{2}-{3}-{4}".format(len(records), trans_min_label, trans_min, trans_max_label, trans_max)
      gene_exon[key] = "{0}-{1}-{2}-{3}".format(exon_min_label, exon_min, exon_max_label, exon_max)

  return gene_trans, gene_exon

In [None]:
for exon in db.children(db["gene:ENSG00000006007"], featuretype="exon", order_by=("end"), reverse=True):
  print(exon)
  #transcript:ENST00000353258

**NOTE**: trans_stats_csv currently processes every transcript type. To restrict it to "mRNA" only, edit the list of transcript feature types inside the function by hand.

In [None]:
# GENE category only
# about 3.5/4 min for only mRNA
# 8 min for all transcripts
trans_d, exon_d = trans_stats_csv(db, "gene", "gff_gene_individual_gene.csv")

In [None]:
print(trans_d)
print(exon_d)

In [None]:
# ALL gene categories
# about 3.5/4 min for only mRNA (which outputs the same as doing "gene" with only mRNA)
# about 9 min for all transcripts
trans_d2, exon_d2 = trans_stats_csv(db, ("gene", "pseudogene", "ncRNA_gene"), "gff_gene_individual_all.csv")

In [None]:
# Copy the CSV to Google Drive by reading it with pandas and writing it back out.
# index=False keeps pandas' row index out of the written file, so the Drive copy
# has the same columns as the local CSV instead of gaining an unnamed first column.
df = pd.read_csv("gff_gene_individual_gene.csv")
df.to_csv("/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/gff_gene_individual_gene.csv", index=False)
print("Number of transcript entries:", len(df))


  df = pd.read_csv("gff_gene_individual_gene.csv")


Number of transcript entries: 172078


In [None]:
# Copy the all-categories CSV to Google Drive via pandas.
# index=False keeps pandas' row index out of the written file, so the Drive copy
# has the same columns as the local CSV instead of gaining an unnamed first column.
df2 = pd.read_csv("gff_gene_individual_all.csv")
df2.to_csv("/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/gff_gene_individual_all.csv", index=False)
print("Number of transcript entries:", len(df2))

  df2 = pd.read_csv("gff_gene_individual_all.csv")


Number of transcript entries: 252738
