<a href="https://colab.research.google.com/github/bicks1/hughesintern/blob/main/gff_gene_consensus_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**File Name**:

```
gff_gene_consensus_v3.ipynb
```

**Description**:

```
This program is a part of a series of programs for information extraction and mining of gene annotations in GFF3 files.

Using this script, exonic and intronic gene information is extracted for individual genes.

Feature/Type (column 3) definitions: http://www.sequenceontology.org/browser/obob.cgi

Biotype (attribute in column 9) definitions: https://www.gencodegenes.org/pages/biotypes.html

```

**Authors**:

```
Sophia Bick, Chun Liang
```


### [Step 1]: Install Python modules and map the Google Drive that contains the GFF3 files

In [None]:
!pip install gffutils

Collecting gffutils
  Downloading gffutils-0.12-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyfaidx>=0.5.5.2 (from gffutils)
  Downloading pyfaidx-0.7.2.1-py3-none-any.whl (28 kB)
Collecting argh>=0.26.2 (from gffutils)
  Downloading argh-0.28.1-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting argcomplete>=1.9.4 (from gffutils)
  Downloading argcomplete-3.1.1-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.5/41.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting simplejson (from gffutils)
  Downloading simplejson-3.19.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.9/137.9 kB[0

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the GFF3 file stored
# there (see the path assigned to hg38gff) can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path, on the mounted Drive, of the GFF3 annotation file used as input to the database build
# (the file name indicates Homo sapiens, assembly GRCh38, release 109).
hg38gff = "/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/Homo_sapiens.GRCh38.109.chr.gff3"

In [None]:
import gffutils

In [None]:
import pandas as pd

### [Step 2]: Get an overview of each type of gene (gene, ncRNA_gene, pseudogene) in the whole GFF3 file

In [None]:
# Slow step (about 8 minutes): parse the GFF3 file into a local sqlite3 file-based
# database, "test.db", holding the parent (gene) / child (transcript) relationships.
# force=True replaces any "test.db" left over from an earlier run.
gffutils.create_db(
    hg38gff,
    dbfn='test.db',
    force=True,
    keep_order=False,
    merge_strategy='create_unique',
    sort_attribute_values=False,
)

# Connect to the database that was just written; every query in the cells below
# goes through the methods of this FeatureDB handle.
db = gffutils.FeatureDB('test.db', keep_order=False)

In [None]:
# Count the features in the database whose feature type is "gene", then show the total
num_for_gene = db.count_features_of_type("gene")
print(num_for_gene)

21507


In [None]:
# Count the features in the database whose feature type is "pseudogene", then show the total
num_for_pseudogene = db.count_features_of_type("pseudogene")
print(num_for_pseudogene)

15224


In [None]:
# Count the features in the database whose feature type is "ncRNA_gene", then show the total
num_for_ncRNA_gene = db.count_features_of_type("ncRNA_gene")
print(num_for_ncRNA_gene)

25925


In [None]:
# Grand total over the three gene-level feature types ("gene", "ncRNA_gene", "pseudogene"),
# built from the per-type counts computed in the cells above
num_for_all = sum((num_for_gene, num_for_ncRNA_gene, num_for_pseudogene))
print(num_for_all)

62656




---



### consensus_csv function
Determine the consensus exon and intron regions for each gene.

* consensus exons: regions where at least one transcript has an exon at that location
* consensus introns: the gaps between consecutive consensus exons, i.e. regions inside the gene where no exons are found in any transcript at that location

In [None]:
def _merge_exon_spans(spans):
  """Collapse (start, end) spans into the sorted list of their union.

  Coordinates are treated as 1-based and inclusive, so spans that overlap OR
  directly abut (next start == previous end + 1) are fused into one block;
  otherwise a zero-length "intron" would be reported between them.

  :param spans: iterable of (start, end) integer pairs with start <= end
  :return: list of [start, end] lists in ascending order, separated by gaps of >= 1 position
  """
  merged = []
  for span_start, span_end in sorted(spans):
    if merged and span_start <= merged[-1][1] + 1:
      # touches the block being built: only ever extend its end, never shrink it
      merged[-1][1] = max(merged[-1][1], span_end)
    else:
      merged.append([span_start, span_end])
  return merged


def consensus_csv(db, gene_cat, filename, output="string"):
  """Write one CSV row per gene describing its consensus exons and introns.

  A consensus exon is a maximal region covered by an exon of at least one
  transcript of the gene; a consensus intron is the gap between two consecutive
  consensus exons. Entries are formatted "ConsensusExon_<n>:<start>-<end>:<length>"
  and "ConsensusIntron_<n>:<start>-<end>:<length>".

  For "-" strand genes everything is reported in transcription order: blocks are
  numbered from the highest coordinate downwards, each span is written
  high-low, and gene_start/gene_end hold gene.end/gene.start. Any other strand
  value is handled like "+" (ascending coordinates).

  A gene without exon children still gets a row, with both counts 0 and empty
  exon/intron fields, instead of raising or re-using the previous gene's values.

  :param db: database object providing features_of_type() and children()
  :param gene_cat: one gene type, or an iterable of them [gene, ncRNA_gene, pseudogene]
  :param filename: desired filename of the CSV file outputted
  :param output: "string" joins the entries with ";"; any other value (e.g. "list")
                 leaves them as Python lists, which the csv writer stores as their text form
  :return: None; the result is the CSV file written to `filename`
  """
  import csv

  fields = ["chromosome", "strand", "gene_name", "gene_id", "gene_start", "gene_end", "consensus_exon_count", "consensus_intron_count", "consensus_exons", "consensus_introns"]

  with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(fields)

    for gene in db.features_of_type(gene_cat):
      # if gene does not have a name, is "None" in CSV
      try:
        gene_name = gene["Name"][0]
      except KeyError:
        gene_name = "None"

      minus = gene.strand == "-"
      if minus:
        gene_start, gene_end = gene.end, gene.start
      else:
        gene_start, gene_end = gene.start, gene.end

      # union of the exons of every transcript of this gene, ascending by coordinate
      blocks = _merge_exon_spans(
        (exon.start, exon.end) for exon in db.children(gene, featuretype="exon")
      )
      if minus:
        # transcription order on "-": highest block first, each span flipped to (high, low)
        blocks = [(high, low) for low, high in reversed(blocks)]
      else:
        blocks = [(low, high) for low, high in blocks]

      consensus_ls = [
        "ConsensusExon_{0}:{1}-{2}:{3}".format(number, first, last, abs(last - first) + 1)
        for number, (first, last) in enumerate(blocks, start=1)
      ]

      # an intron starts one position past the current block's end and stops one
      # position before the next block's start, "past"/"before" following the strand
      step = -1 if minus else 1
      consensus_intron = []
      for number, (current, following) in enumerate(zip(blocks, blocks[1:]), start=1):
        intron_start = current[1] + step
        intron_end = following[0] - step
        intron_len = abs(intron_end - intron_start) + 1  # >= 1 because merged blocks never touch
        consensus_intron.append("ConsensusIntron_{0}:{1}-{2}:{3}".format(number, intron_start, intron_end, intron_len))

      exon_count = len(consensus_ls)
      intron_count = len(consensus_intron)
      if output == "string":
        consensus_ls = ";".join(consensus_ls)
        consensus_intron = ";".join(consensus_intron)

      # add data to CSV
      writer.writerow([gene.seqid, gene.strand, gene_name, gene["gene_id"][0], gene_start, gene_end, exon_count, intron_count, consensus_ls, consensus_intron])

In [None]:
# Write the consensus exon/intron table for features of type "gene" only (default output="string")
consensus_csv(db, "gene", "gff_gene_consensus_gene.csv")

In [None]:
# Write the consensus exon/intron table for all three gene-level feature types into a single file
consensus_csv(db, ("gene", "ncRNA_gene", "pseudogene"), "gff_gene_consensus_all.csv")

In [None]:
# Re-read the "gene"-only table, save a copy to Google Drive, and report its row count.
# NOTE(review): to_csv is called without index=False, so the Drive copy also gets the
# DataFrame row index as an extra unnamed first column -- confirm downstream readers expect that.
df = pd.read_csv("gff_gene_consensus_gene.csv")
df.to_csv("/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/gff_gene_consensus_gene.csv")
print("Number of 'gene' entries:", len(df))

Number of 'gene' entries: 21507


In [None]:
# Re-read the all-gene-types table, save a copy to Google Drive, and report its row count.
# NOTE(review): to_csv is called without index=False, so the Drive copy also gets the
# DataFrame row index as an extra unnamed first column -- confirm downstream readers expect that.
df = pd.read_csv("gff_gene_consensus_all.csv")
df.to_csv("/content/drive/My Drive/Lab_share/Lab_member/SophiaBick/HughesIntern/gff_gene_consensus_all.csv")
print("Number of all gene category entries:", len(df))

Number of all gene category entries: 62656
