### Import libraries

pandas - data analysis and manipulation

numpy - support for multidimensional arrays and matrices

genomepy - downloading and handling genomes and gene annotations

matplotlib - creating visualisations 

SciPy - scientific and technical computing

In [1]:
pip install genomepy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import genomepy
import matplotlib.pyplot as plt
from scipy import stats

### Download genome assembly with genomepy and read in the gene annotation

In [3]:
# Download the GRCm39 assembly from NCBI without the provider's annotation;
# the annotation table is read from the local GENCODE file below instead.
genomepy.install_genome("GRCm39", "NCBI", annotation=False)

# The GFF3 file is tab-separated with no header row; lines starting with '#'
# are skipped and the nine columns are named explicitly.
gencode = pd.read_csv(
    "gencode.vM32.annotation.gff3.gz",
    sep="\t",
    comment="#",
    header=None,
    names=[
        'seqname', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ],
)
gencode.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,chr1,HAVANA,gene,3143476,3144545,.,+,.,ID=ENSMUSG00000102693.2;gene_id=ENSMUSG0000010...
1,chr1,HAVANA,transcript,3143476,3144545,.,+,.,ID=ENSMUST00000193812.2;Parent=ENSMUSG00000102...
2,chr1,HAVANA,exon,3143476,3144545,.,+,.,ID=exon:ENSMUST00000193812.2:1;Parent=ENSMUST0...
3,chr1,ENSEMBL,gene,3172239,3172348,.,+,.,ID=ENSMUSG00000064842.3;gene_id=ENSMUSG0000006...
4,chr1,ENSEMBL,transcript,3172239,3172348,.,+,.,ID=ENSMUST00000082908.3;Parent=ENSMUSG00000064...


### Select genes only

In [4]:
# Keep only the rows whose feature is "gene", drop the score/frame columns,
# and renumber the rows from 0.
gencode_genes = (
    gencode.loc[
        gencode["feature"] == "gene",
        ['seqname', 'source', 'feature', 'start', 'end', 'strand', 'attribute'],
    ]
    .copy()
    .reset_index(drop=True)
)
gencode_genes.head()

Unnamed: 0,seqname,source,feature,start,end,strand,attribute
0,chr1,HAVANA,gene,3143476,3144545,+,ID=ENSMUSG00000102693.2;gene_id=ENSMUSG0000010...
1,chr1,ENSEMBL,gene,3172239,3172348,+,ID=ENSMUSG00000064842.3;gene_id=ENSMUSG0000006...
2,chr1,HAVANA,gene,3276124,3741721,-,ID=ENSMUSG00000051951.6;gene_id=ENSMUSG0000005...
3,chr1,HAVANA,gene,3322980,3323459,+,ID=ENSMUSG00000102851.2;gene_id=ENSMUSG0000010...
4,chr1,HAVANA,gene,3435954,3438772,-,ID=ENSMUSG00000103377.2;gene_id=ENSMUSG0000010...


### Extract gene names and gene types

In [5]:
def gene_info(x):
    """Extract the gene name and gene type from a GFF3 attribute string.

    The attribute string is a ';'-separated list of ``key=value`` pairs.
    Keys are matched exactly (not as substrings of the whole field), and a
    value is everything after the first '=' of its field, so values that
    themselves contain "gene_name", "gene_type" or '=' are handled correctly.
    When a key occurs more than once, the first occurrence wins.

    Parameters
    ----------
    x : str
        Attribute column of one annotation row.

    Returns
    -------
    tuple
        ``(gene_name, gene_type)``.

    Raises
    ------
    KeyError
        If the string has no ``gene_name`` or no ``gene_type`` key.
    """
    fields = {}
    for field in x.split(";"):
        key, separator, value = field.strip().partition("=")
        # Ignore fragments without '=' (e.g. a trailing ';') and keep the
        # first occurrence of every key.
        if separator and key not in fields:
            fields[key] = value
    return (fields["gene_name"], fields["gene_type"])
# Parse every attribute string and unpack the resulting (name, type) pairs
# into two new columns.
gencode_genes["gene_name"], gencode_genes["gene_type"] = zip(
    *gencode_genes["attribute"].map(gene_info)
)

### Select protein-coding genes only

In [6]:
# Restrict the gene table to protein-coding entries and renumber the rows from 0.
gencode_genes = (
    gencode_genes.loc[gencode_genes['gene_type'] == 'protein_coding']
    .reset_index(drop=True)
)
gencode_genes.head()

Unnamed: 0,seqname,source,feature,start,end,strand,attribute,gene_name,gene_type
0,chr1,HAVANA,gene,3276124,3741721,-,ID=ENSMUSG00000051951.6;gene_id=ENSMUSG0000005...,Xkr4,protein_coding
1,chr1,HAVANA,gene,4069780,4479464,-,ID=ENSMUSG00000025900.14;gene_id=ENSMUSG000000...,Rp1,protein_coding
2,chr1,HAVANA,gene,4561154,4567577,-,ID=ENSMUSG00000025902.14;gene_id=ENSMUSG000000...,Sox17,protein_coding
3,chr1,HAVANA,gene,4843429,4855962,-,ID=ENSMUSG00000033845.14;gene_id=ENSMUSG000000...,Mrpl15,protein_coding
4,chr1,HAVANA,gene,4878011,4918633,+,ID=ENSMUSG00000025903.15;gene_id=ENSMUSG000000...,Lypla1,protein_coding


### Index by gene name

In [7]:
# Use the gene name as the row label so later cells can look genes up with .loc[gene].
# Note: this cell consumes the 'gene_name' column, so it cannot be run twice in a row.
gencode_genes = gencode_genes.set_index(keys='gene_name', drop=True)
gencode_genes.head(5)

Unnamed: 0_level_0,seqname,source,feature,start,end,strand,attribute,gene_type
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,chr1,HAVANA,gene,3276124,3741721,-,ID=ENSMUSG00000051951.6;gene_id=ENSMUSG0000005...,protein_coding
Rp1,chr1,HAVANA,gene,4069780,4479464,-,ID=ENSMUSG00000025900.14;gene_id=ENSMUSG000000...,protein_coding
Sox17,chr1,HAVANA,gene,4561154,4567577,-,ID=ENSMUSG00000025902.14;gene_id=ENSMUSG000000...,protein_coding
Mrpl15,chr1,HAVANA,gene,4843429,4855962,-,ID=ENSMUSG00000033845.14;gene_id=ENSMUSG000000...,protein_coding
Lypla1,chr1,HAVANA,gene,4878011,4918633,+,ID=ENSMUSG00000025903.15;gene_id=ENSMUSG000000...,protein_coding


### Create a dictionary of genes with their estimated elongation rates

In [8]:
# Gene list: genes for which transcription elongation rates have been estimated
# in mESCs by Jonkers et al. in the times spanning 12.5-25 minutes following
# flavopiridol treatment.
df = pd.read_excel('gene_list.xlsx')

# Map each gene name to its rate; if a name is listed more than once the last row wins.
genedata = {
    name: rate
    for name, rate in zip(df['Gene name'].tolist(), df['Rate (bp/min)'].tolist())
}

print(genedata)

{'Kdm2a': 1482.0664, 'Ankib1': 1992.554, 'Pid1': 1612.144, 'Wapl': 896.2004, 'Kdm2b': 1526.1148, 'Nol10': 1347.6652, 'Zfyve26': 595.1316, 'Septin11': 2407.198, 'Usp13': 1554.2792, 'Ercc6l2': 1017.0216, 'Snx13': 2688.8208, 'C2cd3': 1330.8956, 'N4bp2': 1256.5544, 'Crebbp': 2558.938, 'Otud7b': 1150.5932, 'Usp32': 1978.4756, 'Smg1': 2356.2876, 'Usp31': 2220.418, 'Cblb': 2476.694, 'Fancd2': 866.8708, 'Fam120a': 2233.7096, 'Spata13': 1694.3776, 'Ddx31': 1470.9436, 'Gucy1a2': 2400.0, 'Ralgapa2': 2535.3224, 'Gfod1': 1088.1276, 'Dock1': 2088.1196, 'Thoc2': 1880.0, 'Garem1': 2167.7232, 'Bclaf3': 1256.1996, 'Rfx7': 2487.55, 'Zyg11b': 884.404, 'Mef2a': 2258.272, 'Ak8': -440.0, 'Rabgap1': 1294.5556, 'Erich1': 1500.418, 'Slc38a6': 1457.3628, 'Fam117b': 1868.22, 'Cep350': 2055.0, 'Tcerg1': 1439.7512, 'Plekhm3': 730.2772, 'Pdxdc1': 1525.1096, 'Myo6': 2452.754, 'Edem3': 1871.2152, 'Mtmr7': 1052.718, 'Kntc1': 848.076, 'Apaf1': 674.9648, 'Raph1': 2149.4576, 'Fkbp15': 1331.1188, 'Med14': 1716.472, 'Pik3r1

### Determine gene coordinates

In [9]:
# Look up chromosome, start, end and strand for every gene in the rate table.
# Genes that are absent from the protein-coding annotation, or whose name maps
# to more than one annotation row, are left out and reported instead of being
# silently swallowed by a blanket `except`.
coordinate_records = []
genes_not_resolved = []
for gene in genedata:
    if gene not in gencode_genes.index:
        genes_not_resolved.append(gene)
        continue
    # Selecting with a list always returns a DataFrame, so a duplicated gene
    # name shows up as more than one row instead of changing the result type.
    matches = gencode_genes.loc[[gene]]
    if len(matches) != 1:
        genes_not_resolved.append(gene)
        continue
    annotation_row = matches.iloc[0]
    coordinate_records.append({
        'gene_name': gene,
        'chromosome': annotation_row['seqname'],
        'start': annotation_row['start'],
        'end': annotation_row['end'],
        'strand': annotation_row['strand'],
    })

# Build the frame once from the collected records (no row-by-row growth).
gene_coordinates = pd.DataFrame(
    coordinate_records,
    columns=['gene_name', 'chromosome', 'start', 'end', 'strand'],
).set_index('gene_name')

print(f"{len(genes_not_resolved)} of {len(genedata)} genes had no unique protein-coding annotation entry: {genes_not_resolved}")
gene_coordinates.head()

Unnamed: 0_level_0,chromosome,start,end,strand
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kdm2a,chr19,4364447,4448313,-
Ankib1,chr5,3740000,3852925,-
Pid1,chr1,84014017,84341901,-
Wapl,chr14,34395885,34469940,+
Kdm2b,chr5,123008728,123127886,-


### Select exons

In [11]:
# Keep only exon rows, without the score/frame columns, renumbered from 0.
gencode_exons = (
    gencode.loc[
        gencode["feature"] == "exon",
        ['seqname', 'source', 'feature', 'start', 'end', 'strand', 'attribute'],
    ]
    .copy()
    .reset_index(drop=True)
)

# Annotate every exon with the name and type of its gene.
gencode_exons["gene_name"], gencode_exons["gene_type"] = zip(
    *gencode_exons["attribute"].map(gene_info)
)

# Exons of protein-coding genes only, labelled by gene name.
gencode_exons = (
    gencode_exons.loc[gencode_exons['gene_type'] == 'protein_coding']
    .reset_index(drop=True)
    .set_index('gene_name')
)

# Second view of the same rows, labelled by the full attribute string.
gencode_exons1 = gencode_exons.set_index('attribute')

print(gencode_exons1)

                                                   seqname   source feature  \
attribute                                                                     
ID=exon:ENSMUST00000162897.2:1;Parent=ENSMUST00...    chr1   HAVANA    exon   
ID=exon:ENSMUST00000162897.2:2;Parent=ENSMUST00...    chr1   HAVANA    exon   
ID=exon:ENSMUST00000159265.2:1;Parent=ENSMUST00...    chr1   HAVANA    exon   
ID=exon:ENSMUST00000159265.2:2;Parent=ENSMUST00...    chr1   HAVANA    exon   
ID=exon:ENSMUST00000070533.5:1;Parent=ENSMUST00...    chr1   HAVANA    exon   
...                                                    ...      ...     ...   
ID=exon:ENSMUST00000084013.1:1;Parent=ENSMUST00...    chrM  ENSEMBL    exon   
ID=exon:ENSMUST00000082414.1:1;Parent=ENSMUST00...    chrM  ENSEMBL    exon   
ID=exon:ENSMUST00000082418.1:1;Parent=ENSMUST00...    chrM  ENSEMBL    exon   
ID=exon:ENSMUST00000082419.1:1;Parent=ENSMUST00...    chrM  ENSEMBL    exon   
ID=exon:ENSMUST00000082421.1:1;Parent=ENSMUST00...  

### Exclude outliers

In [None]:
# All elongation rates as one array
data = np.array(list(genedata.values()))

# Calculate the interquartile range (IQR)
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1

# Define the lower and upper bounds for outliers (Tukey's 1.5 * IQR fences)
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Exclude the outliers from the data
data_excl = data[(data >= lower_bound) & (data <= upper_bound)]

# Keep the genes whose rate lies within the fences. Comparing each rate with
# the bounds directly selects the same genes as testing membership in
# `data_excl`, without scanning the whole array once per gene.
genedata1 = {
    gene: rate
    for gene, rate in genedata.items()
    if lower_bound <= rate <= upper_bound
}

### Count exons

In [None]:
# Count, for every non-outlier gene, the exons that lie completely inside the
# window from 7.5 kb to 30 kb downstream of the gene's 5' end (measured from
# `start` on the + strand and from `end` on the - strand).
# Only exon rows whose attribute string contains "Ensembl" are counted
# (presumably the Ensembl_canonical transcript tag -- TODO confirm).
# Genes that cannot be resolved in the annotation are skipped and reported;
# previously a blanket `except` recorded them with an exon count of 0.
exon_count_records = []
genes_without_annotation = []
for gene in genedata1:
    if gene not in gene_coordinates.index or gene not in gencode_exons.index:
        genes_without_annotation.append(gene)
        continue

    strand = gene_coordinates.loc[gene, 'strand']
    if not isinstance(strand, str):
        # Ambiguous coordinates (e.g. a duplicated gene name) cannot be used.
        genes_without_annotation.append(gene)
        continue
    gene_start = gene_coordinates.loc[gene, 'start']
    gene_end = gene_coordinates.loc[gene, 'end']

    if strand == '+':
        window_low, window_high = gene_start + 7500, gene_start + 30000
    elif strand == '-':
        window_low, window_high = gene_end - 30000, gene_end - 7500
    else:
        window_low = window_high = None

    exon_count = 0
    if window_low is not None:
        # Selecting with a list always yields a DataFrame, even when the gene
        # has a single exon row.
        gene_exons = gencode_exons.loc[[gene]]
        has_ensembl_tag = gene_exons['attribute'].str.contains("Ensembl", regex=False, na=False)
        inside_window = (gene_exons['start'] > window_low) & (gene_exons['end'] < window_high)
        exon_count = int((has_ensembl_tag & inside_window).sum())

    exon_count_records.append({
        'gene_name': gene,
        'exon_count': exon_count,
        'Rate (bp/min)': genedata[gene],
    })

exon_counts = pd.DataFrame(
    exon_count_records, columns=['gene_name', 'exon_count', 'Rate (bp/min)']
)

print(f"{len(genes_without_annotation)} genes skipped (no usable annotation): {genes_without_annotation}")
print(exon_counts)

### Plot the data

In [None]:
# Read in data; cast explicitly to float so the regression always receives
# numeric arrays, whatever dtype the DataFrame columns ended up with.
x = np.asarray(exon_counts['exon_count'], dtype=float)
y = np.asarray(exon_counts['Rate (bp/min)'], dtype=float)

# Calculate the linear regression line and the R-squared value (once)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Set the figure size: width = 6 inches, height = 6 inches
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the fitted regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R-squared = {r_squared_percent:.1f}%", xy=(0.64, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p-value = {:.1e}'.format(p_value), xy=(0.64, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.64, 0.85), xycoords='axes fraction', fontsize=12)

# Set the x- and y-axis labels and the tick label size
ax.set_xlabel('Exon count', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)
ax.tick_params(axis='both', labelsize=11)

# Show the plot
plt.show()

### Determine exon coordinates 

In [None]:
# For every gene in the rate table, collect the exons that lie completely
# inside the window from 7.5 kb to 30 kb downstream of the gene's 5' end
# (measured from `start` on the + strand and from `end` on the - strand).
# Only exon rows whose attribute string contains "Ensembl" are used
# (presumably the Ensembl_canonical transcript tag -- TODO confirm).
#
# Every gene gets a row, including genes that cannot be resolved in the
# annotation (exon_count 0, empty lists); the exon-density cell looks genes up
# by name and only uses rows with at least one exon.
exon_records = []
for gene in genedata:
    exons = []              # exon lengths in bp
    exon_coordinates = []   # (chromosome, start, end) per selected exon

    window = None
    if gene in gene_coordinates.index and gene in gencode_exons.index:
        strand = gene_coordinates.loc[gene, 'strand']
        # A non-string strand signals ambiguous coordinates; leave the gene empty.
        if isinstance(strand, str):
            chromosome = gene_coordinates.loc[gene, 'chromosome']
            gene_start = gene_coordinates.loc[gene, 'start']
            gene_end = gene_coordinates.loc[gene, 'end']
            if strand == '+':
                window = (gene_start + 7500, gene_start + 30000)
            elif strand == '-':
                window = (gene_end - 30000, gene_end - 7500)

    if window is not None:
        window_low, window_high = window
        # Selecting with a list always yields a DataFrame, even when the gene
        # has a single exon row.
        gene_exons = gencode_exons.loc[[gene]]
        has_ensembl_tag = gene_exons['attribute'].str.contains("Ensembl", regex=False, na=False)
        inside_window = (gene_exons['start'] > window_low) & (gene_exons['end'] < window_high)
        selected = gene_exons[has_ensembl_tag & inside_window]
        for start, end in zip(selected['start'], selected['end']):
            # GFF3 coordinates are 1-based and inclusive, so the length is end - start + 1.
            exons.append(end - start + 1)
            exon_coordinates.append((chromosome, start, end))

    exon_records.append({
        'gene_name': gene,
        'exon_count': len(exons),
        'exons': exons,
        'exons_coordinates': exon_coordinates,
        'Rate (bp/min)': genedata[gene],
    })

exon_counts = pd.DataFrame(
    exon_records,
    columns=['gene_name', 'exon_count', 'exons', 'exons_coordinates', 'Rate (bp/min)'],
).set_index('gene_name')
exon_counts.head()

### Determine exon density

In [None]:
# Exon density = summed length of the selected exons divided by the length of
# the analysed window (30000 - 7500 = 22500 bp), for non-outlier genes that
# have at least one exon in the window.
# Genes are checked one at a time, so a gene missing from `exon_counts` is
# skipped on its own instead of silently ending the whole loop.
density_records = []
for gene in genedata1:
    if gene not in exon_counts.index:
        continue
    gene_row = exon_counts.loc[gene]
    if gene_row['exon_count'] >= 1:
        total_exon = sum(gene_row['exons'])
        exon_den = total_exon / 22500
        density_records.append({
            'gene_name': gene,
            'exon_density': exon_den,
            'Rate (bp/min)': genedata[gene],
        })

exon_density = pd.DataFrame(
    density_records, columns=['gene_name', 'exon_density', 'Rate (bp/min)']
)

print(exon_density)

### Plot the data

In [None]:
# Read in data; exon density is converted from a fraction to a percentage.
# The explicit float cast guarantees numeric arrays for the regression.
x = np.asarray(exon_density['exon_density'], dtype=float) * 100
y = np.asarray(exon_density['Rate (bp/min)'], dtype=float)

# Calculate the linear regression line and the R-squared value (once)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Set the figure size: width = 6 inches, height = 6 inches
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the fitted regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R-squared = {r_squared_percent:.1f}%", xy=(0.64, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p-value = {:.2e}'.format(p_value), xy=(0.64, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.64, 0.85), xycoords='axes fraction', fontsize=12)

# Set the x- and y-axis labels and the tick label size
ax.set_xlabel('Exon density (%)', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)
ax.tick_params(axis='both', labelsize=11)

# Show the plot
plt.show()

### Determine the GC-content

In [None]:
# GC content (%) of the window between roughly 7.5 kb and 30 kb downstream of
# each non-outlier gene's 5' end.
# The whole gene is fetched on the forward strand and then sliced: from the
# left for '+' genes and from the right for '-' genes. Counting G and C on the
# forward strand gives the same total as on the reverse complement.

# Open the genome once; it does not change between genes.
genome = genomepy.Genome("GRCm39")

gc_records = []
gc_skipped = []   # (gene, reason) for every gene that could not be processed
for gene in genedata1:
    try:
        chromosome = gencode_genes.loc[gene]['seqname']
        gene_start = gencode_genes.loc[gene]['start']
        gene_end = gencode_genes.loc[gene]['end']
        strand = gencode_genes.loc[gene]['strand']
        # The annotation uses "chrN" names; the first three characters are
        # dropped before querying the genome.
        sequence = str(genome.get_seq(chromosome[3:], gene_start, gene_end)).upper()
        if strand == '+':
            sequence = sequence[7499:30000]
        if strand == '-':
            sequence = sequence[-30000:-7499]
        gc_count = sequence.count("G") + sequence.count("C")
        # Raises ZeroDivisionError for genes too short to reach the window.
        gc_content = gc_count / len(sequence) * 100
    except Exception as e:
        # Best effort per gene: remember why it was skipped instead of hiding it.
        gc_skipped.append((gene, repr(e)))
        continue
    gc_records.append({
        'Gene_name': gene,
        'GC_content': gc_content,
        'Rate (bp/min)': genedata[gene],
    })

GC_content = pd.DataFrame(gc_records, columns=['Gene_name', 'GC_content', 'Rate (bp/min)'])

print(f"{len(gc_skipped)} genes skipped: {gc_skipped}")
print(GC_content)

### Plot the data

In [None]:
# Read in data; the explicit float cast guarantees numeric arrays for the regression.
x = np.asarray(GC_content['GC_content'], dtype=float)
y = np.asarray(GC_content['Rate (bp/min)'], dtype=float)

# Calculate the linear regression line and the R-squared value (once)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Set the figure size: width = 6 inches, height = 6 inches
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the fitted regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R² = {r_squared_percent:.2f}%", xy=(0.75, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p = {:.2e}'.format(p_value), xy=(0.75, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.75, 0.85), xycoords='axes fraction', fontsize=12)

# Set the x- and y-axis labels and the tick label size
ax.set_xlabel('Guanine-cytosine content (%)', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)
ax.tick_params(axis='both', labelsize=11)

# Show the plot
plt.show()

### Count CpG sites

In [None]:
# Number of CpG dinucleotides in the window between roughly 7.5 kb and 30 kb
# downstream of each non-outlier gene's 5' end.
# The whole gene is fetched on the forward strand and then sliced: from the
# left for '+' genes and from the right for '-' genes. "CG" is its own reverse
# complement, so the forward-strand count is valid for both strands.
# NOTE(review): genes shorter than 30 kb contribute a truncated window, which
# lowers their raw count -- confirm that all genes in the list are long enough.

# Open the genome once; it does not change between genes.
genome = genomepy.Genome("GRCm39")

cpg_records = []
cpg_skipped = []   # (gene, reason) for every gene that could not be processed
for gene in genedata1:
    try:
        chromosome = gencode_genes.loc[gene]['seqname']
        gene_start = gencode_genes.loc[gene]['start']
        gene_end = gencode_genes.loc[gene]['end']
        strand = gencode_genes.loc[gene]['strand']
        # The annotation uses "chrN" names; the first three characters are
        # dropped before querying the genome.
        sequence = str(genome.get_seq(chromosome[3:], gene_start, gene_end)).upper()
        if strand == '+':
            sequence = sequence[7499:30000]
        if strand == '-':
            sequence = sequence[-30000:-7499]
    except Exception as e:
        # Best effort per gene: remember why it was skipped instead of hiding it.
        cpg_skipped.append((gene, repr(e)))
        continue
    if not sequence:
        # The gene does not reach the window; a count of 0 would be a
        # fabricated data point (the GC-content cell drops such genes too).
        cpg_skipped.append((gene, 'empty window'))
        continue
    CpG_count = sequence.count("CG")
    cpg_records.append({
        'Gene_name': gene,
        'CpG_count': CpG_count,
        'Rate (bp/min)': genedata[gene],
    })

CpG_sites = pd.DataFrame(cpg_records, columns=['Gene_name', 'CpG_count', 'Rate (bp/min)'])

print(f"{len(cpg_skipped)} genes skipped: {cpg_skipped}")
print(CpG_sites)

### Plot the data

In [None]:
# Read in data; the explicit float cast guarantees numeric arrays for the regression.
x = np.asarray(CpG_sites['CpG_count'], dtype=float)
y = np.asarray(CpG_sites['Rate (bp/min)'], dtype=float)

# Calculate the linear regression line and the R-squared value (once)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Set the figure size: width = 6 inches, height = 6 inches
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the fitted regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R² = {r_squared_percent:.2f}%", xy=(0.75, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p = {:.2e}'.format(p_value), xy=(0.75, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.75, 0.85), xycoords='axes fraction', fontsize=12)

# Set the tick label size and the x- and y-axis labels
ax.tick_params(axis='both', labelsize=11)
ax.set_xlabel('CpG sites count', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)

# Show the plot
plt.show()