### Import libraries

pandas - data analysis and manipulation

numpy - support for multidimensional arrays and matrices

genomepy - handling genomes and gene annotations

matplotlib - creating visualisations 

SciPy - scientific and technical computing

In [None]:
pip install genomepy

In [None]:
import pandas as pd
import numpy as np
import genomepy
import matplotlib.pyplot as plt
from scipy import stats

### Download genome assembly with genomepy and read in the gene annotation

In [None]:
# Install the GRCm39 assembly from NCBI without its bundled annotation;
# the gene annotation is read from the GENCODE GFF3 file instead.
genomepy.install_genome("GRCm39", "NCBI", annotation=False)

# Names given to the nine tab-separated columns of the annotation file.
gff3_columns = [
    'seqname', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attribute',
]

# Text from a "#" onwards is treated as a comment, which skips the header lines.
gencode = pd.read_table(
    "gencode.vM32.annotation.gff3.gz",
    sep="\t",
    comment="#",
    names=gff3_columns,
)
gencode.head()

### Select genes only

In [None]:
# Keep only the "gene" records and the columns used downstream
# (score and frame are dropped), then renumber the rows from zero.
gene_columns = ['seqname', 'source', 'feature', 'start', 'end', 'strand', 'attribute']
is_gene_record = gencode['feature'] == "gene"
gencode_genes = gencode.loc[is_gene_record, gene_columns].copy().reset_index(drop=True)
gencode_genes.head()

### Extract gene names and gene types

In [None]:
def gene_info(x):
    """Extract the gene name and gene type from a GFF3 attribute string.

    The attribute string is a ';'-separated list of ``key=value`` fields.
    Keys are matched exactly, so a field whose key or value merely contains
    the text ``gene_name`` / ``gene_type`` is never picked up by mistake.
    If a key occurs more than once, its first occurrence is used.

    Parameters
    ----------
    x : str
        Attribute column of one annotation record.

    Returns
    -------
    tuple
        ``(gene_name, gene_type)`` as strings.

    Raises
    ------
    KeyError
        If the record has no ``gene_name`` or no ``gene_type`` field.
    """
    fields = {}
    for field in x.split(";"):
        key, separator, value = field.partition("=")
        # Fragments without "=" (e.g. after a trailing ';') carry no key/value pair.
        if separator:
            fields.setdefault(key.strip(), value)
    return (fields["gene_name"], fields["gene_type"])
# Parse every attribute string, then unpack the (name, type) pairs into two new columns.
parsed_gene_attributes = gencode_genes['attribute'].apply(gene_info)
gencode_genes["gene_name"], gencode_genes["gene_type"] = zip(*parsed_gene_attributes)

### Select protein-coding genes only

In [None]:
# Restrict the gene table to protein-coding genes and renumber the rows from zero.
is_protein_coding = gencode_genes['gene_type'] == 'protein_coding'
gencode_genes = gencode_genes.loc[is_protein_coding].reset_index(drop=True)
gencode_genes.head()

### Index by gene name

In [None]:
# Use gene_name as the row index so that genes can be looked up by name with .loc.
# gene_name stops being a column afterwards, so re-running this cell on its own
# fails until the cells above are re-run.
gencode_genes = gencode_genes.set_index(keys='gene_name')
gencode_genes.head()

### Create dictionary of genes with their estimated elongation rates

In [None]:
# gene_list.xlsx: genes for which transcription elongation rates have been
# estimated in mESCs by Jonkers et al., in the times spanning 12.5-25 minutes
# following flavopiridol treatment.
df = pd.read_excel('gene_list.xlsx')

# Map each gene name to its elongation rate (bp/min).
gene_names = df['Gene name'].tolist()
elongation_rates = df['Rate (bp/min)'].tolist()
genedata = dict(zip(gene_names, elongation_rates))

print(genedata)

### Determine gene coordinates

In [None]:
# Look up the annotated coordinates of every gene that has an elongation rate.
# Records are collected in a list and turned into a DataFrame once, instead of
# growing the frame one row at a time.
coordinate_records = []
genes_without_coordinates = []
for gene in genedata:
    # Genes absent from the protein-coding annotation cannot be placed on the genome.
    if gene not in gencode_genes.index:
        genes_without_coordinates.append(gene)
        continue
    annotation = gencode_genes.loc[gene]
    # A name shared by several annotated genes yields a DataFrame rather than a
    # single row; such ambiguous genes are skipped instead of guessing a record.
    if isinstance(annotation, pd.DataFrame):
        genes_without_coordinates.append(gene)
        continue
    coordinate_records.append({
        'gene_name': gene,
        'chromosome': annotation['seqname'],
        'start': annotation['start'],
        'end': annotation['end'],
        'strand': annotation['strand'],
    })

gene_coordinates = pd.DataFrame(
    coordinate_records,
    columns=['gene_name', 'chromosome', 'start', 'end', 'strand'],
).set_index('gene_name')

# Report what was left out instead of dropping genes silently.
if genes_without_coordinates:
    print(f"{len(genes_without_coordinates)} gene(s) skipped (missing or ambiguous in the annotation): "
          f"{genes_without_coordinates}")
gene_coordinates.head()

### Select exons

In [None]:
# Keep only the "exon" records and the columns used downstream.
exon_columns = ['seqname', 'source', 'feature', 'start', 'end', 'strand', 'attribute']
is_exon_record = gencode['feature'] == "exon"
gencode_exons = gencode.loc[is_exon_record, exon_columns].copy().reset_index(drop=True)

# Add the gene name and gene type parsed from each exon's attribute string.
parsed_exon_attributes = gencode_exons['attribute'].apply(gene_info)
gencode_exons["gene_name"], gencode_exons["gene_type"] = zip(*parsed_exon_attributes)

# Exons of protein-coding genes only, renumbered from zero.
exon_is_protein_coding = gencode_exons['gene_type'] == 'protein_coding'
gencode_exons = gencode_exons.loc[exon_is_protein_coding].reset_index(drop=True)

# Two views of the same table: one indexed by gene name, one by attribute string.
gencode_exons = gencode_exons.set_index('gene_name')

gencode_exons1 = gencode_exons.set_index('attribute')

print(gencode_exons1)

### Exclude outliers

In [None]:
# Elongation rates of all genes as a float array (a missing rate is NaN).
data = np.array(list(genedata.values()), dtype=float)

# Calculate the interquartile range (IQR). The NaN-aware percentile is used so
# that a single missing rate does not turn both bounds into NaN and thereby
# exclude every gene.
q1 = np.nanpercentile(data, 25)
q3 = np.nanpercentile(data, 75)
iqr = q3 - q1

# Define the lower and upper bounds for outliers (1.5 x IQR beyond the quartiles)
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Exclude the outliers from the data (NaN fails both comparisons and is dropped too)
data_excl = data[(data >= lower_bound) & (data <= upper_bound)]

# Keep the genes whose rate lies within the bounds. Each rate is compared with
# the bounds directly instead of being searched for in the filtered array.
genedata1 = {
    gene: rate
    for gene, rate in genedata.items()
    if lower_bound <= rate <= upper_bound
}

### Count exons

In [None]:
# Exons are counted in a window measured from the gene's 5' end
# (gene start on the '+' strand, gene end on the '-' strand), in bp.
window_near, window_far = 7500, 30000

# Exon records whose attribute string contains "Ensembl", restricted to the
# outlier-filtered genes and grouped by gene name. Start and end are taken from
# the exon rows themselves rather than looked up again by attribute string.
has_ensembl_tag = gencode_exons['attribute'].str.contains("Ensembl", regex=False, na=False).to_numpy()
is_gene_of_interest = gencode_exons.index.isin(list(genedata1))
tagged_exons = gencode_exons.loc[has_ensembl_tag & is_gene_of_interest, ['start', 'end']]
exons_by_gene = {name: group for name, group in tagged_exons.groupby(level=0)}

exon_count_records = []
for gene in genedata1:
    # A gene without coordinates has an unknown exon count; it is left out
    # rather than being recorded with a count of zero.
    if gene not in gene_coordinates.index:
        continue
    coordinates = gene_coordinates.loc[gene]
    gene_start = coordinates['start']
    gene_end = coordinates['end']
    strand = coordinates['strand']
    # Guard against unusable coordinate rows (anything but a plain strand symbol).
    if not isinstance(strand, str):
        continue

    exon_count = 0
    gene_exons = exons_by_gene.get(gene)
    if gene_exons is not None:
        starts = gene_exons['start'].to_numpy()
        ends = gene_exons['end'].to_numpy()
        if strand == '+':
            in_window = (starts > gene_start + window_near) & (ends < gene_start + window_far)
            exon_count = int(in_window.sum())
        elif strand == '-':
            in_window = (starts > gene_end - window_far) & (ends < gene_end - window_near)
            exon_count = int(in_window.sum())

    exon_count_records.append({
        'gene_name': gene,
        'exon_count': exon_count,
        'Rate (bp/min)': genedata1[gene],
    })

exon_counts = pd.DataFrame(exon_count_records, columns=['gene_name', 'exon_count', 'Rate (bp/min)'])

print(exon_counts)

### Plot the data

In [None]:
# Read in data as float arrays (columns of a frame built row by row may be
# object dtype, depending on the pandas version)
x = exon_counts['exon_count'].to_numpy(dtype=float)
y = exon_counts['Rate (bp/min)'].to_numpy(dtype=float)

# Calculate the linear regression line and R-squared value
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Create the figure (6 x 6 inches) with a single set of axes
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the linear regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R-squared = {r_squared_percent:.1f}%", xy=(0.64, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p-value = {:.1e}'.format(p_value), xy=(0.64, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.64, 0.85), xycoords='axes fraction', fontsize=12)

# Set the x- and y-axis labels and the tick label size
ax.set_xlabel('Exon count', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)
ax.tick_params(axis='both', labelsize=11)

# Show the plot
plt.show()

### Determine exon coordinates 

In [None]:
# Exons are collected in a window measured from the gene's 5' end
# (gene start on the '+' strand, gene end on the '-' strand), in bp.
window_near, window_far = 7500, 30000

# Exon records whose attribute string contains "Ensembl", restricted to the
# genes with an elongation rate and grouped by gene name. Start and end are
# taken from the exon rows themselves rather than looked up again by attribute.
has_ensembl_tag = gencode_exons['attribute'].str.contains("Ensembl", regex=False, na=False).to_numpy()
is_gene_of_interest = gencode_exons.index.isin(list(genedata))
tagged_exons = gencode_exons.loc[has_ensembl_tag & is_gene_of_interest, ['start', 'end']]
exons_by_gene = {name: group for name, group in tagged_exons.groupby(level=0)}

exon_records = []
for gene in genedata:
    exons = []             # lengths (bp) of the exons inside the window
    exon_coordinates = []  # (chromosome, start, end) of those exons
    gene_exons = exons_by_gene.get(gene)
    if gene_exons is not None and gene in gene_coordinates.index:
        coordinates = gene_coordinates.loc[gene]
        chromosome = coordinates['chromosome']
        gene_start = coordinates['start']
        gene_end = coordinates['end']
        strand = coordinates['strand']
        # Guard against unusable coordinate rows (anything but a plain strand symbol).
        if isinstance(strand, str):
            for start, end in zip(gene_exons['start'], gene_exons['end']):
                if strand == '+':
                    in_window = start > gene_start + window_near and end < gene_start + window_far
                elif strand == '-':
                    in_window = start > gene_end - window_far and end < gene_end - window_near
                else:
                    in_window = False
                if in_window:
                    # GFF3 coordinates are 1-based and inclusive, hence the +1.
                    exons.append(end - start + 1)
                    exon_coordinates.append((chromosome, start, end))

    # Every gene gets a row (with a zero count when nothing was found) so that
    # later cells can look up any gene by name.
    exon_records.append({
        'gene_name': gene,
        'exon_count': len(exons),
        'exons': exons,
        'exons_coordinates': exon_coordinates,
        'Rate (bp/min)': genedata[gene],
    })

exon_counts = pd.DataFrame(
    exon_records,
    columns=['gene_name', 'exon_count', 'exons', 'exons_coordinates', 'Rate (bp/min)'],
).set_index('gene_name')
exon_counts.head()

### Determine exon density

In [None]:
# Divisor used for the density, in bp (equals 30000 - 7500).
window_length = 22500

# Exon density = summed exon length divided by the window length, for every
# outlier-filtered gene with at least one exon recorded. Genes are handled one
# by one, so a gene that cannot be looked up does not cut the table short.
density_records = []
for gene in genedata1:
    if gene not in exon_counts.index:
        continue
    gene_row = exon_counts.loc[gene]
    if gene_row['exon_count'] >= 1:
        density_records.append({
            'gene_name': gene,
            'exon_density': sum(gene_row['exons']) / window_length,
            'Rate (bp/min)': genedata1[gene],
        })

exon_density = pd.DataFrame(density_records, columns=['gene_name', 'exon_density', 'Rate (bp/min)'])

print(exon_density)

### Plot the data

In [None]:
# Read in data as float arrays (columns of a frame built row by row may be
# object dtype, depending on the pandas version); density is shown in percent
x = exon_density['exon_density'].to_numpy(dtype=float) * 100
y = exon_density['Rate (bp/min)'].to_numpy(dtype=float)

# Calculate the linear regression line and R-squared value
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Create the figure (6 x 6 inches) with a single set of axes
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the linear regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R-squared = {r_squared_percent:.1f}%", xy=(0.64, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p-value = {:.2e}'.format(p_value), xy=(0.64, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.64, 0.85), xycoords='axes fraction', fontsize=12)

# Set the x- and y-axis labels and the tick label size
ax.set_xlabel('Exon density (%)', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)
ax.tick_params(axis='both', labelsize=11)

# Show the plot
plt.show()

### Determine the GC-content

In [None]:
# Open the genome once; it does not change between genes.
genome = genomepy.Genome("GRCm39")

gc_records = []
gc_skipped = {}  # gene -> reason it was left out

for gene in genedata1:
    if gene not in gencode_genes.index:
        gc_skipped[gene] = "not in annotation"
        continue
    annotation = gencode_genes.loc[gene]
    # Several annotated genes sharing this name give a DataFrame, not a single row.
    if isinstance(annotation, pd.DataFrame):
        gc_skipped[gene] = "ambiguous gene name"
        continue

    try:
        # The first three characters of the seqname are dropped
        # (presumably the "chr" prefix -- confirm against the genome's sequence names).
        sequence = str(
            genome.get_seq(annotation['seqname'][3:], annotation['start'], annotation['end'])
        ).upper()
    except Exception as err:
        # Best effort per gene: record the failure and carry on with the next gene.
        gc_skipped[gene] = f"sequence lookup failed: {err!r}"
        continue

    # Same region as used for the exons, taken from the gene's 5' end:
    # the front of the sequence on '+', the back of it on '-'.
    strand = annotation['strand']
    if strand == '+':
        window = sequence[7499:30000]
    elif strand == '-':
        window = sequence[-30000:-7499]
    else:
        window = sequence
    if not window:
        gc_skipped[gene] = "gene too short for the window"
        continue

    gc_count = window.count("G") + window.count("C")
    gc_content = gc_count / len(window) * 100
    gc_records.append({
        'Gene_name': gene,
        'GC_content': gc_content,
        'Rate (bp/min)': genedata1[gene],
    })

GC_content = pd.DataFrame(gc_records, columns=['Gene_name', 'GC_content', 'Rate (bp/min)'])

# Report what was left out instead of dropping genes silently.
if gc_skipped:
    print(f"{len(gc_skipped)} gene(s) skipped: {gc_skipped}")

print(GC_content)

### Plot the data

In [None]:
# Read in data as float arrays (columns of a frame built row by row may be
# object dtype, depending on the pandas version)
x = GC_content['GC_content'].to_numpy(dtype=float)
y = GC_content['Rate (bp/min)'].to_numpy(dtype=float)

# Calculate the linear regression line and R-squared value
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Create the figure (6 x 6 inches) with a single set of axes
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the linear regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R² = {r_squared_percent:.2f}%", xy=(0.75, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p = {:.2e}'.format(p_value), xy=(0.75, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.75, 0.85), xycoords='axes fraction', fontsize=12)

# Set the x- and y-axis labels and the tick label size
ax.set_xlabel('Guanine-cytosine content (%)', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)
ax.tick_params(axis='both', labelsize=11)

# Show the plot
plt.show()

### Count CpG sites

In [None]:
# Open the genome once; it does not change between genes.
genome = genomepy.Genome("GRCm39")

cpg_records = []
cpg_skipped = {}  # gene -> reason it was left out

for gene in genedata1:
    if gene not in gencode_genes.index:
        cpg_skipped[gene] = "not in annotation"
        continue
    annotation = gencode_genes.loc[gene]
    # Several annotated genes sharing this name give a DataFrame, not a single row.
    if isinstance(annotation, pd.DataFrame):
        cpg_skipped[gene] = "ambiguous gene name"
        continue

    try:
        # The first three characters of the seqname are dropped
        # (presumably the "chr" prefix -- confirm against the genome's sequence names).
        sequence = str(
            genome.get_seq(annotation['seqname'][3:], annotation['start'], annotation['end'])
        ).upper()
    except Exception as err:
        # Best effort per gene: record the failure and carry on with the next gene.
        cpg_skipped[gene] = f"sequence lookup failed: {err!r}"
        continue

    # Same region as used for the exons, taken from the gene's 5' end:
    # the front of the sequence on '+', the back of it on '-'.
    strand = annotation['strand']
    if strand == '+':
        window = sequence[7499:30000]
    elif strand == '-':
        window = sequence[-30000:-7499]
    else:
        window = sequence
    # An empty window means the gene does not reach the region at all; it is
    # left out rather than being recorded as having zero CpG sites.
    if not window:
        cpg_skipped[gene] = "gene too short for the window"
        continue

    CpG_count = window.count("CG")
    cpg_records.append({
        'Gene_name': gene,
        'CpG_count': CpG_count,
        'Rate (bp/min)': genedata1[gene],
    })

CpG_sites = pd.DataFrame(cpg_records, columns=['Gene_name', 'CpG_count', 'Rate (bp/min)'])

# Report what was left out instead of dropping genes silently.
if cpg_skipped:
    print(f"{len(cpg_skipped)} gene(s) skipped: {cpg_skipped}")

print(CpG_sites)

### Plot the data

In [None]:
# Read in data as float arrays (columns of a frame built row by row may be
# object dtype, depending on the pandas version)
x = CpG_sites['CpG_count'].to_numpy(dtype=float)
y = CpG_sites['Rate (bp/min)'].to_numpy(dtype=float)

# Calculate the linear regression line and R-squared value
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lin_reg = slope * x + intercept
r_squared = r_value**2
r_squared_percent = r_squared * 100

# Create the figure (6 x 6 inches) with a single set of axes
fig, ax = plt.subplots(figsize=(6, 6))

# Scatter plot of the data with the linear regression line on top
ax.scatter(x, y)
ax.plot(x, lin_reg, color='r')

# Add the R-squared value, p-value, and the number of datapoints (n) to the plot
ax.annotate(f"R² = {r_squared_percent:.2f}%", xy=(0.75, 0.95), xycoords='axes fraction', fontsize=12)
ax.annotate('p = {:.2e}'.format(p_value), xy=(0.75, 0.90), xycoords='axes fraction', fontsize=12)
ax.annotate('n = {}'.format(len(x)), xy=(0.75, 0.85), xycoords='axes fraction', fontsize=12)

# Set the tick label size and the x- and y-axis labels
ax.tick_params(axis='both', labelsize=11)
ax.set_xlabel('CpG sites count', fontsize=15)
ax.set_ylabel('Elongation rate (bp/min)', fontsize=15)

# Show the plot
plt.show()