In [7]:
import os
import numpy as np
import scipy as sp
import sklearn as sk
import pandas as pd
import operator
from operator import sub
from decimal import Decimal
from sklearn import preprocessing
import statistics
import seaborn as sb
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pybedtools
from pybedtools import BedTool

# For each gene, pick the single TL-seq peak that best matches its promoter
# (the closest peak lying upstream of the gene's ORF)

In [8]:
# Read in the file produced by CAGEr with gene names associated.
# This is based on the consensus clusters from all samples.

# Minimum expression (value of the 'tpm' column) a cluster needs to be kept.
MIN_TPM = 4

df = pd.read_csv('181130clusters_diff_exp_loose_with_genes.csv', sep='\t')
# `axis` must be passed by keyword: pandas >= 2.0 rejects positional arguments
# to dropna().
df = df.dropna(axis='index').reset_index(drop=True)
df = df[df['tpm'] > MIN_TPM]


# to return only a single TSS per gene
# find instances in which at least one end of the peak is upstream of the gene's ORF
# and pick the closest peak to the gene's start

# start by converting to a bed file in order to use bedtools

# 'coordinates' is parsed as '<chr>:<start>-<end>:<strand>'; split it once
# instead of once per derived column.
coord_parts = df['coordinates'].str.split(':', expand=True)

# Single bp peaks carry no '-<end>' part.  reindex() guarantees that column 1
# exists (all missing) even when *every* peak is a single bp, where a plain
# `[1]` lookup would raise KeyError.
span_parts = coord_parts[1].str.split('-', expand=True).reindex(columns=[0, 1])

peak_start = span_parts[0].astype(int)
# fill in the end value for single bp peaks and then add one to all
# (assignment instead of a chained `fillna(..., inplace=True)`, which is not
# guaranteed to write back into the frame)
peak_end = span_parts[1].fillna(span_parts[0]).astype(int) + 1

# Every column is built from a Series derived from `df`, so `tlbed` keeps
# df's row labels as its index.
tlbed = pd.DataFrame({
    'chr': coord_parts[0],
    'start': peak_start,
    'end': peak_end,
    'name': df['genes'],
    'score': 1,
    'strand': coord_parts[2],
})



In [9]:

# Read the bed file describing the genome's ORFs.  The file has no header
# row, so the columns are labelled with integers; columns 1 and 2 (later used
# as the ORF start and end) are cast to int in a single step.

bed = pd.read_csv('SK1.genes_w_paralogs_no_dups.bed', sep='\t', header=None)
bed = bed.astype({1: int, 2: int})



In [12]:

# merge the two dataframes on gene name
#
# merge() discards tlbed's index, so the label of the row each peak came from
# is carried along in a 'source_row' column.  This relies on tlbed sharing its
# row labels with df, which holds because tlbed's columns were built from df.

tlmerge = (tlbed
           .rename_axis('source_row')
           .reset_index()
           .merge(bed, how='left', left_on='name', right_on=3)
           .dropna())


# determine how far upstream of the ORF the peak reaches:
#   '+' strand: ORF start (bed column 1) minus the peak start
#   otherwise:  peak end minus the ORF end (bed column 2)
# Vectorised with np.where instead of a per-row .loc lookup driven by
# Series.iteritems(), which was removed in pandas 2.0.

on_plus_strand = tlmerge['strand'] == '+'
tlmerge['dist'] = np.where(on_plus_strand,
                           tlmerge[1] - tlmerge['start'],
                           tlmerge['end'] - tlmerge[2])


# remove any TSSs that are entirely in the ORF

tlmerge = tlmerge[tlmerge['dist'] > 0]


# select the peak that is closest to the promoter: one row per gene, the one
# with the smallest positive distance.  A single groupby/idxmin replaces
# growing a frame row by row with DataFrame.append (removed in pandas 2.0).

closest_rows = tlmerge.groupby('name')['dist'].idxmin()
df2 = tlmerge.loc[closest_rows]


# only keep the best identified peak in the final dataframe
#
# Rows are selected by their original row label.  Matching the 'coordinates'
# text against a '|'-joined regex of 'chr:start' strings is a substring match,
# so e.g. 'chrI:100' would also keep 'chrI:1000-1010:+', peaks on the other
# strand, and the same peak listed for a different gene.

df = df[df.index.isin(df2['source_row'])]

df = df.drop(columns=["score", "consensus.cluster", "tpm", "annotation"])

df.to_csv('181130single_tss_per_gene.csv', index=False)

