In [1]:
import pandas as pd
import numpy as np

# Determine the S288C chromosome location for the midpoint of the URS1 site at sites that are bound by Ume6

In [22]:

# Build one table of URS1 motif sites from the two MEME site listings.
#
# How the input .txt files were produced: the sites table was copied out of
# the meme.txt output into a new file, and the runs of spaces were replaced
# with tabs so it can be parsed as TSV.

# Offset added to (region start + column 2) to reach the motif midpoint.
# NOTE(review): presumably column 2 is the site start offset reported by MEME
# and 4 is half the motif width - confirm against the meme.txt table.
MIDPOINT_OFFSET = 4


def _load_motif_sites(path, origin):
    """Read a headerless, tab-separated site table and tag every row with `origin`."""
    sites = pd.read_csv(path, sep='\t', header=None)
    sites['df_origin'] = origin
    return sites


lutipositions = _load_motif_sites('urs1_motif_analysis/lutipositions.txt', 'luti')
knownpositions = _load_motif_sites('urs1_motif_analysis/knownpositions.txt', 'known')

# Stack both sets so every later step runs once over all sites.
# (The original row labels of each file are kept, so labels repeat here.)
positions = pd.concat([lutipositions, knownpositions])

# Column 0 is parsed as "<chr>:<start>-<stop>"; split it once and reuse the parts.
region_parts = positions[0].str.split(':', expand=True)
span_parts = region_parts[1].str.split('-', expand=True)

positions['chr'] = region_parts[0]
positions['coordinates'] = region_parts[1]
positions['start'] = span_parts[0]   # kept as a string: it is the join key later on
positions['stop'] = span_parts[1]
positions['mid'] = (
    positions['start'].astype(int) + positions[2].astype(int) + MIDPOINT_OFFSET
)
positions['mid_coord'] = positions['chr'] + ':' + positions['mid'].astype(str)

# Only `df_origin`, `start` and `mid_coord` (plus any columns beyond the first
# seven, if the files have them) are carried forward.
positions = positions.drop(
    columns=[0, 1, 2, 3, 4, 5, 6, 'coordinates', 'chr', 'stop', 'mid']
)



In [23]:

# Annotate every motif site with the gene ("parent") of the peak region it came
# from, then express the motif midpoint as a 1-bp BED interval.

def _read_peak_bed(path):
    """Read a headerless, tab-separated peak BED file and name its first seven columns."""
    peaks = pd.read_csv(path, header=None, sep='\t')
    # `start` is cast to str because the `start` column of `positions` is a string.
    peaks[1] = peaks[1].astype(str)
    return peaks.rename(columns={0: 'chr', 1: 'start', 2: 'stop', 3: 'parent',
                                 4: 'score', 5: 'strand', 6: 'peak'})


lutibed = _read_peak_bed('bedfiles/lutiume6enrichedup300down300.bed')
knownbed = _read_peak_bed('bedfiles/knownume6enrichedup300down300.bed')

bed = pd.concat([lutibed, knownbed])

# NOTE(review): the join key is the region start alone (no chromosome, no
# luti/known origin). Two regions sharing a start coordinate would cross-match
# and duplicate rows - confirm this cannot happen in these BED files.
positions = positions.merge(bed, how='left', left_on='start', right_on='start')

# `mid_coord` has the form "<chr>:<mid>"; the part after the colon is the midpoint.
mid_start = positions['mid_coord'].str.split(':', expand=True)[1].astype(int)

# The `score` slot is reused to carry the data-set label ('luti' / 'known').
bedmidpositions = pd.DataFrame({
    'chr': positions['chr'],
    'start': mid_start,
    'stop': mid_start + 1,
    'parent': positions['parent'],
    'score': positions['df_origin'],
    'strand': positions['strand'],
    'peak': positions['peak'],
}).dropna()  # sites that found no BED row have NaN annotation and are discarded


In [24]:

# Measure the offset between each motif midpoint and its parent ORF in the
# SK1 annotation, so the same offset can be replayed on another genome.
allbed = pd.read_csv('bedfiles/SK1_PacBio_with_gene_names_MHspikes.bed',
                     sep='\t', header=None)

# Keep columns 1, 2 and 3 (3 is the name matched against `parent` below;
# 1 and 2 are presumably the BED start/end - confirm).
# The integer casts used to be bare expressions whose result was thrown away;
# store them so the coordinate dtype is actually enforced.
allbed = allbed.drop(columns=[0, 4, 5]).astype({1: int, 2: int})

orftomid = bedmidpositions.merge(allbed, how='left', left_on='parent', right_on=3)

# Strand-specific offsets:
#   '+'       : ORF column 1 minus midpoint start
#   otherwise : midpoint stop minus ORF column 2
# astype(int) raises if a parent had no match in `allbed` (NaN coordinates).
plus = (orftomid[1] - orftomid['start']).astype(int)
minus = (orftomid['stop'] - orftomid[2]).astype(int)

# Vectorized strand selection. This replaces a per-row loop over
# Series.iteritems(), which no longer exists in pandas >= 2.0.
orftomid['distance'] = plus.where(orftomid['strand'] == '+', minus)

# I now need to get the coordinates of the motif midpoint for S288C,
# so drop everything that is specific to the SK1 coordinates.
orftomid = orftomid.drop(columns=['start', 'stop', 'peak', 1, 2, 3])



In [25]:

# Project each motif midpoint onto S288C (sacCer3) by replaying its ORF-relative
# distance on the S288C ORF coordinates, then write one BED file per data set.

# I downloaded the .bed file from the UCSC genome browser.
s288cbed = pd.read_csv('bedfiles/sacCer3.bed', sep='\t', header=None)

# Only columns 0-3 are needed (3 is the name matched against `parent`).
s288cbed = s288cbed.drop(columns=[4, 5, 6, 7, 8, 9, 10, 11])

s288cmidpositions = orftomid.merge(s288cbed, how='left', left_on='parent', right_on=3)

# Candidate midpoint starts for each strand; astype(int) raises if a parent
# had no match in the S288C BED file (NaN coordinates).
# NOTE(review): on non-'+' strands `distance` was measured from the midpoint's
# `stop` (start + 1), so column 2 + distance reproduces that stop coordinate
# rather than the start. Confirm whether this 1-bp strand asymmetry is intended.
plus = (s288cmidpositions[1] - s288cmidpositions['distance']).astype(int)
minus = (s288cmidpositions[2] + s288cmidpositions['distance']).astype(int)

# Vectorized strand selection. This replaces a per-row loop over
# Series.iteritems(), which no longer exists in pandas >= 2.0.
s288cmidpositions['start'] = plus.where(s288cmidpositions['strand'] == '+', minus)
s288cmidpositions['stop'] = s288cmidpositions['start'] + 1

s288cmidpositions = s288cmidpositions.drop(columns=[0, 1, 2, 3, 'distance'])

# For the case of ITR2, the distance is wrong because S288C has a transposable
# element inside of it. I manually found the chromosome location and entered
# it below.
# NOTE(review): the override targets row label 16, so it silently hits a
# different row (or adds one) if the input files or merge results change.
# Verify that row 16 is still ITR2 before trusting the output.
s288cmidpositions.loc[16, 'start':'stop'] = (117541, 117542)

# Final BED column order.
cols = ['chr', 'start', 'stop', 'parent', 'score', 'strand']
s288cmidpositions = s288cmidpositions[cols]

# `score` carries the data-set label; write one headerless BED file per label.
s288cmidpositions[s288cmidpositions['score'] == 'luti'].to_csv('urs1_motif_analysis/sacCer3motifposluti.bed', sep='\t', header=None, index=None)
s288cmidpositions[s288cmidpositions['score'] == 'known'].to_csv('urs1_motif_analysis/sacCer3motifposknown.bed', sep='\t', header=None, index=None)

