## Importing data and pre-processing to remove genes in operons
Our source file comes from EcoGene and contains information on the intergenic sequences of *E. coli*. Intergenic regions are intended from start to stop codon.

In [1]:
import numpy as np
import pandas as pd

#read from file
df = pd.read_table("./source/intergenic_table")
original_length = len(df.index)

def OrientationSplit(df):
    
    # Removing convergent genes. We won't target here and these intergenic regions are not of interest to us.
    df_convergent = df[df.Orientation == "Convergent"]
    df = df[df.Orientation <> "Convergent"]

    # dividing by gene orientation
    df_div = df[df.Orientation == "Divergent"]
    df_co_plus = df[df.Orientation == "Codirectional+"]
    df_co_min = df[df.Orientation == "Codirectional-"]

    # Merging codirectional genes on top and bottom strand
    df_co = pd.concat([df_co_plus, df_co_min])
    
    return df_div, df_co

In [2]:
# Let's add the first and second gene of each intergenic region as a separate field, as well as wheter or not it's part of an operon.

def AnnotateTranscriptionalUnits(df):   
    df_div, df_co = OrientationSplit(df)
    
    ## CODIRECTIONAL GENES
    df_co_ann = df_co
    df_co_ann["firstGene"]=df_co_ann.Name.str.split("_", expand=True)[0]
    df_co_ann["secondGene"]=df_co_ann.Name.str.split("_", expand=True)[1]

    # this is a database containing genes in operons from RegulonDB
    operons = pd.read_table("./source/OperonSet_RegulonDB.txt")
    operons.drop("Evidence", axis = 1, inplace=True)
    operons_filtered1= operons[9:]
    operons_filtered = operons_filtered1[operons_filtered1.NumberOfGenes<>1]

    # however, the df goes in the direction of the top strand when saying gene1_gene2. For genes on the reverse strand, I need to reverse the list
    # extract gene names, split by comma
    GeneNames_forward = operons_filtered[operons_filtered.Strand=="forward"].GeneNames.str.split(",").tolist()
    GeneNames_reverse = operons_filtered[operons_filtered.Strand=="reverse"].GeneNames.str.split(",").tolist()
    for i in GeneNames_reverse:
        i = i.reverse()
    GeneNames = GeneNames_forward + GeneNames_reverse

    # generate list of adjacent genes in operons that can be used to mark intergenic regions for removal.
    geneDuplets = []
    for i in GeneNames:
        for j in range(len(i)-1):
            geneDuplets.append(i[j]+"_"+i[j+1])
    
    GeneInOperon = []
    for i in df_co_ann.Name: 
        if i in geneDuplets:
            GeneInOperon.append("True")
        else:
            GeneInOperon.append("False")
    df_co_ann["GeneInOperon"] = GeneInOperon
    
    ## DIVERGING GENES
    df_div_ann = df_div
    df_div_ann["firstGene"]=df_div_ann.Name.str.split("_", expand=True)[0]
    df_div_ann["secondGene"]=df_div_ann.Name.str.split("_", expand=True)[1]
    GeneInOperon_div = []
    # no operons are diverging
    for i in range(len(df_div_ann.Name.index)):
        GeneInOperon_div.append("False")
    df_div_ann["GeneInOperon"] = GeneInOperon_div
        
    return pd.concat([df_div_ann, df_co_ann])

df_div, df_co = OrientationSplit(AnnotateTranscriptionalUnits(df))
df_export = pd.concat([df_div, df_co])
#df_export.to_excel("./export/All-Annotated.xls")

In [3]:
df_export_TU = df_export[df_export.GeneInOperon == "False"]
df_export_TU.to_excel("./export/All-Annotated_TU.xls")

In [4]:
df_export_TU[:10]

Unnamed: 0,Name,Length,Orientation,L_END,R_END,Cs,Unnamed: 6,firstGene,secondGene,GeneInOperon
6,yaaJ_talB,278,Divergent,7960.0,8237.0,0.17,,yaaJ,talB,False
11,yaaI_dnaK,376,Divergent,11787.0,12162.0,0.25,,yaaI,dnaK,False
16,mokC_sokC,-9,Divergent,16952.0,16960.0,0.37,,mokC,sokC,False
22,rpsT_yaaY,102,Divergent,21079.0,21180.0,0.45,,rpsT,yaaY,False
39,caiT_fixA,471,Divergent,41932.0,42402.0,0.9,,caiT,fixA,False
53,lptD_djlA,254,Divergent,57110.0,57363.0,1.23,,lptD,djlA,False
61,araB_araC,338,Divergent,70049.0,70386.0,1.51,,araB,araC,False
67,sgrR_sgrS,67,Divergent,77300.0,77366.0,1.67,,sgrR,sgrS,False
74,leuL_leuO,659,Divergent,83709.0,84367.0,1.8,,leuL,leuO,False
100,coaE_guaC,224,Divergent,113220.0,113443.0,2.44,,coaE,guaC,False


In [5]:
df_seq = pd.read_table("./source/intergenic_seq")

In [6]:
df_seq[:10]

Unnamed: 0,>thrL_thrA 81 bp Codirectional+ intergenic region
0,GCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCT...
1,CAACCA
2,>thrC_yaaX 213 bp Codirectional+ intergenic re...
3,ATCTATTCATTATCTCAATCAGGCCGGGTTTGCTTTTATGCAGCCC...
4,AATGACAGGGAAAAAGGAGAAATTCTCAATAAATGCGGTAACTTAG...
5,CGTTCTCATCGAGTAATCTCCGGATATCGACCCATAACGGGCAATG...
6,>yaaX_yaaA 152 bp Convergent intergenic region
7,TGACAAATGCCGGGTAACAATCCGGCATTCAGCGCCTGATGCGACG...
8,TCTGCAATATATTGAATCTGCATGCTTTTGTAGGCAGGATAAGGCG...
9,CT


In [7]:
# Extracting sequences, ids and lengths from intergenic_seq fasta file
from Bio import SeqIO
with open('./source/intergenic_seq') as fasta_file:  # Will close handle cleanly
    sequences = []
    identifiers = []
    lengths = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        sequences.append(str(seq_record.seq))
        identifiers.append(seq_record.id)
        lengths.append(len(seq_record.seq))

In [8]:
sequences[1]
# Generating df with this information

df_seq = pd.DataFrame(identifiers)
df_seq["Sequence"] = sequences
df_seq["Length"] = lengths
df_seq[:10]

Unnamed: 0,0,Sequence,Length
0,thrL_thrA,GCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCT...,81
1,thrC_yaaX,ATCTATTCATTATCTCAATCAGGCCGGGTTTGCTTTTATGCAGCCC...,213
2,yaaX_yaaA,TGACAAATGCCGGGTAACAATCCGGCATTCAGCGCCTGATGCGACG...,152
3,yaaA_yaaJ,TCCAGTCCTTGCAGGAAATTTATGCCGACTTTAGCAAAAAATGAGA...,69
4,yaaJ_talB,CATATCCCTCTTATTGCCGGTCGCGATGACTTTCCTGTGTAAACGT...,278
5,talB_mog,CATTCTTAGCGTGACCGGGAAGTCGGTCACGCTACCTCTTCTGAAG...,114
6,satP_yaaW,ATTCCTCAAAAATCATCATCGAATGAATGGTGAAATAATTTCCCTG...,148
7,yaaI_dnaK,AGGATTCTCTTAGTGGGAAGAGGTAGGGGGATGAATACCCACTAGT...,376
8,dnaK_dnaJ,CGCCCTATAAACGGGTAATTATACTGACACGGGCGAAGGGGAATTT...,88
9,dnaJ_insL1,CTCCCCAAAAGCCTGCCCGTGGGCAGGCCTGGGTAAAAATAGGGTG...,146


In [9]:
df_seq["Sequence"][2]

'TGACAAATGCCGGGTAACAATCCGGCATTCAGCGCCTGATGCGACGCTGGCGCGTCTTATCAGGCCTACGTTAATTCTGCAATATATTGAATCTGCATGCTTTTGTAGGCAGGATAAGGCGTTCACGCCGCATCCGGCATTGACTGCAAACT'

In [11]:
df_seq.shape, df_export_TU.shape

((2606, 3), (2167, 10))

In [14]:
df_export_TU_positive = df_export_TU[df_export_TU["Length"] > 0]

In [15]:
notFound = np.array([])
for i in df_export_TU_positive.index:
    currentName = df_export_TU_positive.loc[i, "Name"]
    #print "currentName is " + str(currentName)
    what = df_seq[df_seq[0] == currentName].index.values
    #print "what is " + str(what), len(what)
    if len(what) > 0:
        seq = df_seq.get_value(what[0], "Sequence")
        #print seq
        df_export_TU_positive.loc[i, "Sequence"] = seq
    else: 
        notFound = np.append(notFound, currentName)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
# could not find a gene that has a negative size intergenic sequence. Now down to 177 genes not found
notFound.shape
# all these genes but one have intergenic sequences shorter than 50 bp, which we wouldn't use anyway I guess?

(177,)

In [17]:
df_export_TU_positive.shape

(2037, 11)

In [18]:
df_export_TU_positive.to_excel("./export/All-Annotated_TU-positive.xls")