# Optimizing Part 3 : Finding a Sequence Number's Gene Name

## Prepping before we start optimizing

In [1]:
# Declaring values used for all tests
import time
import pandas as pd

from archerDX.core.coordinateAnnotationPairer import filterDataFrame, pairGTFRowWithCoordinateFilter, pairGTFRowWithCoordinateLoop, pairGTFRowWithCoordinateFilterIterrows, filterDataFrameOutOfRange
from archerDX.utility.dataImport import readInGTF, readInCoordinateData, readInGTFReducedInput
from archerDX.utility.paths import providedDataPaths

# Look up the two provided input files once; every benchmark cell below reads
# from these same module-level paths.
coordinatePath = providedDataPaths["coordinateAnnotateProvidedPath"]
gtfPath = providedDataPaths["gtfProvidedPath"]

In [2]:
# Declaring a decorator we will use to keep track of time
def timeit(method):
    """Decorate ``method`` so that every call reports its elapsed time.

    The elapsed time is measured in milliseconds around a single call.

    * If the caller passes a ``log_time`` keyword (a mutable mapping), the
      truncated integer millisecond count is stored in it under ``log_name``
      (default: the upper-cased function name) and nothing is printed.
    * Otherwise a line of the form ``'name'  12.34 ms`` is printed.

    Note that ``log_time`` / ``log_name`` are forwarded to ``method`` together
    with the other keyword arguments, so a function called that way must
    accept them (for example through ``**kw``).

    Returns the wrapping function, which returns whatever ``method`` returns.
    """
    # Imported locally so this cell does not depend on an import made elsewhere.
    import functools

    # wraps() keeps the decorated function's __name__ and __doc__ intact
    # instead of exposing the inner wrapper's.
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter() is monotonic and high resolution, so the measurement
        # cannot be skewed by system clock adjustments.
        ts = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

## Baseline Time

In [3]:
@timeit
def baseline():
    """Reference run: load both inputs with the default readers and pair them.

    Reads ``coordinatePath`` and ``gtfPath``, then hands both tables to
    ``pairGTFRowWithCoordinateFilter``. The paired table is not returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    annotations = readInGTF(gtfPath)

    # NOTE(review): the trailing True presumably enables printing of the paired
    # table seen in the cell output -- confirm against the pairer's signature.
    pairGTFRowWithCoordinateFilter(annotations, coordinates, True)


# Wall-clock the whole call from outside the decorator; the difference is in
# seconds and is printed again by the summary cell at the end of the notebook.
start_time = time.time()
baseline()
baseLineTime = (time.time() - start_time)

     Chromosome  Location Gene Name
0         chr12  20704380     PDE3A
1         chr12  20704379     PDE3A
2         chr21   9827238          
3          chr5  71146882          
4          chr8  38283717     FGFR1
...         ...       ...       ...
1798      chr12  20704363     PDE3A
1799      chr11  85195011      DLG2
1800      chr17  33478112    UNC45B
1801       chr8  70602348   SLCO5A1
1802      chr21   9827137          

[1803 rows x 3 columns]
'baseline'  94905.21 ms


#### Baseline Result = 95734.14 ms

Note : *This is about 96 seconds or close to a minute and a half*

## Idea 1 : Only reading in necessary columns

In [4]:
@timeit
def reducedColumns():
    """Idea 1: same pairing as the baseline, but load the GTF via the reduced reader.

    The only difference from the baseline run is that the GTF file is read with
    ``readInGTFReducedInput`` instead of ``readInGTF``. The paired table is not
    returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    # NOTE(review): per the notebook heading this reader loads only the columns
    # that are needed -- confirm in archerDX.utility.dataImport.
    annotations = readInGTFReducedInput(gtfPath)

    pairGTFRowWithCoordinateFilter(annotations, coordinates, True)

# Wall-clock the whole call from outside the decorator; the difference is in
# seconds and is printed again by the summary cell at the end of the notebook.
start_time = time.time()
reducedColumns()
ideaOneTime = (time.time() - start_time)

     Chromosome  Location Gene Name
0         chr12  20704380     PDE3A
1         chr12  20704379     PDE3A
2         chr21   9827238          
3          chr5  71146882          
4          chr8  38283717     FGFR1
...         ...       ...       ...
1798      chr12  20704363     PDE3A
1799      chr11  85195011      DLG2
1800      chr17  33478112    UNC45B
1801       chr8  70602348   SLCO5A1
1802      chr21   9827137          

[1803 rows x 3 columns]
'reducedColumns'  95136.91 ms


#### Reduced Input Time = 94278.68 ms

Note : *Here we reduce the time by 3 seconds. That's not a lot considering that it's still in the 1.5 minute range, but it's worth keeping.*

## Idea 2: Filtering the data before attempting to match

The idea here is to filter the data by chromosome and then start value to speed up the process

In [5]:
@timeit
def filteringDataFirst():
    """Idea 2: pass the GTF table through ``filterDataFrame`` before pairing.

    Loads both inputs with the default readers, replaces the GTF table with the
    output of ``filterDataFrame`` and then pairs as in the baseline. The paired
    table is not returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    annotations = readInGTF(gtfPath)

    # NOTE(review): the notebook text describes this step as filtering by
    # chromosome and then start value -- confirm in coordinateAnnotationPairer.
    prepared = filterDataFrame(annotations)
    pairGTFRowWithCoordinateFilter(prepared, coordinates, True)

# Wall-clock the whole call from outside the decorator; the difference is in
# seconds and is printed again by the summary cell at the end of the notebook.
start_time = time.time()
filteringDataFirst()
ideaTwoTime = (time.time() - start_time)

     Chromosome  Location Gene Name
0         chr12  20704380     PDE3A
1         chr12  20704379     PDE3A
2         chr21   9827238          
3          chr5  71146882          
4          chr8  38283717     FGFR1
...         ...       ...       ...
1798      chr12  20704363     PDE3A
1799      chr11  85195011      DLG2
1800      chr17  33478112    UNC45B
1801       chr8  70602348   SLCO5A1
1802      chr21   9827137          

[1803 rows x 3 columns]
'filteringDataFirst'  97642.40 ms


#### Filtered Data Time = 95533.52 ms

Note : *No real time saved. This is likely due to how we are filtering: the complex filters used in the code still have to look through all of the data, rather than exiting as soon as a single correct answer is found.*

## Idea 3 : Use a **For Loop** Rather than a Filter

By using two for loops with a break, the search may exit earlier than it would with a filter.

In [6]:
@timeit
def usingForLoop():
    """Idea 3: pair the tables with the loop-based matcher instead of the filter one.

    Loads both inputs with the default readers and calls
    ``pairGTFRowWithCoordinateLoop``. The paired table is not returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    annotations = readInGTF(gtfPath)

    pairGTFRowWithCoordinateLoop(annotations, coordinates, True)

# usingForLoop()  # left disabled: the run was stopped after more than twenty minutes

#### For Loop Time : Over twenty minutes

Note : *I cut it early as it was taking too long. This was obviously a bad idea and is very inefficient*

## Idea 4 : Using **Iterrows** with Pandas

In [7]:
@timeit
def usingIterrows():
    """Idea 4: pair the tables with the iterrows-based variant of the filter matcher.

    Loads both inputs with the default readers and calls
    ``pairGTFRowWithCoordinateFilterIterrows``. The paired table is not returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    annotations = readInGTF(gtfPath)

    pairGTFRowWithCoordinateFilterIterrows(annotations, coordinates, True)

# Wall-clock the whole call from outside the decorator; the difference is in
# seconds and is printed again by the summary cell at the end of the notebook.
start_time = time.time()
usingIterrows()
ideaFourTime = (time.time() - start_time)

     Chromosome  Location Gene Name
0         chr12  20704380     PDE3A
1         chr12  20704379     PDE3A
2         chr21   9827238          
3          chr5  71146882          
4          chr8  38283717     FGFR1
...         ...       ...       ...
1798      chr12  20704363     PDE3A
1799      chr11  85195011      DLG2
1800      chr17  33478112    UNC45B
1801       chr8  70602348   SLCO5A1
1802      chr21   9827137          

[1803 rows x 3 columns]
'usingIterrows'  96696.64 ms


#### Iter row Time : 95065.17 ms

Note : *Slightly slower than, or the same as, the baseline, so we will discard this option.*

## Idea 5 : Reducing Data Set Size

As suggested in the file containing instructions, many of the values in the GTF file are outside the range of the data we are looking for, so we remove those rows before matching.

In [8]:
@timeit
def reducingDataSetSize():
    """Idea 5: shrink the GTF table with ``filterDataFrameOutOfRange`` before pairing.

    Loads both inputs with the default readers, replaces the GTF table with the
    output of ``filterDataFrameOutOfRange(coordinateData, gtfData)`` and then
    pairs as in the baseline. The paired table is not returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    annotations = readInGTF(gtfPath)

    # NOTE(review): the cell output reports rows "removed based on Chromosome",
    # so this presumably drops GTF rows that no coordinate can match -- confirm.
    trimmed = filterDataFrameOutOfRange(coordinates, annotations)

    pairGTFRowWithCoordinateFilter(trimmed, coordinates, True)

# Wall-clock the whole call from outside the decorator; the difference is in
# seconds and is printed again by the summary cell at the end of the notebook.
start_time = time.time()
reducingDataSetSize()
ideaFiveTime = (time.time() - start_time)

-- Removed 50607 rows based on Chromosome
     Chromosome  Location Gene Name
0         chr12  20704380     PDE3A
1         chr12  20704379     PDE3A
2         chr21   9827238          
3          chr5  71146882          
4          chr8  38283717     FGFR1
...         ...       ...       ...
1798      chr12  20704363     PDE3A
1799      chr11  85195011      DLG2
1800      chr17  33478112    UNC45B
1801       chr8  70602348   SLCO5A1
1802      chr21   9827137          

[1803 rows x 3 columns]
'reducingDataSetSize'  92984.05 ms


#### Filtered GTF Data Time : 96625.74 ms

Note : *Faster than baseline! We removed enough rows to notice a speed increase.*

## Idea 6 : Two Filters Applied

Here we will filter the initial data pulled in from the CSV (as shown in Idea 1) before removing additional fields based on the start/end time (as shown in Idea 5).

In [9]:
@timeit
def twoFiltersApplied():
    """Idea 6: combine the reduced GTF reader with the out-of-range row filter.

    Reads the GTF file with ``readInGTFReducedInput``, passes it through
    ``filterDataFrameOutOfRange`` together with the coordinate table, and then
    pairs with ``pairGTFRowWithCoordinateFilter``. The paired table is not
    returned.
    """
    coordinates = readInCoordinateData(coordinatePath)
    annotations = readInGTFReducedInput(gtfPath)

    trimmed = filterDataFrameOutOfRange(coordinates, annotations)

    pairGTFRowWithCoordinateFilter(trimmed, coordinates, True)

# Wall-clock the whole call from outside the decorator; the difference is in
# seconds and is printed again by the summary cell at the end of the notebook.
start_time = time.time()
twoFiltersApplied()
ideaSixTime = (time.time() - start_time)

-- Removed 50607 rows based on Chromosome
     Chromosome  Location Gene Name
0         chr12  20704380     PDE3A
1         chr12  20704379     PDE3A
2         chr21   9827238          
3          chr5  71146882          
4          chr8  38283717     FGFR1
...         ...       ...       ...
1798      chr12  20704363     PDE3A
1799      chr11  85195011      DLG2
1800      chr17  33478112    UNC45B
1801       chr8  70602348   SLCO5A1
1802      chr21   9827137          

[1803 rows x 3 columns]
'twoFiltersApplied'  89901.22 ms


#### Multiple Filter Applied Time : 88982.62 ms

Note : *Fastest so far by a very slim margin. We will be using this in the main program*

# Conclusion

In [10]:
# Summary of the outer wall-clock measurements. Each value is a difference of
# two time.time() readings, i.e. seconds (the decorator output above is in ms).
# Idea 3 has no entry because its call was never executed.
print(f"Base Time      | {baseLineTime}")
print(f"Idea One Time  | {ideaOneTime}")
print(f"Idea Two Time  | {ideaTwoTime}")
print(f"Idea Four Time | {ideaFourTime}")
print(f"Idea Five Time | {ideaFiveTime}")
print(f"Idea Six Time  | {ideaSixTime}")


Base Time      | 94.90556716918945
Idea One Time  | 95.13729691505432
Idea Two Time  | 97.64279580116272
Idea Four Time | 96.69727206230164
Idea Five Time | 92.98456907272339
Idea Six Time  | 89.90153813362122
