In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Location of the AAPL historical price CSV used throughout this notebook
AAPL_CSV_PATH = '/Users/asamarakone/Desktop/Data_Sources/AAPL.csv'

# Load the price history and preview the first rows
df = pd.read_csv(AAPL_CSV_PATH)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2000-01-03,3.745536,4.017857,3.631696,3.997768,2.677157,133949200
1,2000-01-04,3.866071,3.950893,3.613839,3.660714,2.451444,128094400
2,2000-01-05,3.705357,3.948661,3.678571,3.714286,2.487319,194580400
3,2000-01-06,3.790179,3.821429,3.392857,3.392857,2.27207,191993200
4,2000-01-07,3.446429,3.607143,3.410714,3.553571,2.379695,115183600


In [2]:
# Preview the last rows of the loaded price history
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
4743,2018-11-07,205.970001,210.059998,204.130005,209.949997,209.219986,33424400
4744,2018-11-08,209.979996,210.119995,206.75,208.490005,208.490005,25362600
4745,2018-11-09,205.550003,206.009995,202.25,204.470001,204.470001,34365800
4746,2018-11-12,199.0,199.850006,193.789993,194.169998,194.169998,51135500
4747,2018-11-13,191.630005,197.179993,191.449997,192.229996,192.229996,46809900


In [3]:
# Intraday move (close minus open) and intraday range (high minus low)
df['Close - Open'] = df['Close'].sub(df['Open'])
df['High - Low'] = df['High'].sub(df['Low'])

# Split the 'YYYY-MM-DD' date strings once, then store each piece as an integer column
date_parts = df['Date'].str.split('-', expand=True)
df['Month'] = date_parts[1].astype('int64')
df['Day'] = date_parts[2].astype('int64')
df['Year'] = date_parts[0].astype('int64')

**Function definitions**

In [4]:
def splitArea(data, start_date, end_date, column_name, ref_price, b):
    
    '''Splits the area under a piecewise-linear curve at a horizontal reference line.

       The curve joins consecutive values of column_name (rows from start_date to
       end_date, inclusive) with straight lines spaced b apart. For every segment
       the trapezoid (Y[i] + Y[i+1])*b/2 is divided into the part lying below the
       reference price and the part lying above it; the two parts always add up
       to the full trapezoid.

       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date to define the first point of the curve (e.g. '2018-08-01')
       end_date - later date to define the last point of the curve 
       column_name - name of column to be analyzed (e.g. 'Volume')
       ref_price - reference price
       b - arbitrary distance between adjacent dates

       Returns the tuple (areaBelow, areaAbove).
       Raises ValueError if start_date or end_date is not present in data['Date'].

       Note: when ref_price is not positive and lies under a segment, the whole
       trapezoid is counted as "above" and nothing is added to "below".
       '''
    # Positions of the first and last rows of the requested window
    dates = list(data['Date'])
    si = dates.index(start_date)
    fi = dates.index(end_date)
    
    # Values from the selected column between start_date and end_date (inclusive)
    Y = data[column_name][si:fi+1].values
    
    areaAbove = 0.0
    areaBelow = 0.0
    
    for i in range(0, len(Y)-1):
        lo = min([Y[i], Y[i+1]])
        hi = max([Y[i], Y[i+1]])
        trapezoid = (Y[i] + Y[i+1])*b/2.0
        
        if (ref_price < lo) and (ref_price > 0.0):
            # Segment entirely above a positive reference: rectangle below, rest above
            areaBelow = areaBelow + ref_price*b
            areaAbove = areaAbove + (Y[i] + Y[i+1] - 2.0*ref_price)/2.0*b
        elif ref_price == lo:
            # Reference touches the lower end point: rectangle below, triangle above
            areaBelow = areaBelow + ref_price*b
            areaAbove = areaAbove + abs(Y[i]-Y[i+1])*b/2.0
        elif (ref_price > lo) and (ref_price < hi):
            # Segment crosses the reference. The part above is a triangle of height
            # (hi - ref_price) whose base is only the fraction (hi - ref_price)/(hi - lo)
            # of b, not the full width b.
            above = (hi - ref_price)**2*b/(2.0*(hi - lo))
            areaAbove = areaAbove + above
            areaBelow = areaBelow + (trapezoid - above)
        elif ref_price >= hi:
            # Segment entirely at or below the reference
            areaBelow = areaBelow + trapezoid
        else:
            # Non-positive reference under the segment: count the whole trapezoid as above
            areaAbove = areaAbove + trapezoid
        
    return(areaBelow,areaAbove)

In [5]:
def signedSplitArea(data, start_date, end_date, column_name, ref_price, b):
    
    '''Direction-signed version of the below/above reference-price area split.

       The curve joins consecutive values of column_name (rows from start_date to
       end_date, inclusive) with straight lines spaced b apart. For every segment
       the area under the segment is divided into the part below the reference
       price and the part above it. Both parts are ADDED to the running totals
       when the segment is flat or rising (Y[i] <= Y[i+1]) and SUBTRACTED when the
       segment is falling.

       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date to define the first point of the curve (e.g. '2018-08-01')
       end_date - later date to define the last point of the curve 
       column_name - name of column to be analyzed (e.g. 'Volume')
       ref_price - reference price
       b - arbitrary distance between adjacent dates

       Returns the tuple (areaBelow, areaAbove) of signed totals.
       Raises ValueError if start_date or end_date is not present in data['Date'].
       '''
    # Positions of the first and last rows of the requested window
    dates = list(data['Date'])
    si = dates.index(start_date)
    fi = dates.index(end_date)
    
    # Values from the selected column between start_date and end_date (inclusive)
    Y = data[column_name][si:fi+1].values
    
    areaAbove = 0.0
    areaBelow = 0.0
    
    for i in range(0, len(Y)-1):
        lo = min([Y[i], Y[i+1]])
        hi = max([Y[i], Y[i+1]])
        trapezoid = (Y[i] + Y[i+1])*b/2.0
        
        # Unsigned split of this segment's trapezoid
        if (ref_price < lo) and (ref_price > 0.0):
            below = ref_price*b
            above = (Y[i] + Y[i+1] - 2.0*ref_price)/2.0*b
        elif ref_price == lo:
            below = ref_price*b
            above = abs(Y[i]-Y[i+1])*b/2.0
        elif (ref_price > lo) and (ref_price < hi):
            # Segment crosses the reference: the part above is a triangle whose base
            # is only the fraction (hi - ref_price)/(hi - lo) of b.
            above = (hi - ref_price)**2*b/(2.0*(hi - lo))
            below = trapezoid - above
        elif ref_price >= hi:
            below = trapezoid
            above = 0.0
        else:
            # Non-positive reference under the segment: whole trapezoid counts as above
            below = 0.0
            above = trapezoid
        
        # Apply the direction sign to BOTH parts as a whole, so a falling segment
        # contributes the exact negative of the same segment traversed upwards.
        if Y[i] <= Y[i+1]:
            areaBelow = areaBelow + below
            areaAbove = areaAbove + above
        else:
            areaBelow = areaBelow - below
            areaAbove = areaAbove - above
        
    return(areaBelow,areaAbove)

In [6]:
def areaBetweenMaxAndCurve(data, start_date, end_date, column_name, b):
    
    '''Returns the area above the curve that is below the max value of the curve

       The curve joins consecutive values of column_name (rows from start_date to
       end_date, inclusive) with straight lines spaced b apart. The result is the
       rectangle of height max(curve) spanning the window minus the trapezoid-rule
       area under the curve.
       
       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date to define the first point of the curve (e.g. '2018-08-01')
       end_date - later date to define the last point of the curve 
       column_name - name of column to be analyzed (e.g. 'Volume')
       b - arbitrary distance between adjacent dates

       Raises ValueError if either date is missing from data['Date'] or if the
       selected window is empty.
    '''
        
    dates = list(data['Date'])
    subset = data[column_name][dates.index(start_date):dates.index(end_date)+1].values
    Max = max(subset)
    
    # Integrate the curve directly. Taking only the "area above 0" part of a
    # reference-price split drops or distorts segments that touch or go below
    # zero (e.g. a 'Close - Open' column), which overstates nothing for
    # all-positive data but gives a wrong answer otherwise.
    areaUnderCurve = 0.0
    for i in range(0, len(subset)-1):
        areaUnderCurve = areaUnderCurve + (subset[i] + subset[i+1])*b/2.0
    
    return(Max*(len(subset)-1.0)*b - areaUnderCurve)

In [7]:
def length_of_curve(data, start_date, end_date, column_name, b = 1.0):
    
    '''Measures the polyline obtained by joining consecutive column values with straight lines
    
       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date of the first point on the polyline (e.g. '2018-08-01')
       end_date - later date giving the last point on the polyline 
       column_name - name of the column whose values form the polyline (e.g. 'Volume')
       b - horizontal distance assumed between adjacent dates

       Returns the sum of the straight-line segment lengths (0 when the window
       holds fewer than two points).
    '''  
    # Row positions of the window's first and last dates
    first = list(data['Date']).index(start_date)
    last = list(data['Date']).index(end_date)
    
    # Column values inside the window (inclusive of both ends)
    values = data[column_name][first:last+1].values
    
    total = 0
    
    # Each segment is the hypotenuse of a right triangle with legs b and |dy|
    for previous, current in zip(values[:-1], values[1:]):
        total = total + (b*b + abs(previous-current)**2.0)**0.5
        
    return(total)

In [8]:
def cyclicOperator(data, start_date, end_date, column_name, operators):

    '''Returns result from applying list of operators cyclically between values of desired column

       Starting from the first value in the window, each following value is
       combined with the running result using the next operator in the list;
       the list restarts from its first entry once it is exhausted. For values
       [y0, y1, y2, y3] and operators ['+', '*'] the result is ((y0 + y1) * y2) + y3.
    
       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date to define the first point of the curve (e.g. '2018-08-01')
       end_date - later date to define the last point of the curve 
       column_name - name of column to be analyzed (e.g. 'Volume')
       operators - list of operators (arbitrary size) to apply cyclically (e.g. ['+','-','/','*'])

       Raises ValueError if a date is missing from data['Date'], if operators
       contains a symbol other than '+', '-', '*' or '/', or if operators is
       empty while there is at least one operation to perform.
       Raises IndexError if the selected window is empty.
    '''  
    # Supported symbols and the binary operation each one stands for
    operations = {
        '+': lambda left, right: left + right,
        '-': lambda left, right: left - right,
        '*': lambda left, right: left * right,
        '/': lambda left, right: left / right,
    }

    # Indexes
    dates = list(data['Date'])
    si = dates.index(start_date)
    fi = dates.index(end_date)

    Y = data[column_name][si:fi+1].values
    result = Y[0]

    # Reject unknown symbols up front instead of silently skipping those steps
    unsupported = [op for op in operators if op not in operations]
    if unsupported:
        raise ValueError('Unsupported operator(s): ' + ', '.join(repr(op) for op in unsupported))

    steps = len(Y) - 1
    if steps > 0 and len(operators) == 0:
        raise ValueError('operators must contain at least one operator')

    for i in range(0, steps):
        # i % len(operators) walks through the operator list cyclically
        op = operators[i % len(operators)]
        result = operations[op](result, Y[i+1])

    return(result)

In [10]:
def partitionByAreaUnderneathCurve(data, start_date, end_date, column_name, b, area):
    
    '''Returns a list of dates between the start date and end date 
       (including the start date) such that the area underneath the curve between 
       adjacent dates is as close as possible to the ideal area

       The window is scanned left to right. As soon as the area accumulated since
       the last boundary exceeds the ideal area, a boundary is placed either
       before or after the segment that caused the overshoot, whichever leaves
       the interval's area closer to the ideal (ties go to the earlier date).
       An interval always contains at least one segment. end_date is only part
       of the result if it happens to be chosen as a boundary.

       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date to define the left end point of the curve you want to partition (e.g. '2018-08-01')
       end_date - date to define the right end point of the curve you wish to partition
       column_name - name of column to be analyzed (e.g. 'Volume')
       b - distance between adjacent dates
       area - ideal area underneath each partition of the curve

       Raises ValueError if start_date or end_date is not present in data['Date'].
    '''
    
    # To hold dates which partition the curve (including the start_date)
    boundary_dates = [start_date]
    
    # Positions of start_date and end_date within the Date column. Dates are
    # looked up by POSITION from this list everywhere below, so the function
    # does not depend on the data frame's index labels.
    dates = list(data['Date'])
    si = dates.index(start_date)
    fi = dates.index(end_date)
    
    cumulative_area = 0
    cumulative_areas = [0] 
    
    for i in range(si,fi):
        # Area (above a reference of 0) underneath the line joining 2 adjacent points
        areaBelow, areaAbove = splitArea(data = data, start_date = dates[i], end_date = dates[i+1], column_name = column_name, ref_price = 0.0, b = b)
        cumulative_area = cumulative_area + areaAbove
        cumulative_areas.append(cumulative_area)

    i = 0
    
    start_of_interval_index = 0
    
    while i < len(cumulative_areas)-1:

        end_of_interval_index = i+1 
        area_in_interval = cumulative_areas[end_of_interval_index] - cumulative_areas[start_of_interval_index]
        
        if (area_in_interval > area): 
            
            # Area of the interval if it stops just before the current segment
            area_without_segment = cumulative_areas[i] - cumulative_areas[start_of_interval_index]
            
            # When the interval is still empty (i == start_of_interval_index) the
            # segment must be taken: closing an empty interval would re-append the
            # same date and never advance, looping forever.
            take_segment = (i == start_of_interval_index) or (abs(area_in_interval - area) < abs(area_without_segment - area))
            
            if take_segment:
                boundary_dates.append(dates[i + 1 + si])
                start_of_interval_index = i + 1
                i = i + 1
            else:
                # Close the interval before the segment; the segment is
                # re-examined as the first piece of the next interval.
                boundary_dates.append(dates[i + si])
                start_of_interval_index = i
        else:
            i = i + 1
                
    return(boundary_dates)

In [11]:
def partitionByLengthOfCurve(data, start_date, end_date, column_name, b, length):
    
    '''Returns a list of dates between the start date and end date 
       (including the start date) such that the length of the curve between 
       adjacent dates of that list is as close as possible to the ideal length

       The window is scanned left to right. As soon as the curve length
       accumulated since the last boundary exceeds the ideal length, a boundary
       is placed either before or after the segment that caused the overshoot,
       whichever leaves the interval's length closer to the ideal (ties go to
       the earlier date). An interval always contains at least one segment.
       end_date is only part of the result if it happens to be chosen as a boundary.

       data - pandas dataFrame obtained by reading Yahoo Finance Historical Data CSV file
       start_date - date to define the left end point of the curve you want to partition (e.g. '2018-08-01')
       end_date - date to define the right end point of the curve you want to partition
       column_name - name of column to be analyzed (e.g. 'Volume')
       b - distance between adjacent dates
       length - ideal length of each partition of the curve

       Raises ValueError if start_date or end_date is not present in data['Date'].
    '''
    
    # To hold dates which partition the curve (including the start_date)
    boundary_dates = [start_date]
    
    # Positions of start_date and end_date within the Date column. Dates and
    # values are looked up by POSITION everywhere below, so the function does
    # not depend on the data frame's index labels.
    dates = list(data['Date'])
    si = dates.index(start_date)
    fi = dates.index(end_date)
    
    # Column values inside the window (inclusive of both ends)
    Y = data[column_name].iloc[si:fi+1].values
    
    cumulative_length = 0
    cumulative_lengths = [0] 
    
    for k in range(0, len(Y)-1):
        # Length of the straight line joining 2 adjacent points: hypotenuse of a
        # right triangle with legs b and |Y[k] - Y[k+1]|. Computed in place so the
        # Date column is not rescanned for every segment.
        cumulative_length = cumulative_length + (b*b + abs(Y[k]-Y[k+1])**2.0)**0.5
        cumulative_lengths.append(cumulative_length)

    i = 0
    
    start_of_interval_index = 0
    
    while i < len(cumulative_lengths)-1:
        
        end_of_interval_index = i+1 
        length_in_interval = cumulative_lengths[end_of_interval_index] - cumulative_lengths[start_of_interval_index]
        
        if (length_in_interval > length): 
            
            # Length of the interval if it stops just before the current segment
            length_without_segment = cumulative_lengths[i] - cumulative_lengths[start_of_interval_index]
            
            # When the interval is still empty (i == start_of_interval_index) the
            # segment must be taken: closing an empty interval would re-append the
            # same date and never advance, looping forever.
            take_segment = (i == start_of_interval_index) or (abs(length_in_interval - length) < abs(length_without_segment - length))
            
            if take_segment:
                boundary_dates.append(dates[i + 1 + si])
                start_of_interval_index = i + 1
                i = i + 1
            else:
                # Close the interval before the segment; the segment is
                # re-examined as the first piece of the next interval.
                boundary_dates.append(dates[i + si])
                start_of_interval_index = i
        else:
            i = i + 1
                
    return(boundary_dates)