The purpose of this notebook is to build a DataFrame whose rows are re-drawn with the sequential bootstrap, so that the observations in it are closer to an IID sample.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_info_used(df):

    """
    Collect the exit timestamps of every executed long and short position.

    df: A pandas dataframe. The columns read by this function are
     buy_executed: 1 if a limit buy order is filled at the next timestamp and 0 otherwise.
     sell_executed: 1 if a limit sell order is filled at the next timestamp and 0 otherwise.
     buy_exit_date: exit time of each long position (passed through pd.to_datetime).
     sell_exit_date: exit time of each short position (passed through pd.to_datetime).
     The frame also carries y_buy / y_sell (the primary model's returns) and
     exit_price_buy / exit_price_sell, which are not read here.
     # More details explained in calc returns with dynamic pt sl orders.ipynb.

    Returns the tuple (info_used_buy, info_used_sell, t1_buy, t1_sell):
     info_used_*: one-column dataframe of exit times indexed like the executed
      rows of df; rows without an exit time are kept as NaT.
     t1_*: the same exit times in a column named 'dummy' on a fresh 0..n-1
      index, with the rows lacking an exit time dropped (the surviving index
      values are therefore row positions in info_used_*).

    """
    def _exit_times(executed_query, exit_col):
        # Filter once and reuse the result for both the values and the index.
        executed = df.query(executed_query)
        exits = pd.to_datetime(executed[exit_col], format='%Y%m%d %H:%M')
        info_used = pd.DataFrame(exits, index=executed.index)
        t1 = pd.DataFrame(info_used.values, columns=['dummy'])
        return info_used, t1.dropna()

    info_used_buy, t1_buy = _exit_times('buy_executed==1', 'buy_exit_date')
    info_used_sell, t1_sell = _exit_times('sell_executed==1', 'sell_exit_date')

    return info_used_buy, info_used_sell, t1_buy, t1_sell

In [3]:
def get_idenM(df,info_used,t1):

    """
    Build the indicator matrix of which timestamps each label spans.

    df: A dataframe whose index holds every timestamp of the study; only the
     index is used.
    info_used: info_used_buy/sell from get_info_used. Its index gives the start
     time of each label, and its number of rows fixes the number of columns.
    t1: t1_buy/sell from get_info_used. Column 'dummy' holds the exit times and
     the index holds the row position of each exit time inside info_used
     (rows without an exit time have already been dropped).

    Returns a float dataframe indexed like df with columns
    range(info_used.shape[0]). Column k is 1 from the k-th start time through
    its exit time (both ends included, as with any .loc label slice) and 0
    elsewhere. Labels that have no exit time keep an all-zero column.

    """
    idenM=pd.DataFrame(0.0,index=df.index,columns=range(info_used.shape[0]))

    # Pair every exit time with the start time at the SAME row position of
    # info_used. Zipping the two indexes positionally would shift every label
    # that follows a dropped (NaT) exit time onto the wrong start and column.
    for pos,end in t1['dummy'].items():
        start=info_used.index[pos]
        idenM.loc[start:end,pos]=1

    return idenM

In [18]:
def get_avg_uq(idenM):

    """
    Return the average uniqueness of each label, one value per column of idenM.

    For every row, each entry is divided by that row's sum across all columns
    (the number of labels active on the row when idenM is a 0/1 matrix). A
    label's average uniqueness is the mean of its strictly positive ratios;
    a column with no positive ratio yields NaN.

    """
    concurrency = idenM.sum(axis=1)
    uniqueness = idenM.div(concurrency, axis=0)
    # Keep only the rows on which the label is active; zeros and the 0/0 rows
    # become NaN and are skipped by mean().
    active_only = uniqueness.where(uniqueness > 0)
    return active_only.mean()

In [77]:
def seqBootstrap(idenM,sLength=None):

    """
    Draw labels (columns of idenM) one at a time with the sequential bootstrap.

    idenM: indicator matrix, rows are timestamps and columns are labels.
    sLength: number of draws. None keeps the historical default of 3 draws.
     # NOTE(review): the usual choice would be idenM.shape[1]; 3 is kept so that
     # existing calls with sLength=None return the same number of labels.

    Returns (phi, avgU, prob):
     phi: A list of the drawn labels, in draw order (a label may repeat).
     avgU: A pandas series, indexed by label, holding each label's average
      uniqueness as evaluated for the LAST draw.
     prob: avgU/avgU.sum(), the probabilities given to np.random.choice for the
      last draw.
     avgU and prob are empty series when no draw is made (sLength <= 0).

    How a draw works:
     Every candidate label i is evaluated as if it were added to the labels
     already in phi: on each row, its uniqueness is idenM[i] divided by the sum
     of the already drawn columns plus idenM[i]. The candidate's average
     uniqueness is the mean of its positive ratios (0 when it has none), which
     is the value get_avg_uq returns for the last column of idenM[phi+[i]].
     Each label's share of the total average uniqueness is its probability.

     On the first draw nothing has been drawn yet, so every label that is
     active on at least one row has average uniqueness 1 and the draw is
     uniform over those labels. After each draw the distribution changes,
     which is what distinguishes this from an ordinary bootstrap.

     A candidate that overlaps an already drawn label sees a larger row sum,
     hence a lower uniqueness, a lower average uniqueness, a smaller share of
     the total and therefore a lower chance of being drawn. A candidate far
     from every drawn label (e.g. r_100 after r_0 was drawn) is unaffected and
     keeps an average uniqueness of 1.

    Raises ValueError when no label has positive average uniqueness, because
    no probability distribution can be formed.

    """
    if sLength is None:
        sLength=3

    phi=[]
    # Defined up front so the return below is valid even when no draw happens.
    avgU=pd.Series(dtype=float)
    prob=pd.Series(dtype=float)
    # Row-wise sum of the columns drawn so far (repeated draws count each time).
    drawn_sum=pd.Series(0.0,index=idenM.index)

    while len(phi)<sLength:
        # One vectorised pass evaluates every candidate at once instead of
        # re-slicing idenM per label.
        row_total=idenM.add(drawn_sum,axis=0)
        uniq=idenM.div(row_total)
        avgU=uniq[uniq>0].mean().fillna(0)

        total=avgU.sum()
        if not total>0:
            raise ValueError('seqBootstrap: no label has positive average uniqueness')
        prob=avgU/total # the higher a label's uniqueness, the larger its share

        pick=np.random.choice(idenM.columns,p=prob)
        phi+=[pick]
        drawn_sum=drawn_sum+idenM[pick]

    return phi,avgU,prob
            

In [124]:
def get_tp_needed(idenM):

    """
    Map every label to the first timestamp at which it is active.

    idenM: indicator matrix, rows are timestamps and columns are labels.

    tp_needed: A dict with each return label (column of idenM) as key and, as
     value, the first index entry at which that column equals 1. Labels whose
     column never equals 1 are left out.

    df.items(): returns col name and the values corresponding to the col name
     (used instead of iteritems(), which pandas 2.0 removed).

    """
    tp_needed={}
    for label,column in idenM.items():
        hits=column.index[(column==1).to_numpy()]
        if len(hits):
            tp_needed[label]=hits[0]

    return tp_needed

In [139]:
def get_BSed_obs(df,phi,tp_needed=None):

    """
    Rearrange df into the order produced by the sequential bootstrap.

    in
    -df: A dataframe containing features and labels (df for y_buy/sell must be processed seperately)
    -phi: A list of return labels that were sequentially bootstrapped
    -tp_needed: A dict mapping each label to its timestamp, as returned by
     get_tp_needed. When omitted, the notebook-level variable named tp_needed
     is used, which keeps existing two-argument calls working.

    out
    -df: df reindexed to the timestamps of the labels in phi, in that order
     (a timestamp that is absent from df yields a row of NaN).
    -index: A list containing the timestamps rearranged the way the output from seqBootstrap is

    Raises NameError when tp_needed is neither passed nor defined at notebook
    level, and KeyError when a label of phi is missing from tp_needed.

    """
    if tp_needed is None:
        # Backward-compatible fallback to the notebook global; passing the
        # mapping explicitly avoids depending on cell execution order.
        try:
            tp_needed=globals()['tp_needed']
        except KeyError:
            raise NameError("name 'tp_needed' is not defined; pass tp_needed=get_tp_needed(idenM)") from None

    index=[tp_needed[i] for i in phi]
    df=df.reindex(index=index)

    return df,index

In [55]:
# Load the prepared dataframe (features plus the *_executed flags, exit dates,
# exit prices and y_buy / y_sell returns) from a parquet file in the working directory.
df=pd.read_parquet('df_with_info')
df

Unnamed: 0_level_0,open,high,low,close,maker fee,daily vol,BBANDS_upperband,BBANDS_middleband,BBANDS_lowerband,DEMA,...,Upper,Lower,buy_executed,sell_executed,buy_exit_date,exit_price_buy,y_buy,sell_exit_date,exit_price_sell,y_sell
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-23 05:50:00,49963.0,50600.0,48779.5,50060.0,-0.00025,0.029728,1565.719965,416.25,-733.219965,1207.837059,...,,,1,1,2021-02-23 06:35:00,50094.5,0.010622,2021-02-23 06:35:00,50094.5,-0.008070
2021-02-23 06:20:00,50842.0,51216.5,49530.0,50700.0,-0.00025,0.029862,809.986002,-288.85,-1387.686002,425.403967,...,51003.0,50094.5,1,0,2021-02-23 07:05:00,49562.0,-0.012327,NaT,,0.000000
2021-02-23 06:35:00,50700.0,50863.5,50005.0,50169.0,-0.00025,0.030253,469.817542,-457.05,-1383.917542,211.108081,...,50472.5,49562.0,1,0,2021-02-23 07:35:00,50166.5,0.010303,NaT,,0.000000
2021-02-23 07:05:00,49740.0,50465.0,49474.0,49860.5,-0.00025,0.030670,911.688266,69.70,-772.288266,501.199109,...,50166.5,49249.0,0,1,NaT,,0.000000,2021-02-23 07:50:00,49938.5,-0.007683
2021-02-23 07:35:00,50062.5,50560.5,49606.5,50558.0,-0.00025,0.030639,811.992971,186.00,-439.992971,318.906714,...,50868.0,49938.5,1,0,2021-02-23 08:05:00,49178.5,-0.017285,NaT,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-16 00:20:00,22650.5,22694.0,22398.0,22555.0,0.00010,0.053747,145.796931,39.90,-65.996931,-189.536198,...,22797.5,22070.0,0,1,NaT,,0.000000,2022-06-16 01:05:00,22771.0,-0.001669
2022-06-16 00:35:00,22555.0,22877.5,22464.5,22527.5,0.00010,0.054057,10.670640,-101.20,-213.070640,-266.317196,...,22771.0,22040.5,0,1,NaT,,0.000000,NaT,,0.000000
2022-06-16 01:05:00,22667.5,23032.5,22424.5,22951.5,0.00010,0.054733,239.011717,-91.70,-422.411717,-228.163243,...,23202.5,22449.0,1,0,NaT,,0.000000,NaT,,0.000000
2022-06-16 01:20:00,22951.5,23069.0,22600.0,22687.0,0.00010,0.055025,140.931599,-160.20,-461.331599,-281.603797,...,22936.5,22187.5,1,0,NaT,,0.000000,NaT,,0.000000


In [78]:
# NOTE(review): idenM is not created in any cell shown above - presumably it came from
# get_info_used(df) followed by get_idenM(df, info_used_*, t1_*) in a cell that was removed.
# TODO restore that cell so the notebook survives Restart & Run All.
# sLength=None makes seqBootstrap fall back to its hard-coded 3 draws.
phi,avgU,prob=seqBootstrap(idenM,sLength=None)

  avgU=pd.Series()


In [121]:
# Labels (column ids of idenM) drawn by the sequential bootstrap, in draw order.
phi

[1637, 2026, 2698]

In [125]:
# Map each label to the first timestamp at which its idenM column equals 1.
# get_BSed_obs looks this notebook-level name up, so this cell must run before it.
tp_needed=get_tp_needed(idenM)

In [135]:
# Reindex df to the timestamps of the bootstrapped labels.
# NOTE(review): this overwrites the df loaded earlier, so the cell is not idempotent -
# re-run the load cell before re-running anything that needs the full frame.
df,index=get_BSed_obs(df,phi)

In [137]:
# Timestamps of the drawn labels, in the same order as phi.
index

[Timestamp('2021-05-08 21:05:00'),
 Timestamp('2021-05-23 12:20:00'),
 Timestamp('2021-06-28 17:35:00')]

In [138]:
# Sanity check: the timestamp stored for label 1637 (phi[0]) equals index[0].
tp_needed[1637]

Timestamp('2021-05-08 21:05:00')