# install talib

In [1]:
# !cp ../input/talibinstall/ta-lib-0.4.0-src.tar.gzh  ./ta-lib-0.4.0-src.tar.gz
# !tar -xzvf ta-lib-0.4.0-src.tar.gz > null
# !cd ta-lib && ./configure --prefix=/usr > null && make  > null && make install > null
# !cp ../input/talibinstall/TA-Lib-0.4.21.tar.gzh TA-Lib-0.4.21.tar.gz
# !pip install TA-Lib-0.4.21.tar.gz > null
# !pip install ../input/talibinstall/numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl >null
# import talib
# talib.__version__

### メモリ削減

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
import psutil

# 特徴量エンジニアリング

In [4]:
import pandas as pd
import numpy as np
import talib
from sklearn.preprocessing import StandardScaler
import gc

class Feature():

    def __init__(self) -> None:
        pass
    
    def __del__(self):
        """オブジェクトが破棄されるとき呼び出される"""
        print('Feature died:', id(self))

    def conv_data(self, df, Asset_ID, df_list, save_fet=False, save_name='feature', save_mem=True, test=False):
        
        df.fillna(method='ffill', inplace=True)
        
        if not test:
        
            for i in range(13):
                if not i == Asset_ID:
                    as_df = df_list[i].copy()
                    df[str(i)+'VWAP_shift105'] = as_df['VWAP'].shift(105) / as_df["VWAP"]
                    df[str(i)+'MOM'] = talib.MOM(as_df['Close'], timeperiod=176)

                    del as_df
                    gc.collect()

            df['VWAP_shift15'] = df['VWAP'].shift(15) / df["VWAP"]
            df['VWAP_shift30'] = df['VWAP'].shift(30) / df["VWAP"]
            df['VWAP_shift45'] = df['VWAP'].shift(45) / df["VWAP"]
            df['VWAP_shift90'] = df['VWAP'].shift(90) / df["VWAP"]
            df['VWAP_shift120'] = df['VWAP'].shift(120) / df["VWAP"]
            df['VWAP_shift180'] = df['VWAP'].shift(180) / df["VWAP"]
            df['VWAP_shift210'] = df['VWAP'].shift(210) / df["VWAP"]
            df['VWAP_shift240'] = df['VWAP'].shift(240) / df["VWAP"]
            df['VWAP_shift310'] = df['VWAP'].shift(310) / df["VWAP"]
            df['VWAP_shift350'] = df['VWAP'].shift(350) / df["VWAP"]
            df['VWAP_shift400'] = df['VWAP'].shift(400) / df["VWAP"]
            df['VWAP_shift450'] = df['VWAP'].shift(450) / df["VWAP"]
            df['VWAP_shift550'] = df['VWAP'].shift(550) / df["VWAP"]
            df['VWAP_shift600'] = df['VWAP'].shift(600) / df["VWAP"]
            df['VWAP_shift650'] = df['VWAP'].shift(650) / df["VWAP"]
            df['VWAP_shift750'] = df['VWAP'].shift(750) / df["VWAP"]
            df['VWAP_shift800'] = df['VWAP'].shift(800) / df["VWAP"]
            df['VWAP_shift1000'] = df['VWAP'].shift(1000) / df["VWAP"]
            df['VWAP_shift1500'] = df['VWAP'].shift(1500) / df["VWAP"]
            df['VWAP_shift2000'] = df['VWAP'].shift(2000) / df["VWAP"]
            df['VWAP_shift2500'] = df['VWAP'].shift(2500) / df["VWAP"]
            df['VWAP_shift3000'] = df['VWAP'].shift(3000) / df["VWAP"]
            df['VWAP_shift3500'] = df['VWAP'].shift(3500) / df["VWAP"]
            df['VWAP_shift4000'] = df['VWAP'].shift(4000) / df["VWAP"]
            df['VWAP_shift4500'] = df['VWAP'].shift(4500) / df["VWAP"]
            df['VWAP_shift5000'] = df['VWAP'].shift(5000) / df["VWAP"]
            df['VWAP_shift5500'] = df['VWAP'].shift(5500) / df["VWAP"]
            df['VWAP_shift6000'] = df['VWAP'].shift(6000) / df["VWAP"]
            df['VWAP_shift7000'] = df['VWAP'].shift(7000) / df["VWAP"]
            df['VWAP_shift8000'] = df['VWAP'].shift(8000) / df["VWAP"]
            df['VWAP_shift9000'] = df['VWAP'].shift(9000) / df["VWAP"]
            df['VWAP_shift10000'] = df['VWAP'].shift(10000) / df["VWAP"]
            df['VWAP_shift11000'] = df['VWAP'].shift(11000) / df["VWAP"]
            df['VWAP_shift12000'] = df['VWAP'].shift(12000) / df["VWAP"]
            df['VWAP_shift14000'] = df['VWAP'].shift(14000) / df["VWAP"]
            df['VWAP_shift16000'] = df['VWAP'].shift(16000) / df["VWAP"]
            df['VWAP_shift18000'] = df['VWAP'].shift(18000) / df["VWAP"]
            df['VWAP_shift25000'] = df['VWAP'].shift(25000) / df["VWAP"]
            df['Volume_shift_sma2000'] = (df['Volume'].rolling(2000).sum() / df["Volume"]).apply(np.log)
            df['Close_shift70'] = df['Close'].shift(70) / df["Close"]
            df['Close_shift280'] = df['Close'].shift(280) / df["Close"]
            df['SMA_18000_std'] = df['Close'].rolling(18000).std().shift() / df['Close']
            df['ROCP'] = talib.ROCP(df['Close'], timeperiod=181)
            df['MOM'] = talib.MOM(df['Close'], timeperiod=176)
            df['MOM_1700'] = talib.MOM(df['Close'], timeperiod=1760)
            df['RSI'] = talib.RSI(df['Close'], timeperiod=206)
            df['EMA'] = (talib.EMA(df['Close'], timeperiod=11) - df['Close']) / df["Close"]
            df['APO'] = talib.APO(df['Close'], fastperiod=117, slowperiod=166, matype=0)
            df['CMO'] = talib.CMO(df['Close'], timeperiod=204)
            macd, macdsignal, macdhist = talib.MACD(df['Close'], fastperiod=324, slowperiod=296, signalperiod=272)
            df['macdhist0'] = macdhist
            df['macdhist1'] = macdhist.shift(1)
            df['macdhist5'] = macdhist.shift(5)
            df['macdhist15'] = macdhist.shift(15)
            df['ADX'] = talib.ADX(df["High"], df["Low"], df["Close"], timeperiod=181)
            df['AD'] = talib.AD(df["High"], df["Low"], df["Close"], df["Volume"])
            df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df["Close"], timeperiod=181)
            df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df["Close"])
            df['NATR'] = talib.NATR(df["High"], df["Low"], df["Close"], timeperiod=181)
            df['AROONOSC'] = talib.AROONOSC(df["High"], df["Low"], timeperiod=181)
            aroondown, aroonup = talib.AROON(df["High"], df["Low"], timeperiod=181)
            df['aroondown'] = aroondown
            df['aroonup'] = aroonup
            df['PLUS_DI'] = talib.PLUS_DI(df["High"], df["Low"], df["Close"], timeperiod=181)
            upper1, middle,lower1 = talib.BBANDS(df["Close"], timeperiod=181, nbdevup=3, nbdevdn=3, matype=0)
            df['BBANDS'] = (upper1 - df['Close']).apply(np.log)
            df['CDL2CROWS'] = talib.CDL2CROWS(df["Open"], df["High"], df["Low"], df["Close"])

            categorical_features = ['CDL2CROWS']
            
            delete_columns = ['Asset_ID','High','Low','Close','Open','VWAP','Volume','Count']
            #df.drop(delete_columns, axis=1, inplace=True)
            for colum in delete_columns:
                del df[colum]; gc.collect();
                
            df.reset_index(drop=True, inplace=True)
        else:
            
            cols = []
            for i in range(13):
                if not i == Asset_ID:
                    cols.append(str(i)+'VWAP_shift105')
                    cols.append(str(i)+'MOM')
            cols.extend(['VWAP_shift15','VWAP_shift30','VWAP_shift45','VWAP_shift90','VWAP_shift120','VWAP_shift180','VWAP_shift210','VWAP_shift240','VWAP_shift310','VWAP_shift350','VWAP_shift400','VWAP_shift450','VWAP_shift550','VWAP_shift600','VWAP_shift650','VWAP_shift750','VWAP_shift800','VWAP_shift1000','VWAP_shift1500','VWAP_shift2000','VWAP_shift2500','VWAP_shift3000','VWAP_shift3500','VWAP_shift4000','VWAP_shift4500','VWAP_shift5000','VWAP_shift5500','VWAP_shift6000','VWAP_shift7000','VWAP_shift8000','VWAP_shift9000','VWAP_shift10000','VWAP_shift11000','VWAP_shift12000','VWAP_shift14000','VWAP_shift16000','VWAP_shift18000','VWAP_shift25000','Volume_shift_sma2000','Close_shift70','Close_shift280','SMA_18000_std','ROCP','MOM','MOM_1700','RSI','EMA','APO','CMO','macdhist0','macdhist1','macdhist5','macdhist15','ADX','AD','LINEARREG_ANGLE','HT_DCPERIOD','NATR','AROONOSC','aroondown','aroonup','PLUS_DI','BBANDS','CDL2CROWS'])
            ans = pd.DataFrame(index=[0], columns=cols)
            
            start = time.time()
            for i in range(13):
                if not i == Asset_ID:
                    as_df = df_list[i].copy()
                    ans[str(i)+'VWAP_shift105'] = as_df.iloc[-106,7] / as_df.iloc[-1,7]
                    ans[str(i)+'MOM'] = talib.MOM(as_df.iloc[-177:,5], timeperiod=176).iloc[-1]
            print ("1.0elapsed_time:{0}".format(time.time() - start) + "[sec]")

            start = time.time()
            ans['VWAP_shift15'] = df.iloc[-16,7] / df.iloc[-1,7]
            ans['VWAP_shift30'] = df.iloc[-31,7] / df.iloc[-1,7]
            ans['VWAP_shift45'] = df.iloc[-46,7] / df.iloc[-1,7]
            ans['VWAP_shift90'] = df.iloc[-91,7] / df.iloc[-1,7]
            ans['VWAP_shift120'] = df.iloc[-121,7] / df.iloc[-1,7]
            ans['VWAP_shift180'] = df.iloc[-181,7] / df.iloc[-1,7]
            ans['VWAP_shift210'] = df.iloc[-211,7] / df.iloc[-1,7]
            ans['VWAP_shift240'] = df.iloc[-241,7] / df.iloc[-1,7]
            ans['VWAP_shift310'] = df.iloc[-311,7] / df.iloc[-1,7]
            ans['VWAP_shift350'] = df.iloc[-351,7] / df.iloc[-1,7]
            ans['VWAP_shift400'] = df.iloc[-401,7] / df.iloc[-1,7]
            ans['VWAP_shift450'] = df.iloc[-451,7] / df.iloc[-1,7]
            ans['VWAP_shift550'] = df.iloc[-551,7] / df.iloc[-1,7]
            ans['VWAP_shift600'] = df.iloc[-601,7] / df.iloc[-1,7]
            ans['VWAP_shift650'] = df.iloc[-651,7] / df.iloc[-1,7]
            ans['VWAP_shift750'] = df.iloc[-751,7] / df.iloc[-1,7]
            ans['VWAP_shift800'] = df.iloc[-801,7] / df.iloc[-1,7]
            ans['VWAP_shift1000'] = df.iloc[-1001,7] / df.iloc[-1,7]
            ans['VWAP_shift1500'] = df.iloc[-1501,7] / df.iloc[-1,7]
            ans['VWAP_shift2000'] = df.iloc[-2001,7] / df.iloc[-1,7]
            ans['VWAP_shift2500'] = df.iloc[-2501,7] / df.iloc[-1,7]
            ans['VWAP_shift3000'] = df.iloc[-3001,7] / df.iloc[-1,7]
            ans['VWAP_shift3500'] = df.iloc[-3501,7] / df.iloc[-1,7]
            ans['VWAP_shift4000'] = df.iloc[-4001,7] / df.iloc[-1,7]
            ans['VWAP_shift4500'] = df.iloc[-4501,7] / df.iloc[-1,7]
            ans['VWAP_shift5000'] = df.iloc[-5001,7] / df.iloc[-1,7]
            ans['VWAP_shift5500'] = df.iloc[-5501,7] / df.iloc[-1,7]
            ans['VWAP_shift6000'] = df.iloc[-6001,7] / df.iloc[-1,7]
            ans['VWAP_shift7000'] = df.iloc[-7001,7] / df.iloc[-1,7]
            ans['VWAP_shift8000'] = df.iloc[-8001,7] / df.iloc[-1,7]
            ans['VWAP_shift9000'] = df.iloc[-9001,7] / df.iloc[-1,7]
            ans['VWAP_shift10000'] = df.iloc[-10001,7] / df.iloc[-1,7]
            ans['VWAP_shift11000'] = df.iloc[-11001,7] / df.iloc[-1,7]
            ans['VWAP_shift12000'] = df.iloc[-12001,7] / df.iloc[-1,7]
            ans['VWAP_shift14000'] = df.iloc[-14001,7] / df.iloc[-1,7]
            ans['VWAP_shift16000'] = df.iloc[-16001,7] / df.iloc[-1,7]
            ans['VWAP_shift18000'] = df.iloc[-18001,7] / df.iloc[-1,7]
            ans['VWAP_shift25000'] = df.iloc[-25001,7] / df.iloc[-1,7]
            print ("1.1elapsed_time:{0}".format(time.time() - start) + "[sec]")
            start = time.time()
            ans['Volume_shift_sma2000'] = (df['Volume'].rolling(2000).sum() / df["Volume"]).apply(np.log).iloc[-1]
            ans['Close_shift70'] = df.iloc[-71,5] / df.iloc[-1,5]
            ans['Close_shift280'] = df.iloc[-281,5] / df.iloc[-1,5]
            ans['SMA_18000_std'] = df['Close'].rolling(18000).std().shift() / df.iloc[:,5]
            ans['ROCP'] = talib.ROCP(df.iloc[-182:,5], timeperiod=181).iloc[-1]
            ans['MOM'] = talib.MOM(df.iloc[-177:,5], timeperiod=176).iloc[-1]
            ans['MOM_1700'] = talib.MOM(df.iloc[-1761:,5], timeperiod=1760).iloc[-1]
            ans['RSI'] = talib.RSI(df.iloc[-207:,5], timeperiod=206).iloc[-1]
            ans['EMA'] = (talib.EMA(df.iloc[-12:,5], timeperiod=11).iloc[-1] - df.iloc[-1,5]) / df.iloc[-1,5]
            ans['APO'] = talib.APO(df.iloc[-167:,5], fastperiod=117, slowperiod=166, matype=0).iloc[-1]
            ans['CMO'] = talib.CMO(df.iloc[-205:,5], timeperiod=204).iloc[-1]
            macd, macdsignal, macdhist = talib.MACD(df.iloc[-610:,5], fastperiod=324, slowperiod=296, signalperiod=272)
            ans['macdhist0'] = macdhist.iloc[-1]
            ans['macdhist1'] = macdhist.iloc[-2]
            ans['macdhist5'] = macdhist.iloc[-6]
            ans['macdhist15'] = macdhist.iloc[-16]
            ans['ADX'] = talib.ADX(df.iloc[-362:,3], df.iloc[-362:,4], df.iloc[-362:,5], timeperiod=181).iloc[-1]
            ans['AD'] = talib.AD(df.iloc[-2:,3], df.iloc[-2:,4], df.iloc[-2:,5], df.iloc[-2:,6]).iloc[-1]
            ans['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df.iloc[-182:,5], timeperiod=181).iloc[-1]
            ans['HT_DCPERIOD'] = talib.HT_DCPERIOD(df.iloc[-1000:,5]).iloc[-1]
            ans['NATR'] = talib.NATR(df.iloc[-182:,3], df.iloc[-182:,4], df.iloc[-182:,5], timeperiod=181).iloc[-1]
            ans['AROONOSC'] = talib.AROONOSC(df.iloc[-182:,3], df.iloc[-182:,4], timeperiod=181).iloc[-1]
            aroondown, aroonup = talib.AROON(df.iloc[-182:,3], df.iloc[-182:,4], timeperiod=181)
            ans['aroondown'] = aroondown.iloc[-1]
            ans['aroonup'] = aroonup.iloc[-1]
            ans['PLUS_DI'] = talib.PLUS_DI(df.iloc[-182:,3], df.iloc[-182:,4], df.iloc[-182:,5], timeperiod=181).iloc[-1]
            upper1, middle,lower1 = talib.BBANDS(df.iloc[-182:,5], timeperiod=181, nbdevup=3, nbdevdn=3, matype=0)
            ans['BBANDS'] = (upper1.iloc[-2:] - df.iloc[-2:,5]).apply(np.log).iloc[-1]

            ans['CDL2CROWS'] = talib.CDL2CROWS(df.iloc[-1000:,2], df.iloc[-1000:,3], df.iloc[-1000:,4], df.iloc[-1000:,5]).iloc[-1]
            categorical_features = ['CDL2CROWS']
            
            #print(ans.info())################################
            
            df = ans.copy()
            print ("1.2elapsed_time:{0}".format(time.time() - start) + "[sec]")
            
        start = time.time()
        
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.fillna(method='ffill', inplace=True)
        df.fillna(0, inplace=True)
        
        dfX = df.copy()
        dfX.drop(categorical_features, axis=1, inplace=True)
        columns = dfX.columns.tolist()
        X = dfX.values
        if not test:
            del dfX
            gc.collect()
        """正規化 sklearn
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)"""
        
        print ("1.3elapsed_time:{0}".format(time.time() - start) + "[sec]")
        start = time.time()
        
        mean_n = [[ 9.99885373e-01,  3.09758273e-02,  1.00018938e+00, -1.61865578e-01,
                   9.21656800e-01,  1.65773236e-04,  6.56436859e-01,  1.86611370e-05,
                   1.00013456e+00, -2.89830662e-04,  9.99977581e-01,  2.04807467e-01,
                   1.00009365e+00,  2.15764428e-03,  9.05655877e-01, -9.88395078e-05,
                   1.00009076e+00, -5.51462303e-03,  9.10990142e-01, -1.16363673e+01,
                   1.00010193e+00, -1.56828965e-02,  9.65466065e-01, -6.98199983e-05,
                   9.99993611e-01,  9.99986900e-01,  9.99979881e-01,  9.99957910e-01,
                   9.99942407e-01,  9.99909481e-01,  9.99893581e-01,  9.99877334e-01,
                   9.99838662e-01,  9.99816700e-01,  9.99788575e-01,  9.99761023e-01,
                   9.99704426e-01,  9.99676509e-01,  9.99649796e-01,  9.99595992e-01,
                   9.99569505e-01,  9.99468803e-01,  9.99220890e-01,  9.98908461e-01,
                   9.98604001e-01,  9.98327831e-01,  9.98046306e-01,  9.97759143e-01,
                   9.97491716e-01,  9.97218832e-01,  9.96945641e-01,  9.96689312e-01,
                   9.96208285e-01,  9.95795422e-01,  9.95385685e-01,  9.95037603e-01,
                   9.94681685e-01,  9.94254386e-01,  9.93227183e-01,  9.92229041e-01,
                   9.91203069e-01,  9.87183158e-01,  7.96081342e+00,  9.99967919e-01,
                   9.99854823e-01,  4.95978312e-02,  2.16599492e-04,  2.68893869e+00,
                   2.84985841e+01,  5.01774243e+01,  5.11677265e-07,  3.75372870e-01,
                   3.66093910e-01, -1.06804469e-03, -1.06421319e-03, -1.04921838e-03,
                  -1.01535536e-03,  1.73946113e+01, -6.55988342e+06,  8.84185123e-01,
                   2.13108282e+01,  7.92234250e-01,  2.13442151e+00,  4.46625790e+01,
                   4.67970006e+01,  7.52846011e+00,  4.47998655e+00]]
            
            
        std_n = [[1.83716857e-02, 3.89781053e+00, 1.95957894e-02, 1.74106538e+01,
                  2.69214360e-01, 1.68087136e-02, 4.75186737e-01, 4.54993587e-03,
                  1.95644498e-02, 1.67338689e-01, 1.63526168e-02, 2.37303288e+01,
                  2.08237552e-02, 8.69332369e-01, 2.93120932e-01, 2.00583700e-02,
                  1.75618881e-02, 2.86042554e+00, 3.37676176e-01, 8.20031691e+01,
                  1.87165419e-02, 4.14204761e+00, 1.84251487e-01, 6.56426652e-03,
                  5.26436091e-03, 7.40879647e-03, 9.02836838e-03, 1.26595499e-02,
                  1.45492033e-02, 1.76408183e-02, 1.90177510e-02, 2.02852605e-02,
                  2.29294077e-02, 2.43220605e-02, 2.59342450e-02, 2.74509028e-02,
                  3.02354843e-02, 3.15529478e-02, 3.28535234e-02, 3.53239890e-02,
                  3.65483469e-02, 4.12821836e-02, 5.13174474e-02, 5.86539714e-02,
                  6.52650303e-02, 7.14069839e-02, 7.69020650e-02, 8.20182956e-02,
                  8.69482487e-02, 9.15959641e-02, 9.61233061e-02, 1.00670713e-01,
                  1.09340057e-01, 1.17680116e-01, 1.25375445e-01, 1.33253465e-01,
                  1.40662143e-01, 1.47272021e-01, 1.58726547e-01, 1.69942399e-01,
                  1.79984757e-01, 2.12667894e-01, 8.42835655e-01, 1.12048392e-02,
                  2.18179046e-02, 3.50607903e-02, 1.47249468e-02, 3.38298633e+02,
                  1.02971586e+03, 3.29244781e+00, 1.76477768e-03, 5.60760509e+01,
                  6.53106150e+00, 6.53123593e+00, 6.53123373e+00, 6.53122532e+00,
                  6.53120810e+00, 1.57331750e+01, 4.61154350e+06, 3.59955260e+01,
                  5.35233709e+00, 1.04905727e+00, 5.59595607e+01, 3.31159638e+01,
                  3.37248055e+01, 4.47656604e+00, 1.38126899e+00]]
        
        #mean_n = X.mean(axis = 0, keepdims = True)
        #標準偏差を計算 ddof=0なら標準偏差、ddof=1なら不偏標準偏差
        #std_n = X.std(axis = 0, keepdims = True, ddof = 0)
        #標準化の計算
        X = (X - mean_n) / std_n
        
        df = pd.concat([pd.DataFrame(X, columns=columns), df[categorical_features]], axis=1)

        if save_fet:
            df.to_csv(save_name+'.csv', index=False)
            
        if save_mem:
            df = reduce_mem_usage(df)
        
        if not test:
            del df_list, X
            gc.collect()
        print ("1.4elapsed_time:{0}".format(time.time() - start) + "[sec]")

        return df
    
    def normalize(self, df):
        return (df - df.mean()) / df.std(ddof=0)

# Util

In [5]:
from sklearn.metrics import accuracy_score
import numpy as np

class Util():

    def __init__(self) -> None:
        pass

    def data_conv(self, data):
        #data = (data > 0.5).astype(int)
        return data


    def accuracy_score(self, train, predict):
        """
        どのくらい答えに近いか評価するスコアを出す
        あっているほど数値が高いようにする
        コンペによって評価方法が違うからこれを変える
        """
        #ピアソンの相関係数
        return np.corrcoef(train,predict)[0,1]

# モデル

In [6]:
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from catboost import Pool
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import pickle

from memory_profiler import profile
import gc


ut = Util()

class Models:
    def __init__(self,ID) -> None:
        fileID = ID
        self.models_xgboost = []
        for fold_id in range(4):
            try:
                with open('xgboost'+str(fileID)+'0'+str(fold_id)+'.pickle', 'rb') as web:
                    self.models_xgboost.append(pickle.load(web))
                #print('load')
            except FileNotFoundError:
                #print('Not find '+str(fileID)+'0'+str(fold_id))
                pass
    
    def __del__(self):
        """オブジェクトが破棄されるとき呼び出される"""
        print('Models died:', id(self))

    def select_model(self, categorical_features, model_name, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None):
        if model_name == "random_forest":
            score, y_val_pre, y_pred = self.random_forest(learn_type, fileID=fileID, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test)
        elif model_name == "light_gbm":
            score, y_val_pre, y_pred = self.light_gbm(categorical_features, learn_type, fileID=fileID, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test)
        elif model_name == 'xgboost':
            score, y_val_pre, y_pred = self.xgboost(learn_type, fileID=fileID, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test)
        elif model_name == "catboost":
            score, y_val_pre, y_pred = self.catboost(categorical_features, learn_type, fileID=fileID, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test)
        elif model_name == "logistic_regression":
            score, y_val_pre, y_pred = self.logistic_regression(learn_type, fileID=fileID, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test)
        elif model_name == "dnn":
            score, y_val_pre, y_pred = self.dnn(learn_type, fileID=fileID, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, X_test=X_test)
        else:
            raise NameError("指定されたアルゴリズムは存在しません")
            

        return score, y_val_pre, y_pred
    
    def KFold(self, categorical_features, model_name, learn_type, fileID=0, X_train=None, y_train=None, X_test=None, n_splits=4):

        cv_score, oof_pre, y_sub = None, None, None
        scores = []
        oof_pre = np.array([])
        valid_indexs = np.array([])
        y_preds = []
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=0)
        if learn_type=='learn':
            for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
                X_tr = X_train.loc[train_index, :]
                X_val = X_train.loc[valid_index, :]
                y_tr = y_train[train_index]
                y_val = y_train[valid_index]

                score, y_val_pre, _ = self.select_model(categorical_features, model_name, 'learn', fileID=str(fileID)+str(fold_id),X_train=X_tr, y_train=y_tr, X_valid=X_val, y_valid=y_val)

                scores.append(score)
                oof_pre = np.append(oof_pre,y_val_pre)
                valid_indexs = np.append(valid_indexs, valid_index)

            oof_pre = oof_pre[np.argsort(valid_indexs)]
            cv_score = sum(scores) / len(scores)
        elif learn_type=='predict':
            for fold_id in range(n_splits):
                score, y_val_pre, y_pred = self.select_model(categorical_features, model_name, 'predict', fileID=str(fileID)+str(fold_id), X_test=X_test)
                oof_pre = np.append(oof_pre,y_val_pre)
                y_preds.append(y_pred)
            
            y_sub = sum(y_preds) / len(y_preds)

        return cv_score, oof_pre, y_sub

    def random_forest(self, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None, n_estimators=67, max_depth=6, random_state=0):
        """
        pandasでの教師データ
        パラメータ
        return valスコア(float)、その取り出し方での予測値
        """
        print('========random_forest========')
        score, y_val_pre, y_pred = None, None, None
        if learn_type=='predict':
            with open('RandomForest'+str(fileID)+'.pickle', 'rb') as web:
                RandomForest = pickle.load(web)
            y_pred = RandomForest.predict(X_test)
        elif learn_type=='learn':
            #RandomForest = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
            RandomForest = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
            RandomForest.fit(X_train, y_train)
            y_val_pre = RandomForest.predict(X_valid)
            score = ut.accuracy_score(y_valid, y_val_pre)
            with open('RandomForest'+str(fileID)+'.pickle', 'wb') as web:
                pickle.dump(RandomForest , web)

        return score, y_val_pre, y_pred

    def light_gbm(self, categorical_features, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None, params = {'objective': 'binary','max_bin': 284,'learning_rate': 0.068,'num_leaves': 45}):
        """
        pandasでの教師データ
        categorical_features:カテゴリかる属性のカラム名を示したリスト
        パラメータ
        return valスコア(float), y_val_pre(valでの予測値), その取り出し方での予測値
        """
        print('========light_gbm========')
        score, y_val_pre, y_pred = None, None, None
        if learn_type=='predict':
            with open('light_gbm'+str(fileID)+'.pickle', 'rb') as web:
                model = pickle.load(web)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)
            y_pred = ut.data_conv(y_pred)
        elif learn_type=='learn':
            lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
            lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)
            model = lgb.train(params, lgb_train,valid_sets=[lgb_train, lgb_eval],verbose_eval=10,num_boost_round=1000,early_stopping_rounds=10)

            y_val_pre = model.predict(X_valid, num_iteration=model.best_iteration)
            y_val_pre = ut.data_conv(y_val_pre)
            score = ut.accuracy_score(y_valid, y_val_pre)
            with open('light_gbm'+str(fileID)+'.pickle', 'wb') as web:
                pickle.dump(model , web)

        return score, y_val_pre, y_pred
    
    def xgboost(self, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None, params = {'tree_method': 'gpu_hist', 'objective': 'reg:squarederror','silent':1, 'random_state':0,'learning_rate': 0.15, 'eval_metric': 'rmse',}, num_round = 450):
        """
        pandasでの教師データ
        categorical_features:カテゴリかる属性のカラム名を示したリスト
        パラメータ
        return valスコア(float), y_val_pre(valでの予測値), その取り出し方での予測値
        """
        score, y_val_pre, y_pred = None, None, None
        if learn_type=='predict':
            test = xgb.DMatrix(X_test)
            model = self.models_xgboost[int(str(fileID)[-1])]
            y_pred = model.predict(test)
        elif learn_type=='learn':
            print('========xgboost========')
            train = xgb.DMatrix(X_train, label=y_train)
            valid = xgb.DMatrix(X_valid, label=y_valid)
            self.model_xgboost = xgb.train(params,
                    train,#訓練データ
                    num_round,#設定した学習回数
                    early_stopping_rounds=20,
                    evals=[(train, 'train'), (valid, 'eval')],
                    verbose_eval=100
                    )
            y_val_pre = self.model_xgboost.predict(valid)
            score = ut.accuracy_score(y_valid, y_val_pre)
            with open('xgboost'+str(fileID)+'.pickle', 'wb') as web:
                pickle.dump(self.model_xgboost, web)
                
        
        
#         _, ax = plt.subplots(figsize=(12, 15))
#         xgb.plot_importance(model,
#                     ax=ax,
#                     importance_type='gain',
#                     show_values=False)
#         plt.show()
        
        
        return score, y_val_pre, y_pred

    def catboost(self, categorical_features, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None, params ={'depth' : 3,'learning_rate' : 0.054,'early_stopping_rounds' : 9,'iterations' : 474, 'loss_function' : 'RMSE', 'random_seed' :0}):
        """
        pandasでの教師データ
        categorical_features:カテゴリかる属性のカラム名を示したリスト
        パラメータ
        return valスコア(float), y_val_pre(valでの予測値), その取り出し方での予測値
        """
        print('========catboost========')
        score, y_val_pre, y_pred = None, None, None
        if learn_type=='predict':
            with open('catboost'+str(fileID)+'.pickle', 'rb') as web:
                model = pickle.load(web)
            y_pred = model.predict(X_test)
            y_pred = ut.data_conv(y_pred)
        elif learn_type=='learn':
            train = Pool(X_train, y_train, cat_features=categorical_features)
            eval = Pool(X_valid, y_valid, cat_features=categorical_features)
            #cab = CatBoostClassifier(custom_loss=['Accuracy'],random_seed=0)
            #cab = CatBoostClassifier(**params)
            cab = CatBoostRegressor(random_seed=0)
            cab = CatBoostRegressor(**params)
            model = cab.fit(train, eval_set=eval)

            y_val_pre = model.predict(X_valid)
            y_val_pre = ut.data_conv(y_val_pre)
            score = ut.accuracy_score(y_valid, y_val_pre)
            with open('catboost'+str(fileID)+'.pickle', 'wb') as web:
                pickle.dump(model , web)

        return score, y_val_pre, y_pred

    def logistic_regression(self, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None):
        """
        pandasでの教師データ
        パラメータ
        return valスコア(float)、その取り出し方での予測値
        """
        print('========logistic_regression========')
        score, y_val_pre, y_pred = None, None, None
        if learn_type=='predict':
            with open('logistic_regression'+str(fileID)+'.pickle', 'rb') as web:
                model = pickle.load(web)
            y_pred = model.predict(X_test)
            y_pred = ut.data_conv(y_pred)
        elif learn_type=='learn':
            #model = LogisticRegression(penalty='l2', solver='sag', random_state=0)
            model = ElasticNet(random_state=0)
            model.fit(X_train, y_train)
            y_val_pre = model.predict(X_valid)
            y_val_pre = ut.data_conv(y_val_pre)
            score = ut.accuracy_score(y_valid, y_val_pre)
            with open('logistic_regression'+str(fileID)+'.pickle', 'wb') as web:
                pickle.dump(model , web)

        return score, y_val_pre, y_pred

    def dnn(self, learn_type, fileID=0, X_train=None, y_train=None, X_valid=None, y_valid=None, X_test=None):
        
        print('========dnn========')
        score, y_val_pre, y_pred = None, None, None

        lr_schedule=tf.keras.optimizers.schedules.ExponentialDecay( \
                    initial_learning_rate=0.001, #初期の学習率
                    decay_steps=3, #減衰ステップ数
                    decay_rate=0.01, #最終的な減衰率 
                    staircase=True)

        model=Sequential()
        model.add(Dense(len(X_train.columns),input_shape=(len(X_train.columns),),activation='relu',
                    kernel_regularizer=keras.regularizers.l2(0.001), #重みの正則化考慮
                    kernel_initializer='random_uniform',
                    bias_initializer='zero'))
                    
        model.add(BatchNormalization()) #バッチ正規化
        model.add(Dropout(0.1)) # ドロップアウト層・ドロップアウトさせる割合
        model.add(Dense(int(len(pd.DataFrame(X_train).columns)/2),activation='sigmoid'))

        model.add(BatchNormalization()) #バッチ正規化
        model.add(Dropout(0.1)) # ドロップアウト層・ドロップアウトさせる割合
        model.add(Dense(int(len(pd.DataFrame(X_train).columns)/2),activation='sigmoid'))

        model.add(BatchNormalization()) #バッチ正規化
        model.add(Dropout(0.1)) # ドロップアウト層・ドロップアウトさせる割合
        model.add(Dense(len(pd.DataFrame(y_train).columns),activation='sigmoid'))
        Ecall=EarlyStopping(monitor='val_loss',patience=1000,restore_best_weights=False)
        model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr_schedule))
        model.summary()

        if learn_type=='predict':
            model.load_weights('dnn'+str(fileID)+'.h5')
            y_pred = model.predict(X_test)
            y_pred = ut.data_conv(y_pred)
        elif learn_type=='learn':
            res=model.fit(X_train.values,y_train.values,epochs=3,callbacks=[Ecall],verbose=1,validation_data=(X_valid.values,y_valid.values))
            y_val_pre = model.predict(X_valid)[:,0]
            y_val_pre = ut.data_conv(y_val_pre)
            print(y_valid.shape)
            print(y_val_pre.shape)
            score = ut.accuracy_score(y_valid, y_val_pre)
            model.save_weights('dnn'+str(fileID)+'.h5')

        return score, y_val_pre, y_pred

# パラメータオプティマイザー

In [7]:
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import log_loss

#md = Models(0)

#random_forest : {'n_estimators': 67, 'max_depth': 6}
#light_gbm : {'max_bin': 284, 'learning_rate': 0.06759289191947715, 'num_leaves': 45}
#xgboost : {'learning_rate': 0.180343853211702, 'num_round': 394}
#catboost : {'depth': 3, 'learning_rate': 0.053925065258405916, 'early_stopping_rounds': 9, 'iterations': 474}
class Optimizer():
    def __init__(self) -> None:
        pass

    def param_opt(self, model_name, X_train, y_train, categorical_features=None):
        """
        パラメータオプティマイザー
        model name list: random_forest, light_gbm
        """
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3,random_state=0, stratify=y_train)
        study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
        if model_name == "random_forest":
            study.optimize(self.objective_random_forest(X_train, y_train, X_valid, y_valid), n_trials=80)
            return study.best_params

        elif model_name == "light_gbm":
            study.optimize(self.objective_light_gbm(X_train, y_train, X_valid, y_valid, categorical_features), n_trials=80)
            return study.best_params

        elif model_name == "xgboost":
            study.optimize(self.objective_xgboost(X_train, y_train, X_valid, y_valid), n_trials=80)
            return study.best_params

        elif model_name == "catboost":
            study.optimize(self.objective_catboost(X_train, y_train, X_valid, y_valid, categorical_features), n_trials=80)
            return study.best_params
            
        elif model_name == 'logistic_regression':
            raise NameError('logistic_regressionはパラメータが存在しないのでサポートしていません')

    def objective_random_forest(self, X_train, y_train, X_valid, y_valid):
        def objective(trial):
            n_estimators = trial.suggest_int('n_estimators', 10, 300)
            max_depth = trial.suggest_int('max_depth', 1, 15)
            _, y_val_pre, _ = md.random_forest('learn', X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, n_estimators=n_estimators, max_depth=max_depth, random_state=0)
            
            score = log_loss(y_valid, y_val_pre)

            return score
        return objective

    def objective_light_gbm(self, X_train, y_train, X_valid, y_valid, categorical_features):
        def objective(trial):
            params = {
            'objective': 'binary',
            'max_bin': trial.suggest_int('max_bin', 255, 500),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
            'num_leaves': trial.suggest_int('num_leaves', 32, 128),
            }
            _, y_val_pre, _ = md.light_gbm(categorical_features, 'learn', X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, params=params)
            
            score = log_loss(y_valid, y_val_pre)

            return score
        return objective

    def objective_xgboost(self, X_train, y_train, X_valid, y_valid):
        def objective(trial):
            params = {'objective': 'reg:squarederror',
                    'silent':1, 
                    'random_state':0,
                    'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.2), 
                    'eval_metric': 'rmse',
            }
            num_round = trial.suggest_int('num_round', 100, 900)
            _, y_val_pre, _ = md.xgboost('learn', X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, params=params, num_round=num_round)
            
            score = log_loss(y_valid, y_val_pre)

            return score
        return objective

    def objective_catboost(self, X_train, y_train, X_valid, y_valid, categorical_features):
        def objective(trial):
            params = {
                'depth' : trial.suggest_int('depth', 1, 15),                  # 木の深さ
                'learning_rate' : trial.suggest_uniform('learning_rate', 0.01, 0.1),       # 学習率
                'early_stopping_rounds' : trial.suggest_int('early_stopping_rounds', 3, 20),
                'iterations' : trial.suggest_int('iterations', 50, 500), 
                'custom_loss' :['Accuracy'], 
                'random_seed' :0
            }
            _, y_val_pre, _ = md.catboost(categorical_features, 'learn', X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, params=params)
            
            score = log_loss(y_valid, y_val_pre)

            return score
        return objective

    #LogisticRegressionはパラメータがない

# アンサンブル

In [8]:
from sklearn import ensemble
import numpy as np
import pandas as pd

#md = Models()
ut = Util()

class Ensemble():

    def __init__(self) -> None:
        pass

    def stacking(self, categorical_features, learn_type, fileID=0, X_train=None, y_train=None, X_test=None, fst_lay=['random_forest', 'light_gbm', 'xgboost', 'catboost'], snd_lay='light_gbm', enable_2ndorigx=True):
        #enable_2ndorigx:二層目にオリジナルの入力データを入力するか

        stack_oof_pred = []
        stack_pred = []
        for index, model_name in enumerate(fst_lay):
            
            if learn_type=='learn':
                cv_score, oof_pre, y_sub = md.KFold(categorical_features, model_name, learn_type, fileID=fileID+'0', X_train=X_train, y_train=y_train)
                stack_oof_pred = oof_pre if index == 0 else np.c_[stack_oof_pred, oof_pre]
            elif learn_type=='predict':
                cv_score, oof_pre, y_sub = md.KFold(categorical_features, model_name, learn_type, fileID=fileID+'0', X_test=X_test)
                stack_pred = y_sub if index == 0 else np.c_[stack_pred, y_sub]
            else:
                raise NameError("指定されたlearn_typeは存在しません")

        if enable_2ndorigx:
            X_train2 =  pd.concat([pd.DataFrame(stack_oof_pred), X_train], axis=1)
            X_test2 =  pd.concat([pd.DataFrame(stack_pred), X_test], axis=1)
        else:
            X_train2 = pd.DataFrame(stack_oof_pred)
            X_test2 = pd.DataFrame(stack_pred)
            categorical_features = []

        #二層目
        if learn_type=='learn':
            cv_score, oof_pre, y_sub = md.KFold(categorical_features, snd_lay, learn_type, fileID=fileID+'1', X_train=X_train2, y_train=y_train)
        elif learn_type=='predict':
            cv_score, oof_pre, y_sub = md.KFold(categorical_features, snd_lay, learn_type, fileID=fileID+'1', X_test=X_test2)
            y_sub = ut.data_conv(y_sub)

        return cv_score, oof_pre, y_sub

    def mean(self, categorical_features, learn_type, fileID=0, X_train=None, y_train=None, X_test=None, models=['random_forest', 'light_gbm', 'xgboost', 'catboost'], type='mean'):
        '''
        type:平均の取り方 
        mean -> 算術平均
        hmean -> 調和平均
        gmean -> 幾何平均
        '''

        stack_oof_pred = []
        stack_pred = []
        for index, model_name in enumerate(models):
            if learn_type=='learn':
                cv_score, oof_pre, y_sub = md.KFold(categorical_features, model_name, learn_type, fileID=fileID+'0', X_train=X_train, y_train=y_train)
                stack_oof_pred = oof_pre if index == 0 else np.c_[stack_oof_pred, oof_pre]
            elif learn_type=='predict':
                cv_score, oof_pre, y_sub = md.KFold(categorical_features, model_name, learn_type, fileID=fileID+'0', X_test=X_test)
                stack_pred = y_sub if index == 0 else np.c_[stack_pred, y_sub]
            else:
                raise NameError("指定されたlearn_typeは存在しません")

        if type == 'mean':
            y_off = np.average(stack_oof_pred, axis=1)
            y_sub = np.average(stack_pred, axis=1)
        elif type == 'hmean':
            from scipy.stats import hmean
            y_off = hmean(stack_oof_pred, axis = 1)
            y_sub = hmean(stack_pred, axis = 1)
        elif type == 'gmean':
            from scipy.stats.mstats import gmean
            y_off = gmean(stack_oof_pred, axis = 1)
            y_sub = gmean(stack_pred, axis = 1)
        
        y_off = ut.data_conv(y_off)
        y_sub = ut.data_conv(y_sub)

        cv_score = ut.accuracy_score(y_train, y_off)
        
        return cv_score, oof_pre, y_sub

# コントローラー

In [9]:
import numpy as np
import pandas as pd

op = Optimizer()
ens = Ensemble()

class Controller():

    def __init__(self,ID) -> None:
        self.md = Models(ID)
        self.ID = str(ID)

    def opt(self, X_train, y_train):
        print(op.param_opt('light_gbm', X_train, y_train))

    def KFold_learn(self, categorical_features, X_train, y_train, model_name):
        cv_score, y_val_pre, y_sub = self.md.KFold(categorical_features, model_name, 'learn', fileID=self.ID+'0', X_train=X_train, y_train=y_train)

        print('CV score-----------------------------------',cv_score)
        #random_forest 0.822635113928818
        #light_gbm 0.8293829640323895
        #catboost 0.8204004770573097
        #logistic_regression 0.6846023476241291
        #xgboost 0.8192643274119641
        #dnn 0.8159060950348378s
        
        return cv_score

    def KFold_predict(self, categorical_features, X_test, model_name):
        cv_score, y_val_pre, y_sub = self.md.KFold(categorical_features, model_name, 'predict', fileID=self.ID+'0', X_test=X_test)
        
        return y_sub

    def stacking_learn(self, categorical_features, X_train, y_train, fst_lay=['random_forest', 'light_gbm', 'xgboost', 'catboost'], snd_lay='light_gbm', enable_2ndorigx=False):
        cv_score, _, y_sub = ens.stacking(categorical_features, 'learn', fileID=self.ID, X_train=X_train, y_train=y_train, fst_lay=fst_lay, snd_lay=snd_lay, enable_2ndorigx=enable_2ndorigx)

        print('CV score-----------------------------------',cv_score)
        #0.8293829640323895 2ndlgtm
        #0.8237712635741635 2ndrandomforest

    def stacking_predict(self, categorical_features, X_test, fst_lay=['random_forest', 'light_gbm', 'xgboost', 'catboost'], snd_lay='light_gbm', enable_2ndorigx=False):
        _, _, y_sub = ens.stacking(categorical_features, 'predict', fileID=self.ID, X_test=X_test, fst_lay=fst_lay, snd_lay=snd_lay, enable_2ndorigx=enable_2ndorigx)

        sub = pd.read_csv('input/titanic/gender_submission.csv')
        sub['Survived'] = y_sub
        sub.to_csv('submission.csv', index=False)
        #0.8293829640323895

    def mean_learn(self, categorical_features, learn_type, X_train, y_train, models=['random_forest', 'light_gbm', 'xgboost', 'catboost'], type='mean'):
        cv_score, _, y_sub = ens.mean(categorical_features, learn_type, fileID=self.ID, X_train=X_train, y_train=y_train, models=models, type=type)
        #mean:0.8237934904601572 hmean:0.819304152637486 gmean:0.819304152637486
        print('CV score-----------------------------------',cv_score)

    def mean_predict(self, categorical_features, learn_type, X_test, models=['random_forest', 'light_gbm', 'xgboost', 'catboost'], type='mean'):
        cv_score, _, y_sub = ens.mean(categorical_features, learn_type, fileID=self.ID, X_test=X_test, models=models, type=type)

        sub = pd.read_csv('input/titanic/gender_submission.csv')
        sub['Survived'] = y_sub
        sub.to_csv('submission.csv', index=False)

# Main

教師データ読み込み

In [10]:
import numpy as np
import pandas as pd
import random
import os

def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything

data_folder = "../input/g-research-crypto-forecasting/"
#crypto_df = reduce_mem_usage(pd.read_csv(data_folder + 'train.csv'))
crypto_df = pd.read_csv(data_folder + 'train.csv')

train_list = list(range(13))
for Asset_ID in range(13):#通貨別にデータを作りそれを通貨別でリストにDFを格納
    #train = reduce_mem_usage(crypto_df[crypto_df["Asset_ID"]==Asset_ID].set_index("timestamp"))
    train = crypto_df[crypto_df["Asset_ID"]==Asset_ID].set_index("timestamp")
    train_list[Asset_ID] = train
del crypto_df
del train
gc.collect()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

## 学習

In [None]:
scores = []
fe = Feature()
for Asset_ID in range(13):#通貨別にデータを作りそれを通貨別でリストにDFを格納
    
    print('通貨番号',Asset_ID)
    
    train_raw = train_list[Asset_ID].copy()
    y_train = train_raw['Target'].copy()
    X_train = train_raw.drop('Target', axis=1).copy()
    
    y_train.reset_index(drop=True, inplace=True)
    y_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    y_train.fillna(method='ffill', inplace=True)
    y_train.fillna(0, inplace=True)
    X_train = fe.conv_data(X_train, Asset_ID, train_list, save_fet=False, save_name='feature')
    #print(train)
    #print(train_list[1])
    categorical_features = ['CDL2CROWS']
    
    ct = Controller(Asset_ID)
            
    cv_score = ct.KFold_learn(categorical_features, X_train, y_train, 'xgboost')
    scores.append(cv_score)
    #ct.stacking_learn(categorical_features, X_train, y_train, fst_lay=['dnn', 'light_gbm', 'xgboost'], snd_lay='light_gbm', enable_2ndorigx=False)
    
    del train_raw, y_train, X_train, ct
    gc.collect()
    
    print('5',psutil.virtual_memory().percent)
    
    
print('CV mean-----------------------------'+str(sum(scores)/len(scores)))

#CV score----------------------------------- 0.23738583123946141 light_gbm 0.2578
#xgboost 0.44045052797107087
#random forest 微妙　0.15 
#catboost 0.06

In [None]:
# #del train_raw
# del X_train, y_train
# gc.collect()

In [None]:
# del train_list
# gc.collect()

In [None]:
# import sys

# print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
# print(" ------------------------------------ ")
# for var_name in dir():
#     if not var_name.startswith("_"):
#         print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(eval(var_name)),'|'))

In [None]:
#train_list[0].reset_index(drop=True)

In [None]:
#fe.conv_data(train_list[0], save_fet=False, save_name='feature')

## predict

In [11]:
supplemental_df = pd.read_csv(data_folder + 'supplemental_train.csv')

In [12]:
test_list = list(range(13))
for Asset_ID in range(13):#通貨別にデータを作りそれを通貨別でリストにDFを格納
    supplemental_train = supplemental_df[supplemental_df["Asset_ID"]==Asset_ID].set_index("timestamp")
    if len(supplemental_train) > 26000:#30000
        #supplemental_df.drop(supplemental_df.index[-26000:], inplace=True)
        supplemental_train = supplemental_train.iloc[-26000:,:]
    test_list[Asset_ID] = supplemental_train

In [13]:
import gresearch_crypto
#以下二つは1セッションで一度しか実行できない
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

In [14]:
import time
fe = Feature()
ctlist = []
for Asset_ID in range(13):
    ct = Controller(Asset_ID)
    ctlist.append(ct)
for i, (df_test, df_pred) in enumerate(iter_test):
    start1 = time.time()
    for Asset_ID in range(13):
        test_raw = df_test[df_test["Asset_ID"]==Asset_ID].set_index("timestamp")
        test = pd.concat([test_list[Asset_ID], test_raw], sort=False)
        if len(test) > 26000:#30000
            #test.drop(test.index[-26000:], inplace=True)
            test = test.iloc[-26000:,:]
        test_list[Asset_ID] = test.copy()
    print ("1elapsed_time:{0}".format(time.time() - start) + "[sec]")
    for Asset_ID in range(13):
        start = time.time()
        print('通貨番号',Asset_ID)

        test_raw = test_list[Asset_ID].copy()
        
        #print(test_raw)
        
        row_id = test_raw.iat[-1, 9]
        print('row_id :',row_id)
        X_test = test_raw.drop(['Target','row_id'], axis=1).copy()
        print ("2elapsed_time:{0}".format(time.time() - start) + "[sec]")
        start = time.time()
        X_test = fe.conv_data(X_test, Asset_ID, test_list, save_fet=False, save_name='feature', save_mem=False, test=True).iloc[-1:]
        categorical_features = ['CDL2CROWS']
        print ("3elapsed_time:{0}".format(time.time() - start) + "[sec]")
        start = time.time()
        ct = ctlist[Asset_ID]
        y_sub = ct.KFold_predict(categorical_features, X_test, 'xgboost')[-1]
        #y_sub = ct.stacking_predict(categorical_features, X_test)
        print ("4elapsed_time:{0}".format(time.time() - start) + "[sec]")
        
        print('pred :',y_sub)
        #print(df_pred)
        #print(df_pred['row_id'] == row_id)
        
        df_pred.loc[df_pred['row_id'] == row_id, 'Target'] = y_sub
    
    #print(df_pred)
    print ("elapsed_time:{0}".format(time.time() - start1) + "[sec]")
    env.predict(df_pred)   # register your predictions

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
1elapsed_time:0.04245257377624512[sec]
通貨番号 0
row_id : 2.0
2elapsed_time:0.0026099681854248047[sec]
1.0elapsed_time:0.01214909553527832[sec]
1.1elapsed_time:0.009057044982910156[sec]
1.2elapsed_time:0.017085790634155273[sec]
1.3elapsed_time:0.0018231868743896484[sec]
1.4elapsed_time:0.0014662742614746094[sec]
3elapsed_time:0.04572272300720215[sec]
4elapsed_time:0.3429715633392334[sec]
pred : 0.16936742
通貨番号 1
row_id : 3.0
2elapsed_time:0.002687215805053711[sec]
1.0elapsed_time:0.014374732971191406[sec]
1.1elapsed_time:0.008491992950439453[sec]
1.2elapsed_time:0.01434636116027832[sec]
1.3elapsed_time:0.0011775493621826172[sec]
1.4elapsed_time:0.0011112689971923828[sec]
3elapsed_time:0.043028831481933594[sec]
4elapsed_time:0.021122217178344727[sec]
pred : 0.018645097
通貨番号 2
row_id : 1.0
2elapsed_time:0.002485990524291992[sec]
1.0elapsed_time:0.012256383895874023[se

In [None]:
fe = Feature()
X_test = test_raw.drop('Target', axis=1).copy()

print(1)
X_test = fe.conv_data(X_test, Asset_ID, test_list, save_fet=False, save_name='feature', save_mem=False).iloc[-1:]
print(2)
y_sub = ct.KFold_predict(categorical_features, X_test, 'xgboost')
print(2)

In [None]:
y_sub[-1]

# G-Research

* tutrial - [https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition](http://)

# description

* 15分後の価格の変動率を予測する
* 常に市場の傾向が変動するので、確実な予測モデルを立てるのが難しい(傾向が非定常的)
* オーバーフィッティングの可能性が高い
* 通貨間での価格変動の関連性がある。特にビットコインはほかの通貨に影響を与えやすい。
* 将来、価格がどのように動くかを予測することである。過去の価格の時系列データを学習データとして、価格が上がるか下がるか、またどの程度上がるか、すなわち資産リターンを予測

### Data

#### train.csv

* timestamp: データのUNIX秒。すべて60秒間隔よって、一分ごとにデータが与えられている。
* Asset_ID: asset_details.csvに書いている通貨IDと結びついており、通貨の種類の識別に使う。 (e.g. Asset_ID = 1 for Bitcoin)
* Count: 前の一分間で取引された回数
* Open: 始値 (in USD).
* High: 前の一分間での最大の価格 (in USD).
* Low: 前の一分間での最小の価格 (in USD).
* Close: Close price 終値 (in USD).
* Volume: 前の一分間の引通貨量(USD)
* VWAP: 一定期間内での取引価格の、取引量による加重平均
* Target: 15分前の価格との差をlogでとったもの Residual log-returns for the asset over a 15 minute horizon.

#### asset_details.csv

* Asset_ID 通貨ID
* Weight 性能評価するときにその通貨の正答率がどのくらい加味されるかの重み
* Asset_Name IDに結びついている通貨名


### 評価方法





# task

1. 概要(overview)をしっかり読む
2. 似ている過去のコンペを探し、参加し基本的な分析を行う
3. 似たような大会のsolutionを読む
4. 論文を読んでその分野の進捗を見逃さないようにする
5. データを分析し安定したCVのモデルを構築する
6. データ前処理、特徴量エンジニアリングを行い一定のモデルでCVを比較しいい特徴量エンジニアリングを探す
7. モデルの予測と教師データを比較し分析、予測の難しいデータに対し考察
8. 分析に基づき高性能なモデルをアンサンブルなどを取り入れて構築
9.  データ解析、結果分析からより高度な予測の難しいサンプルを解決するモデルを設計
10. 必要であれば前のステップに戻る

# scores

# else

* supplemental_trainとtrainをくっつけて最後に学習
* B (billion)	1,000,000,000

# 可視化

In [None]:
# import pandas as pd
# import numpy as np

In [None]:
# data_folder = "../input/g-research-crypto-forecasting/"

# crypto_df = reduce_mem_usage(pd.read_csv(data_folder + 'train.csv'))

In [None]:
# train_list = []
# for Asset_ID in range(13):#通貨別にデータを作りそれを通貨別でリストにDFを格納
#     train = crypto_df[crypto_df["Asset_ID"]==Asset_ID].set_index("timestamp")
#     train = fe.conv_data(train, save_fet=False, save_name='feature')
#     train_list.append(train)

In [None]:
# fe = Feature()
# for Asset_ID in range(13):#通貨別にデータを作りそれを通貨別でリストにDFを格納
#     train = train_list[Asset_ID]
    
#     categorical_features = ['Embarked', 'Pclass', 'Sex']
#     y_train = train['Target']
#     X_train = train.drop('Target', axis=1)
    
#     #ct = Controller(Asset_ID)
#     #ct.stacking_learn(categorical_features, X_train, y_train)

In [None]:
# import gresearch_crypto
# #以下二つは1セッションで一度しか実行できない
# env = gresearch_crypto.make_env()   # initialize the environment
# iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

In [None]:
# for (test_df, sample_prediction_df) in iter_test:
    
#     for Asset_ID in df_test.Asset_ID.unique():
#         df_test = df_test[df_test["Asset_ID"]==Asset_ID].set_index("timestamp")
        
#         test = fe.conv_data(test, save_fet=False, save_name='feature')

#         categorical_features = ['Embarked', 'Pclass', 'Sex']
#         y_train = train['Target']
        
#         ct = Controller(Asset_ID)
#         ct.stacking_predict(categorical_features, X_test)
    
#     #testと教師データを結合して特徴量を作る
    
#     sample_prediction_df['Target'] = 0  # make your predictions here
#     env.predict(sample_prediction_df)   # register your predictions

In [None]:
# for (test_df, sample_prediction_df) in iter_test:
#     sample_prediction_df['Target'] = 0
#     env.predict(sample_prediction_df)
#     print(test_df)

In [None]:
# data_folder = "../input/g-research-crypto-forecasting/"
# crypto_df = pd.read_csv(data_folder + 'supplemental_train.csv')

In [None]:
# import pandas_profiling

# train_list[0].profile_report()

# feature optimizer

In [None]:
# import numpy as np
# import pandas as pd
# data_folder = "../input/g-research-crypto-forecasting/"
# crypto_df = pd.read_csv(data_folder + 'train.csv')

In [None]:
# import optuna
# import talib

# price = np.array(crypto_df['Close'])
# returns = np.array(crypto_df['Close'].shift(-15)) - price
# #returns = np.array(crypto_df['Target'])

# #ROCP {'timeperiod': 6} best param -0.0005372071418046449 best score
# #MOM {'timeperiod': 127} best param -0.001971999282000135 best score
# #RSI -0.0009095762807234701 3
# #EMA {'timeperiod': 11} best param -0.00034526380519110374 best score

# def objective(trial):
#     timeperiod = trial.suggest_int('timeperiod', 2, 240)
    
#     df = talib.ROCP(price, timeperiod=timeperiod)
#     returns_new = returns[~np.isnan(df)]
#     df_new = df[~np.isnan(df)]
#     returns_last = returns_new[~np.isnan(returns_new)]
#     df_new = df_new[~np.isnan(returns_new)]
#     ic = get_ic(df_new, returns_last)
#     print(ic, timeperiod)
#     return -abs(ic)

# def get_ic(x, returns, normalize=True) -> float:
#     """
#     :param np.ndarray x: 指標
#     :param np.ndarray returns: リターン
#     :param bool normalize: x をスケーリングするかどうか
#     """
#     assert(len(x) == len(returns))
#     x = (x - x.mean()) / x.std() if normalize else x
#     returns = (returns - returns.mean()) / returns.std() if normalize else returns
#     ic = np.corrcoef(x, returns)[0, 1]

#     return ic

# study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=0))
# study.optimize(objective, n_trials=100)
# print(study.best_params,'best param')
# print(study.best_value,'best score')