# <center>**DTW Match: correlation threshold method**</center>      

In [1]:
import pandas as pd
import numpy as np
import warnings
import gc
import os 
import sys
from IPython.core.interactiveshell import  InteractiveShell
# Use the fully-qualified option keys. Abbreviated patterns such as 'max.rows'
# are matched as regular expressions against every registered option and raise
# OptionError as soon as more than one key matches (e.g. 'display.max_rows' and
# 'styler.render.max_rows' on recent pandas versions).
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
# Silence every warning for the rest of the session. Note that this also hides
# pandas' chained-assignment warnings.
warnings.filterwarnings('ignore')
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
from tqdm import tqdm

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# Light-to-blue sequential colormap kept in `cm` for later use.
cm = sns.light_palette('blue', as_cmap=True)
# White-grid theme with the grid lines themselves switched off.
sns.set_style("whitegrid", {'axes.grid': False})
# Set the default figure size through matplotlib's public rcParams rather than
# through seaborn's internal `mpl` attribute, which is not documented API.
plt.rcParams['figure.figsize'] = (8.0, 4.0)

#### **Index data (new)**    

In [4]:
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
data_path = '..//data//data.pkl'
data = pd.read_pickle(data_path)
# All index labels of `data` as a plain Python list.
date_list = data.index.tolist()

## **Ⅰ Calculate DTW Match Correlation**                

In [5]:
from DTW import DynamicTimeWrapper

In [6]:
def _normalize_window(window, method):
    """Return ``window`` with every column divided by its median or mean.

    A new frame is returned; ``window`` itself is never modified. Any
    ``method`` other than ``'median'`` or ``'mean'`` returns ``window``
    unchanged (no normalization).
    """
    if method == 'median':
        return window / np.median(window, axis=0)
    if method == 'mean':
        return window / np.mean(window, axis=0)
    return window


def industryRollingMatch(data, split_date, target_len, match_len, method: str):
    """Match every rolling test window against every historical window, per asset.

    Rows of ``data`` whose index is greater than ``split_date`` form the test
    set. Test windows start every ``target_len`` rows of that set; for each
    one, the history ``data[:test_start]`` is cut into windows starting every
    ``match_len`` rows. Each (history window, test window, asset) triple is
    scored with ``DynamicTimeWrapper(...).DTWMatchCorr()``.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per asset. The index must be comparable with
        ``split_date``.
    split_date
        Rows with ``index > split_date`` are used as test data.
    target_len : int
        Step between test-window starts, and the positional offset from a
        window's start row to its end row.
    match_len : int
        Same as ``target_len`` but for the historical windows.
    method : str
        ``'median'`` or ``'mean'`` divides each window column by that
        statistic; any other value leaves the windows unnormalized.

    Returns
    -------
    dict of list
        Keys ``'asset'``, ``'test_start'``, ``'test_end'``, ``'train_start'``,
        ``'train_end'`` and ``'corr'``; position ``i`` in every list describes
        the same match. ``'corr'`` holds whatever ``DTWMatchCorr()`` returns
        (presumably a correlation - confirm against the DTW module).

    Notes
    -----
    Windows are cut with ``data[start:end]``. With a label-based (e.g. date)
    index this slice includes the end row, so a window spans one row more
    than ``target_len`` / ``match_len``.

    NOTE(review): ``data[:test_start]`` also includes the ``test_start`` row
    under label slicing, so the last historical window may share that row with
    the test window. Confirm whether this one-row overlap is intended.
    """
    test_data = data[data.index > split_date]
    match_record = {
        'asset': [],
        'test_start': [],
        'test_end': [],
        'train_start': [],
        'train_end': [],
        'corr': [],
    }

    # [:-1] drops the last start position so that `start + target_len` is
    # always a valid position in `test_data`.
    for test_index in tqdm(range(0, len(test_data), target_len)[:-1]):
        test_start = test_data.index[test_index]
        test_end = test_data.index[test_index + target_len]
        # Out-of-place normalization: the slice of `data` is never written to.
        test = _normalize_window(data[test_start:test_end], method)

        # History available for this test window.
        train_data = data[:test_start]

        for train_index in range(0, len(train_data), match_len)[:-1]:
            train_start = train_data.index[train_index]
            train_end = train_data.index[train_index + match_len]
            train = _normalize_window(train_data[train_start:train_end], method)

            for asset in data.columns:
                dtw = DynamicTimeWrapper(train[asset], test[asset])
                corr = dtw.DTWMatchCorr()
                match_record['asset'].append(asset)
                match_record['test_start'].append(test_start)
                match_record['test_end'].append(test_end)
                match_record['train_start'].append(train_start)
                match_record['train_end'].append(train_end)
                match_record['corr'].append(corr)

    return match_record

In [7]:
# `split` is the date passed as `split_date` (rows after it form the test set);
# `target_len` is the step / length offset used for the rolling test windows.
split, target_len = '2016-01-01', 20

### **Ⅰ-①: median normalization method**      

#### **1. historical period length: 20 days**      

In [10]:
%%time 
match_len = 20
# `method` is a required argument of industryRollingMatch; omitting it raises
# TypeError. This section is the median-normalized run.
match_record = industryRollingMatch(data, split, target_len, match_len, method='median')

100%|█████████████████████████████████| 60/60 [2:02:57<00:00, 122.96s/it]

Wall time: 2h 2min 57s





In [11]:
# Turn the dict of equal-length lists into a table (one row per recorded match)
# and persist it for later analysis.
match_record = pd.DataFrame(data=match_record)
match_record.to_pickle('..//result//corr_by_industry_20')

#### **2. historical period length: 30 days**    

In [8]:
%%time 
match_len = 30
# `method` is a required argument of industryRollingMatch; omitting it raises
# TypeError. This section is the median-normalized run.
match_record_30 = industryRollingMatch(data, split, target_len, match_len, method='median')
match_record_30 = pd.DataFrame(match_record_30)
match_record_30.to_pickle('..//result//corr_median_30')

100%|█████████████████████████████████| 60/60 [6:24:58<00:00, 384.97s/it]


Wall time: 6h 24min 58s


#### **3. historical period length: 40 days**  

In [9]:
%%time 
match_len = 40
# `method` is a required argument of industryRollingMatch; omitting it raises
# TypeError. This section is the median-normalized run.
match_record_40 = industryRollingMatch(data, split, target_len, match_len, method='median')
match_record_40 = pd.DataFrame(match_record_40)
match_record_40.to_pickle('..//result//corr_median_40')

100%|██████████████████████████████████| 60/60 [1:21:28<00:00, 81.47s/it]

Wall time: 1h 21min 28s





### **Ⅰ-②: mean normalization method**

#### **1. historical period length: 20 days** 

In [8]:
%%time 
match_len = 20
# `method` is a required argument of industryRollingMatch; omitting it raises
# TypeError. This section is the mean-normalized run.
match_record = industryRollingMatch(data, split, target_len, match_len, method='mean')
match_record = pd.DataFrame(match_record)
match_record.to_pickle('..//result//corr_mean_20')

100%|█████████████████████████████████| 60/60 [6:28:44<00:00, 388.74s/it]

Wall time: 6h 28min 44s





#### **2. historical period length: 30 days** 

In [10]:
%%time 
match_len = 30
# `method` is a required argument of industryRollingMatch; omitting it raises
# TypeError. This section is the mean-normalized run.
match_record_30 = industryRollingMatch(data, split, target_len, match_len, method='mean')
match_record_30 = pd.DataFrame(match_record_30)
match_record_30.to_pickle('..//result//corr_mean_30')

100%|█████████████████████████████████| 60/60 [2:00:21<00:00, 120.36s/it]

Wall time: 2h 21s





#### **3. historical period length: 40 days**  

In [11]:
%%time
match_len = 40
# `method` is a required argument of industryRollingMatch; omitting it raises
# TypeError. This section is the mean-normalized run.
match_record_40 = industryRollingMatch(data, split, target_len, match_len, method='mean')
match_record_40 = pd.DataFrame(match_record_40)
match_record_40.to_pickle('..//result//corr_mean_40')

100%|██████████████████████████████████| 60/60 [1:18:07<00:00, 78.12s/it]

Wall time: 1h 18min 7s



