In [1]:
import numpy as np
import pandas as pd
from math import sqrt

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.model_selection import TimeSeriesSplit

from sklearn.linear_model import Lasso,LassoCV,LassoLarsCV 


In [2]:
# Import datasets
returns = pd.read_excel("Returns_Clean.xlsx", index_col = 'Dates')
returns.index = pd.to_datetime(returns.index, format='%Y%m%d')

flows = pd.read_excel("Flows_Clean.xlsx", index_col = 'Dates')
flows.index = pd.to_datetime(flows.index, format='%Y%m%d')

# Label the returns and flows with its stock tickets for the convenience of feature extraction
returns_labeled = returns.copy()
returns_labeled.columns =['return_'+str(i) for i in returns.columns]
flows_labeled = flows.copy()
flows_labeled.columns = ['flow_'+str(i) for i in flows.columns]

print(returns_labeled.head())
print(flows_labeled.head())
print(len(returns))


# the combine of returns and flows of stocks
S = pd.concat([returns, flows], axis = 1)
S_labeled = pd.concat([returns_labeled, flows_labeled], axis = 1)


            return_55976  return_41072  return_27748  return_75259  \
Dates                                                                
1993-04-01     -0.019685      0.019391     -0.007282     -0.003891   
1993-04-02     -0.020080      0.002717     -0.002445      0.058594   
1993-04-05     -0.024590     -0.006775     -0.002451      0.022140   
1993-04-06     -0.067227      0.009550     -0.007371      0.025271   
1993-04-07      0.000000      0.005405      0.000000      0.021127   

            return_40970  return_62519  return_21792  return_54084  \
Dates                                                                
1993-04-01      0.083333      0.027027      0.005348      0.020000   
1993-04-02     -0.038462     -0.026316     -0.007979     -0.019608   
1993-04-05     -0.040000      0.027027      0.005362      0.000000   
1993-04-06      0.041667      0.052632      0.005333      0.000000   
1993-04-07      0.040000     -0.050000      0.002653      0.000000   

            return

In [3]:
# Using the equally-weighted portfolio consisting with all 100 stocks to demonstrate the model
# The portfolio returns and flows are the average of all stocks
port_returns = pd.DataFrame(data = returns.mean(axis=1), columns = ['Portfolio_returns'] )
# print(port_returns.head())
port_flows = pd.DataFrame(data = flows.mean(axis = 1), columns = ['Portfolio_flows'])
# port_flows.head()
portfolio = pd.concat([port_returns, port_flows], axis = 1)
length = len(port_returns)
# generate predictor matrix for portfolio

# predictor matrix P_small, includes portfolio's own returns up to three lags and portfolio’s net trading flows up to three lags
P_small = pd.DataFrame()   
for j in range(1,4):
    p_t = port_returns.iloc[:length-j]
    p_t.columns = [str(j) + '_lag return_portfolio']
    f_t = port_flows.iloc[:length-j]
    f_t.columns = [str(j) + '_lag flow_portfolio']
    P_small = pd.concat([P_small, p_t], axis = 1)
    P_small = pd.concat([P_small, f_t], axis = 1)
    
    
#     generate full predictor variables matrix by adding all stock to P_small
P_full = pd.concat([P_small, S_labeled], axis = 1)
P_full.fillna(0, inplace = True)
print(P_full)




            1_lag return_portfolio  1_lag flow_portfolio  \
Dates                                                      
1993-04-01               -0.001138             -1.399531   
1993-04-02               -0.015585             -2.371056   
1993-04-05                0.003637             -0.140873   
1993-04-06               -0.003112              0.453289   
1993-04-07               -0.001031             -0.559098   
...                            ...                   ...   
2000-12-22                0.020414              1.526125   
2000-12-26                0.004582             -0.103670   
2000-12-27                0.023689              0.216686   
2000-12-28                0.019886             -1.246641   
2000-12-29                0.000000              0.000000   

            2_lag return_portfolio  2_lag flow_portfolio  \
Dates                                                      
1993-04-01               -0.001138             -1.399531   
1993-04-02               -0.015585     

In [4]:
# add skewness of return as a predictor variable
from scipy.stats import skew 
from scipy.stats import kurtosis

n=50

P_full['skewness'] = port_returns.rolling(window=n).skew()
# print(P_full['skewness'])

In [5]:
# add momentum_20_50 and momentum_8_20 as a predictor variable
            
P_full['momentum_20_50'] = np.zeros(shape=(length,1))
P_full['momentum_8_20'] = np.zeros(shape=(length,1))  
# print(type(port_returns))
for i in range(49, length):
    if portfolio.Portfolio_returns[i-19:i+1].mean() > portfolio.Portfolio_returns[i-49:i+1].mean():
        P_full['momentum_20_50'][i] = 1
    if portfolio.Portfolio_returns[i-19:i+1].mean() < portfolio.Portfolio_returns[i-49:i+1].mean():
        P_full['momentum_20_50'][i] = 0
    if portfolio.Portfolio_returns[i-7:i+1].mean() > portfolio.Portfolio_returns[i-19:i+1].mean():
        P_full['momentum_8_20'][i] = 1
    if portfolio.Portfolio_returns[i-7:i+1].mean() < portfolio.Portfolio_returns[i-19:i+1].mean():
        P_full['momentum_8_20'][i] = 0

print(P_full['momentum_20_50'])



Dates
1993-04-01    0.0
1993-04-02    0.0
1993-04-05    0.0
1993-04-06    0.0
1993-04-07    0.0
             ... 
2000-12-22    0.0
2000-12-26    0.0
2000-12-27    1.0
2000-12-28    1.0
2000-12-29    1.0
Name: momentum_20_50, Length: 1958, dtype: float64


In [6]:
# Add distribution bottom&top and cumulative returns as predictor variables

n = 50

P_full['distbottom'] = np.zeros(shape=(length,1))
P_full['disttop'] = np.zeros(shape=(length,1))
P_full['cumulative'] = np.cumprod(1 + portfolio.Portfolio_returns.values)

for i in range(49, length):
    P_full['distbottom'][i] = (P_full['cumulative'][i]- P_full['cumulative'][i-n+1:i+1].min()) / P_full['cumulative'][i]
    P_full['disttop'][i] = (P_full['cumulative'][i-n+1:i+1].max()- P_full['cumulative'][i])/P_full['cumulative'][i]

# print(P_full['cumulative'])
# print(P_full['disttop'])

In [7]:
def rsi(prices, n=14):
    deltas = np.diff(prices)
    seed = deltas[:n+1]
    up = seed[seed>=0].sum()/n
    down = -seed[seed<0].sum()/n
    rs = up/down
    rsi = np.zeros_like(prices)
    rsi[:n] = 100. - 100./(1.+rs)

    for i in range(n, len(prices)):
        delta = deltas[i-1] # cause the diff is 1 shorter

        if delta>0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up*(n-1) + upval)/n
        down = (down*(n-1) + downval)/n
    
        rs = up/down
        rsi[i] = 100. - 100./(1.+rs)

    return rsi

In [8]:
#  Add rsi and rsi change as predictor variables

P_full['rsi'] = np.zeros(shape=(length,1))
P_full['rsi'] = rsi(P_full['cumulative'])
# print(P_full['rsi'])

P_full['rsi_change'] = np.zeros(shape=(length,1))
P_full['rsi_change'] = P_full['rsi'].pct_change()

# print(P_full['rsi_change'])


In [9]:
# Add macd_rsi as predictor variables

P_full['macd_rsi'] = np.zeros(shape=(length,1))

for i in range(25, length):
    if P_full['rsi'][i-11:i+1].mean() > P_full['rsi'][i-25:i+1].mean():
        P_full['macd_rsi'][i] = 1
    if P_full['rsi'][i-11:i+1].mean() < P_full['rsi'][i-22:i+1].mean():
        P_full['macd_rsi'][i] = 0

# print(P_full['macd_rsi'])


In [10]:
# Add stochastic as predictor variables

def stochastic(prices, n=14):
    stochastic = np.zeros(shape=(length,1))
    for i in range(n, len(prices)):
        stochastic[i] = ((prices[i]-prices[i-n:i].min())/(prices[i-n:i].max()-prices[i-n:i].min()))*100
    return stochastic

P_full['stochastic'] = np.zeros(shape=(length,1))
P_full['stochastic'] = stochastic(P_full['cumulative'])

# print(P_full['stochastic'])

In [11]:
# Add flow_times_return and 5-day flow times return as predictor variables

P_full['flow_times_return'] = np.zeros(shape=(length,1))
P_full['5dayflow_times_return'] = np.zeros(shape=(length,1))

for i in range(4, length):
    P_full['flow_times_return'][i] = portfolio.Portfolio_returns[i] * portfolio.Portfolio_flows[i]
    P_full['5dayflow_times_return'][i] = portfolio.Portfolio_returns[i-4:i+1].mean() * portfolio.Portfolio_flows[i-4:i+1].sum()  

# print(P_full['5dayflow_times_return'])

In [12]:
# Add dist_top_band, dist_bottom_band as predictor variables

P_full['dist_top_band'] = np.zeros(shape=(length,1))
P_full['dist_bottom_band'] = np.zeros(shape=(length,1))

for i in range(19, length):
    std = P_full['cumulative'][i-19:i+1].std()
    mean = P_full['cumulative'][i-19:i+1].mean()
    bollinger_top = mean + 2*std
    bollinger_bottom = mean - 2*std
    P_full['dist_top_band'][i] = (bollinger_top - P_full['cumulative'][i])/P_full['cumulative'][i]
    P_full['dist_bottom_band'][i] = (P_full['cumulative'][i] - bollinger_bottom)/P_full['cumulative'][i]
    
print(P_full['dist_bottom_band'])    

Dates
1993-04-01    0.000000
1993-04-02    0.000000
1993-04-05    0.000000
1993-04-06    0.000000
1993-04-07    0.000000
                ...   
2000-12-22    0.042335
2000-12-26    0.046834
2000-12-27    0.071415
2000-12-28    0.094541
2000-12-29    0.086743
Name: dist_bottom_band, Length: 1958, dtype: float64


In [13]:
# Add difference from mean as predictor variables

def difference_from_mean(prices, n=50):
    difference = np.zeros(shape=(length,1))
    for i in range(n, len(prices)):
        difference[i] = prices[i] - prices[i-n:i].mean()
    return difference

P_full['difference_from_mean'] = np.zeros(shape=(length,1))
P_full['difference_from_mean'] = difference_from_mean(portfolio.Portfolio_returns)

print(P_full['difference_from_mean'])

Dates
1993-04-01    0.000000
1993-04-02    0.000000
1993-04-05    0.000000
1993-04-06    0.000000
1993-04-07    0.000000
                ...   
2000-12-22    0.019395
2000-12-26    0.002889
2000-12-27    0.022186
2000-12-28    0.018073
2000-12-29   -0.009087
Name: difference_from_mean, Length: 1958, dtype: float64


In [14]:
print(P_full.shape)

(1958, 221)


In [15]:
P_full['skewness'] = P_full['skewness'].iloc[:length-1]
P_full['momentum_20_50'] = P_full['momentum_20_50'].iloc[:length-1]
P_full['momentum_8_20'] = P_full['momentum_8_20'].iloc[:length-1]
P_full['distbottom'] = P_full['distbottom'].iloc[:length-1]
P_full['disttop'] = P_full['disttop'].iloc[:length-1]
P_full['rsi'] = P_full['rsi'].iloc[:length-1]
P_full['rsi_change'] = P_full['rsi_change'].iloc[:length-1]
P_full['macd_rsi'] = P_full['macd_rsi'].iloc[:length-1]
P_full['stochastic'] = P_full['stochastic'].iloc[:length-1]
P_full['flow_times_return'] = P_full['flow_times_return'].iloc[:length-1]
P_full['5dayflow_times_return'] = P_full['5dayflow_times_return'].iloc[:length-1]
P_full['dist_top_band'] = P_full['dist_top_band'].iloc[:length-1]
P_full['dist_bottom_band'] = P_full['dist_bottom_band'].iloc[:length-1]
P_full['difference_from_mean'] = P_full['difference_from_mean'].iloc[:length-1]

P_full.fillna(0, inplace = True)
print(P_full.head())

<bound method NDFrame.head of             1_lag return_portfolio  1_lag flow_portfolio  \
Dates                                                      
1993-04-01               -0.001138             -1.399531   
1993-04-02               -0.015585             -2.371056   
1993-04-05                0.003637             -0.140873   
1993-04-06               -0.003112              0.453289   
1993-04-07               -0.001031             -0.559098   
...                            ...                   ...   
2000-12-22                0.020414              1.526125   
2000-12-26                0.004582             -0.103670   
2000-12-27                0.023689              0.216686   
2000-12-28                0.019886             -1.246641   
2000-12-29                0.000000              0.000000   

            2_lag return_portfolio  2_lag flow_portfolio  \
Dates                                                      
1993-04-01               -0.001138             -1.399531   
1993-04-0

In [19]:
technical_list = P_full.iloc[:,206:].columns.to_list()
technical_list

['skewness',
 'momentum_20_50',
 'momentum_8_20',
 'distbottom',
 'disttop',
 'cumulative',
 'rsi',
 'rsi_change',
 'macd_rsi',
 'stochastic',
 'flow_times_return',
 '5dayflow_times_return',
 'dist_top_band',
 'dist_bottom_band',
 'difference_from_mean']

In [20]:
len(technical_list)

15