### About this file
This file cleans factor and sector return data to get them ready for portfolio allocation. 

raw data should be stored in `input` folder and outputs should be stored in `input\sector` or `input\factor` folder

Before saving the files, the processed data would be displayed in the output and a input bar would appear to ask for permission to store the data. Simply entering "Y" to the bar would allow the function to store the data (enter any other things to stop it). This feature is included for double check before storing data.

In [2]:
import numpy as np
import pandas as pd

### HK and US factor return

In [3]:
def convert(data):
    data.index = pd.to_datetime(data.index + "-5", format="%G-%V-%u").strftime("%Y-%m-%d")
    data.columns = ["rf" if item == "rf_week" else item for item in data.columns]
    data.index.name = "date"
    display(data)
    return data
    
code = 'hk400'
port = 5
path =  r"\ten_factor_vw_{}_week_{}.csv".format(code, port)
data = pd.read_csv(r'.\input'+path, index_col=0)
data = convert(data)

dates = pd.date_range(start=data.index[0], end=data.index[-1], freq="W-FRI")
print("These fridays are not covered: ")
print(dates[~dates.isin(data.index)])
print("In total the file contain {} fridays and there are {} fridays in between the horizon.".format(len(data.index), len(dates)))

save_file = input("save file? (enter Y to save)")
if save_file == "Y":
    data.to_csv(r'.\input\factor'+path)

Unnamed: 0_level_0,size,idvc,beta,bm,m12,m1,roe,ag,dtvm,ep,market,rf
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000-01-07,0.298444,0.010769,-0.039471,0.031981,-0.021910,0.092760,-0.156988,0.002698,0.054103,0.008577,0.031862,0.00021
2000-01-14,0.477417,0.107354,0.077793,-0.046589,0.030768,0.000544,0.118260,0.023924,0.170218,-0.119043,-0.087594,0.00021
2000-01-21,0.094772,-0.063608,-0.069866,0.067108,-0.161696,0.040433,0.008297,0.121894,0.040519,-0.011585,-0.017263,0.00021
2000-01-28,-0.225225,0.156284,0.083046,-0.088520,0.201948,-0.156734,-0.133770,-0.114977,-0.078840,0.052241,-0.026133,0.00021
2000-02-04,-0.021565,0.019252,0.034534,-0.018117,0.132670,0.100314,-0.083621,-0.057233,-0.144459,0.004840,0.024051,0.00021
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-02-04,0.004626,0.058892,-0.066309,0.053195,0.043099,0.055287,0.017401,0.006708,-0.002545,0.065396,-0.034644,0.00000
2022-02-11,0.004205,0.080990,-0.096479,0.056828,0.061310,0.051407,0.076991,0.047198,0.006396,0.075841,0.032656,0.00000
2022-02-18,-0.021045,-0.007990,0.067123,-0.026522,-0.057563,0.012723,-0.042392,-0.015133,-0.003653,-0.043683,-0.000935,0.00000
2022-02-25,0.004356,-0.001466,-0.001671,0.017859,-0.000380,-0.004351,0.038559,0.017801,0.006133,0.046497,-0.015739,0.00000


These fridays are not covered: 
DatetimeIndex([], dtype='datetime64[ns]', freq='W-FRI')
In total the file contain 1157 fridays and there are 1157 fridays in between the horizon.
save file? (enter Y to save)n


In [5]:
# converting LONG SHORT data
code = 'us1500'
port = 5
side = "long"   # "short" or "long"
path =  r"\ten_factor_{}_week_{}_{}.csv".format(code, side, port)
data = pd.read_csv(r'.\input'+path, index_col=0)
data = convert(data)

dates = pd.date_range(start=data.index[0], end=data.index[-1], freq="W-FRI")
print("These fridays are not covered: ")
print(dates[~dates.isin(data.index)])
print("In total the file contain {} fridays and there are {} fridays in between the horizon.".format(len(data.index), len(dates)))

save_file = input("save file? (enter Y to save)")
if save_file == "Y":
    data.to_csv(r'.\input\factor'+path)

Unnamed: 0_level_0,size,idvc,beta,bm,m12,m1,roe,ag,dtvm,ep
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-01-07,0.034229,0.011577,0.070158,0.007598,0.122583,0.102014,0.044329,0.025440,0.019988,0.011672
2000-01-14,0.013219,-0.049821,0.000785,-0.056097,0.045599,0.010854,-0.030360,-0.035942,-0.011001,-0.051189
2000-01-21,0.038911,-0.032090,-0.025655,-0.012261,0.007095,0.009505,-0.014779,-0.017875,0.015790,-0.036004
2000-01-28,0.015153,-0.040880,0.027770,-0.044699,0.079637,0.044977,0.001304,-0.012068,-0.006814,-0.021661
2000-02-04,0.085662,0.049791,0.069330,0.023210,0.088309,0.073000,0.065459,0.052169,0.058202,0.042355
...,...,...,...,...,...,...,...,...,...,...
2022-02-04,-0.023596,-0.002304,-0.045919,-0.027266,-0.015019,-0.006375,-0.010228,-0.002531,-0.010695,-0.031797
2022-02-11,-0.005256,-0.032283,-0.049649,-0.011946,-0.024105,-0.014321,-0.035478,-0.023347,-0.003320,-0.023180
2022-02-18,0.046102,0.055274,0.129813,0.040546,0.075942,0.034307,0.073377,0.042021,0.029437,0.055688
2022-02-25,-0.000237,0.020779,0.024009,0.025834,0.008713,0.032116,0.021205,0.013496,-0.002762,0.011785


These fridays are not covered: 
DatetimeIndex([], dtype='datetime64[ns]', freq='W-FRI')
In total the file contain 1157 fridays and there are 1157 fridays in between the horizon.
save file? (enter Y to save)Y


### CN factor return

In [311]:
def convert_cn(data):
    week_ref = pd.read_csv(r".\input\factor\CSMAR_week.csv", index_col = 1)
    new_index = data.join(week_ref['Clsdt'])
    data.index = pd.to_datetime(new_index['Clsdt'])
    
    # adjust weekdays to Friday if the end-of-week dates are not Friday
    # if year-end is not friday, combine with the year-begin week of the next year
    data.index = pd.Series(list(map(lambda x: x + np.timedelta64(4-x.weekday(), 'D'), data.index)))
    data = (data+1).groupby(data.index).prod()-1
    data.index = data.index.strftime("%Y-%m-%d")
    data.index.name = "date"
    display(data)
    return data

code = 'cn800'
port = 5
side = "long" 
path =  r".\input\ten_factor_vw_{}_week_{}.csv".format(code, port)
# path = r".\input\factor\ten_factor_{}_week_{}_{}.csv".format(code, side, str(port))
data = pd.read_csv(path, index_col = 0)
data = convert_cn(data)

dates = pd.date_range(start=data.index[0], end=data.index[-1], freq="W-FRI")
print("These fridays are not covered: ")
print(dates[~dates.isin(data.index)])
print("In total the file contain {} fridays and there are {} fridays in between the horizon.".format(len(data.index), len(dates)))

save_file = input("save file? (enter Y to save)")
if save_file == "Y":
    data.to_csv(r".\input\factor\ten_factor_vw_{}_week_{}.csv".format(code, port))

Unnamed: 0_level_0,size,idvc,beta,bm,m12,m1,roe,ag,dtvm,ep,market,rf
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000-01-07,0.016482,-0.123263,0.032326,-0.067067,0.075248,-0.079904,-0.026584,-0.041596,-0.046936,-0.091819,0.117723,0.000428
2000-01-14,0.027138,0.062216,-0.062625,0.044838,-0.039855,0.065685,-0.005238,0.029326,0.068971,0.033622,-0.065863,0.000428
2000-01-21,0.017300,-0.007532,0.015516,0.019208,-0.020238,0.017515,-0.032202,-0.027583,0.017806,-0.002595,0.050413,0.000428
2000-01-28,-0.013990,0.022580,-0.018914,0.003145,-0.004991,0.006146,-0.000871,0.017594,-0.002357,0.006293,0.058546,0.000428
2000-02-18,0.037561,0.011885,-0.033715,0.010473,-0.013210,-0.003378,-0.028432,-0.001695,0.056722,-0.015204,0.100604,0.000428
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-28,0.011619,0.034985,-0.044521,0.053048,-0.000609,-0.035987,-0.013667,-0.049498,0.027476,0.040124,-0.048564,0.000286
2022-02-11,-0.007509,-0.014484,0.010010,-0.014124,0.034074,-0.014450,0.013834,0.009568,-0.014409,-0.006862,0.018880,0.000286
2022-02-18,-0.004188,0.000644,-0.004545,-0.011271,0.008732,0.019244,-0.003011,0.034463,-0.008669,-0.011420,0.013508,0.000286
2022-02-25,0.002236,0.006198,-0.034722,0.033037,0.001383,-0.030679,-0.002430,-0.034938,0.006040,0.028333,-0.007349,0.000286


These fridays are not covered: 
DatetimeIndex(['2000-02-04', '2000-02-11', '2000-05-05', '2000-10-06',
               '2001-01-26', '2001-02-02', '2001-10-05', '2002-02-15',
               '2002-02-22', '2002-10-04', '2003-02-07', '2003-05-09',
               '2004-01-23', '2004-05-07', '2005-02-11', '2005-05-06',
               '2005-10-07', '2006-02-03', '2006-05-05', '2006-10-06',
               '2007-02-23', '2007-10-05', '2008-10-03', '2009-01-30',
               '2010-02-19', '2011-10-07', '2012-01-27', '2012-10-05',
               '2013-02-15', '2016-02-12', '2016-10-07', '2017-10-06',
               '2018-10-05', '2019-02-08', '2020-01-31', '2022-02-04'],
              dtype='datetime64[ns]', freq=None)
In total the file contain 1121 fridays and there are 1157 fridays in between the horizon.
save file? (enter Y to save)Y


PermissionError: [Errno 13] Permission denied: '.\\input\\factor\\ten_factor_vw_cn800_week_5.csv'

In [322]:
# converting LONG SHORT data
code = 'cn800'
port = 5
side = "short" # "short" or "long"
path = r".\input\ten_factor_{}_week_{}_{}.csv".format(code, side, str(port))
data = pd.read_csv(path, index_col = 0)
data = convert_cn(data)

dates = pd.date_range(start=data.index[0], end=data.index[-1], freq="W-FRI")
print("These fridays are not covered: ")
print(dates[~dates.isin(data.index)]) # not properly formatted
print("In total the file contain {} fridays and there are {} fridays in between the horizon.".format(len(data.index), len(dates)))

save_file = input("save file? (enter Y to save)")
if save_file == "Y":
    data.to_csv(r".\input\factor\ten_factor_{}_week_{}_{}.csv".format(code, side, port))

Unnamed: 0_level_0,size,idvc,beta,bm,m12,m1,roe,ag,dtvm,ep
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-01-07,0.112261,0.038425,0.104190,0.135855,0.071951,0.073955,0.121173,0.116236,0.073784,0.158770
2000-01-14,-0.012971,-0.003531,0.001691,-0.047747,-0.006743,0.005982,-0.024522,-0.043701,0.011136,-0.049816
2000-01-21,0.073254,0.046921,0.046778,0.043547,0.071642,0.067044,0.075382,0.077328,0.078120,0.064247
2000-01-28,-0.032793,-0.008597,-0.012697,-0.021542,-0.015246,-0.019518,-0.020577,-0.034396,-0.022006,-0.029670
2000-02-18,0.004485,-0.020050,-0.003597,-0.025099,-0.015269,-0.031245,-0.000876,-0.013794,0.019567,-0.014746
...,...,...,...,...,...,...,...,...,...,...
2022-01-28,-0.002070,0.003673,0.002726,-0.037493,-0.024983,-0.035152,-0.003626,0.015332,0.003660,-0.026171
2022-02-11,-0.046003,-0.046714,-0.045891,-0.025480,-0.055017,-0.045640,-0.047781,-0.043615,-0.048201,-0.035256
2022-02-18,-0.016677,-0.006072,-0.012077,-0.006555,-0.015060,-0.009462,-0.016978,-0.033067,-0.020739,-0.001678
2022-02-25,-0.014847,-0.007003,-0.001949,-0.030260,-0.026608,-0.026755,-0.024683,-0.001113,-0.010663,-0.026387


These fridays are not covered: 
DatetimeIndex(['2000-02-04', '2000-02-11', '2000-05-05', '2000-10-06',
               '2001-01-26', '2001-02-02', '2001-10-05', '2002-02-15',
               '2002-02-22', '2002-10-04', '2003-02-07', '2003-05-09',
               '2004-01-23', '2004-05-07', '2005-02-11', '2005-05-06',
               '2005-10-07', '2006-02-03', '2006-05-05', '2006-10-06',
               '2007-02-23', '2007-10-05', '2008-10-03', '2009-01-30',
               '2010-02-19', '2011-10-07', '2012-01-27', '2012-10-05',
               '2013-02-15', '2016-02-12', '2016-10-07', '2017-10-06',
               '2018-10-05', '2019-02-08', '2020-01-31', '2022-02-04'],
              dtype='datetime64[ns]', freq=None)
In total the file contain 1121 fridays and there are 1157 fridays in between the horizon.
save file? (enter Y to save)Y


### Sector return

In [300]:
code = input("input country code (us1500, cn800, hk400): ")
index = input("input index code (dj, csi, hsci): ")
data = pd.read_csv(r'.\input\{}_sectors.csv'.format(index), index_col = 0)
factor = pd.read_csv(r'.\input\factor\ten_factor_vw_{}_week_5.csv'.format(code), index_col = 0)
data.index = pd.to_datetime(data.index, format = "%d/%m/%Y" if "/" in data.index[0] else "%Y-%m-%d")
factor.index = pd.to_datetime(factor.index, format = "%d/%m/%Y" if "/" in factor.index[0] else "%Y-%m-%d")

input country code: (us1500, cn800, hk400)cn800
input index code: (dj, csi, hsci)csi


In [301]:
if code == "cn800":
    weekly_data = data[data.index.isin(factor.index)]
    weekly_data.loc[pd.to_datetime("2005-01-01"),:] = 1000 # china data start from 1000
    weekly_data = weekly_data.sort_index()
else:
    weekly_data = pd.concat([data.iloc[0,:], data.loc[factor.index].T], axis = 1).T
weekly_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekly_data.loc[pd.to_datetime("2005-01-01"),:] = 1000 # china data start from 1000


Unnamed: 0_level_0,SH000928 Index,SH000929 Index,SH000930 Index,SH000931 Index,SH000932 Index,SH000933 Index,SH000934 Index,SH000935 Index,SH000936 Index,SH000937 Index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-01,1000.00,1000.0000,1000.0000,1000.00,1000.0000,1000.0000,1000.0000,1000.00,1000.00,1000.00
2005-01-07,959.35,962.8600,994.0000,1005.60,989.6500,988.1600,998.5500,1020.52,988.08,973.97
2005-01-14,949.64,970.1700,998.2800,1008.25,997.0100,995.2600,1005.2300,1012.66,1002.58,967.68
2005-01-21,932.78,967.2900,979.5800,1007.95,987.3700,979.7500,1014.1000,996.03,1037.80,930.74
2005-01-28,933.94,961.3500,959.8800,974.96,971.9800,961.2700,993.4100,947.12,1009.81,913.65
...,...,...,...,...,...,...,...,...,...,...
2022-01-28,2052.26,5505.2157,5762.7752,6231.35,25787.6494,11943.2820,7333.5848,5152.03,4406.40,2256.44
2022-02-11,2321.88,5889.9735,5561.7101,6323.38,26407.5362,11534.1901,7744.3315,5007.91,4499.98,2421.49
2022-02-18,2321.09,6043.3996,5718.8248,6297.17,26865.7489,12131.6846,7565.0450,5094.77,4543.87,2356.38
2022-02-25,2296.72,6017.3860,5754.3441,6120.81,25836.9830,12296.2033,7269.1804,5166.62,4422.67,2366.82


In [302]:
weekly_ret = weekly_data.pct_change().dropna(how="all")
weekly_ret.index = pd.Series(weekly_ret.index, name = "date")
if "hk" in code:
    weekly_ret = weekly_ret.drop(columns = "HSCICO Index")
display(weekly_ret)
save_file = input("save file?")
if save_file == "Y":
    weekly_ret.to_csv(r".\input\sector\{}_sectors_weekly.csv".format(index))

Unnamed: 0_level_0,SH000928 Index,SH000929 Index,SH000930 Index,SH000931 Index,SH000932 Index,SH000933 Index,SH000934 Index,SH000935 Index,SH000936 Index,SH000937 Index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-07,-0.040650,-0.037140,-0.006000,0.005600,-0.010350,-0.011840,-0.001450,0.020520,-0.011920,-0.026030
2005-01-14,-0.010121,0.007592,0.004306,0.002635,0.007437,0.007185,0.006690,-0.007702,0.014675,-0.006458
2005-01-21,-0.017754,-0.002969,-0.018732,-0.000298,-0.009669,-0.015584,0.008824,-0.016422,0.035129,-0.038174
2005-01-28,0.001244,-0.006141,-0.020111,-0.032730,-0.015587,-0.018862,-0.020402,-0.049105,-0.026971,-0.018362
2005-02-04,0.069801,0.054184,0.025337,0.023437,0.026245,0.015979,0.065260,0.013335,0.020291,0.054124
...,...,...,...,...,...,...,...,...,...,...
2022-01-28,-0.063267,-0.040916,-0.030680,-0.050792,-0.048151,-0.064849,-0.043786,-0.059201,-0.076550,-0.050484
2022-02-11,0.131377,0.069890,-0.034890,0.014769,0.024038,-0.034253,0.056009,-0.027973,0.021237,0.073146
2022-02-18,-0.000340,0.026049,0.028249,-0.004145,0.017352,0.051802,-0.023151,0.017345,0.009753,-0.026888
2022-02-25,-0.010499,-0.004304,0.006211,-0.028006,-0.038293,0.013561,-0.039109,0.014103,-0.026673,0.004431


save file?N


### Archive

In [318]:
def read_factor_weekly_ls(code, format = "%Y-%m-%d"):
    list_short = {'cn800':['size', 'm1','dtvm','idvc'], 'hk300':['size', 'ag', 'dtvm', 'idvc'], 'hk400':[]}
    top = pd.read_csv(r'.\input\ten_factor_{}_week_{}_5.csv'.format(code,"long"), index_col=0)
    bottom = pd.read_csv(r'.\input\ten_factor_{}_week_{}_5.csv'.format(code,"short"), index_col=0)
#     market = pd.read_csv(r".\input\factor\ten_factor_vw_{}_week_5.csv".format(code), usecols =["date", "market"], index_col = 0)
    set_full = set(top.columns)
    set_short = set(list_short[code])
    set_long = set_full - set_short
    long = pd.concat([top[set_long], bottom[set_short]], axis = 1)
    short = pd.concat([top[set_short], bottom[set_long]], axis = 1)    
#     long.index = pd.to_datetime(long.index, format=format)
#     short.index = pd.to_datetime(short.index, format=format)
    return(long, short)

code = "hk300"
long, short = read_factor_weekly_ls(code)

In [319]:
long.to_csv(r'.\input\ten_factor_{}_week_long_5.csv'.format(code))

In [320]:
short.to_csv(r'.\input\ten_factor_{}_week_short_5.csv'.format(code))