In [None]:
%pylab inline
import pandas as pd
import sqlalchemy
import numpy as np

In [None]:
%%time
# Every input table is stored under its own key in a single HDF5 file; keep the
# file name in one place so it can be re-pointed without editing eleven lines.
HDF_PATH = 'multifactor.hdf'

# Data tables.
factor_data = pd.read_hdf(HDF_PATH, 'factor_data')
common_500 = pd.read_hdf(HDF_PATH, 'common_500')
index_components = pd.read_hdf(HDF_PATH, 'index_components')
index_data = pd.read_hdf(HDF_PATH, 'index_data')
prod_500 = pd.read_hdf(HDF_PATH, 'prod_500')
risk_factor_500 = pd.read_hdf(HDF_PATH, 'risk_factor_500')
return_data_500 = pd.read_hdf(HDF_PATH, 'return_data_500')

# Column-name lists; later cells use them to select columns of the tables above.
prod_factor_cols = pd.read_hdf(HDF_PATH, 'prod_factor_cols')
risk_factor_cols = pd.read_hdf(HDF_PATH, 'risk_factor_cols')
common_factor_cols = pd.read_hdf(HDF_PATH, 'common_factor_cols')
common_500_factor_cols = pd.read_hdf(HDF_PATH, 'common_500_factor_cols')

# Data Merging
---------------------

In [None]:
# Column of `index_components` holding each stock's benchmark weight; later cells
# divide it by 100 before subtracting it from portfolio weights.
index_components_name = '500Weight'
# Column of `index_data` holding the benchmark series used to compute `ret`.
benchmark = 'zz500'

In [None]:
# Expose 'D1LogReturn' under the shorter name 'dret' that downstream cells use.
# NOTE(review): the column name suggests a log return, yet later cells multiply it
# by weights and sum across stocks as if it were a simple return — confirm.
return_data_500['dret'] = return_data_500['D1LogReturn']

In [None]:
# Period-over-period simple return of the benchmark series.
benchmark_level = index_data[benchmark]
index_data['ret'] = benchmark_level / benchmark_level.shift(1) - 1.
# `dret_b` at row t is `ret` taken from row t+2 (a two-row lead).
index_data['dret_b'] = index_data['ret'].shift(-2)

In [None]:
%%time
# Build one long table keyed by (Date, Code) holding factors, the benchmark
# return, benchmark weights, risk factors and the stock return.
merge_keys = ['Date', 'Code']

# `Series.append` was removed in pandas 2.0; plain list concatenation selects the
# same columns and works on every pandas version.
prod_cols = list(prod_factor_cols) + merge_keys
common_500_cols = list(common_500_factor_cols) + merge_keys

total_data = pd.merge(factor_data, prod_500[prod_cols], on=merge_keys)
total_data = pd.merge(total_data, common_500[common_500_cols], on=merge_keys)
total_data = pd.merge(total_data, index_data[['Date', 'dret_b']], on='Date', how='left')

# Order matters: rows with any missing value are dropped BEFORE the benchmark
# weights are attached, so the only NaNs left for `fillna(0)` are in the weight
# column (stocks without a weight row get weight 0).
total_data = total_data.dropna()
total_data = pd.merge(total_data,
                      index_components[merge_keys + [index_components_name]],
                      on=merge_keys,
                      how='left')
total_data = total_data.fillna(0)

total_data = pd.merge(total_data, risk_factor_500, on=merge_keys)
total_data = pd.merge(total_data, return_data_500[merge_keys + ['dret']], on=merge_keys)

In [None]:
total_data = total_data[total_data[index_components_name] != 0]

In [None]:
len(total_data)

# Data Processing
---------------------------

In [None]:
from alphamind.data.standardize import standardize
from alphamind.data.neutralize import neutralize
from alphamind.data.winsorize import winsorize_normal

In [None]:
# Full list of factor columns, in the order common -> product -> common_500.
# `pd.concat` replaces the chained `Series.append` calls (removed in pandas 2.0)
# and yields the same Series, original index labels included.
total_factors = pd.concat([common_factor_cols, prod_factor_cols, common_500_factor_cols])

#risk_factor_cols = risk_factor_cols[risk_factor_cols != 'Size']

all_factors = total_data[total_factors]
risk_factors = total_data[risk_factor_cols]
# One integer label per row so per-date operations can group rows by date.
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
groups = total_data.Date.values.astype(int)

In [None]:
%%time
factor_processed = neutralize(risk_factors.values,
                              winsorize_normal(all_factors.values, groups=groups),
                              groups=groups)

In [None]:
normed_factor = pd.DataFrame(factor_processed, columns=total_factors, index=total_data.Date)

# Factor Performance (Long top)
---------------------------------

In [None]:
from alphamind.portfolio.rankbuilder import rank_build

In [None]:
%%time
use_rank = 100
pos_data = rank_build(normed_factor.values, use_rank, groups)
pos_df = pd.DataFrame(pos_data, columns=normed_factor.columns, index=normed_factor.index) / use_rank

In [None]:
ret_mat = (pos_df.values - total_data[[index_components_name]].values / 100.) * total_data[['dret']].values
ret_df = pd.DataFrame(ret_mat, columns=normed_factor.columns, index=normed_factor.index)

In [None]:
top_factors = ret_df.groupby(level=0).sum()[-90:].sum().abs().sort_values(ascending=False)[:10].index

In [None]:
ret_df[top_factors].groupby(level=0).sum()[-180:].cumsum().plot(figsize=(16, 8))

In [None]:
ret_df[top_factors].groupby(level=0).sum()[-90:].cumsum().plot(figsize=(16, 8))

In [None]:
ret_df[top_factors].groupby(level=0).sum()[-60:].cumsum().plot(figsize=(16, 8))

In [None]:
ret_df[top_factors].groupby(level=0).sum().cumsum().plot(figsize=(16, 8))

In [None]:
ret_df[prod_factor_cols].groupby(level=0).sum().cumsum().plot(figsize=(16, 8))

In [None]:
ret_df[prod_factor_cols].groupby(level=0).sum()[-90:].cumsum().plot(figsize=(16, 8))

In [None]:
pos_corr = pos_df.corr()

In [None]:
pos_corr = pos_corr.loc[total_factors, total_factors]

In [None]:
turn_over_table = {}
pos_df['Code'] = total_data.Code.values
pos_df.reset_index(inplace=True)

for name in total_factors:
    pos_series = pos_df[['Date', 'Code', name]]
    pivot_position = pos_series.pivot_table(name, index='Date', columns='Code').fillna(0.)
    turn_over_series = pivot_position.diff().abs().sum(axis=1)
    turn_over_table[name] = turn_over_series.values
    
turn_over_table = pd.DataFrame(turn_over_table, index=pos_df.Date.unique())
turn_over_table = turn_over_table[total_factors]
turn_over_table

In [None]:
from pandas import ExcelWriter

In [None]:
# Per-date factor returns, computed before the file is opened so a failure here
# cannot leave a half-written workbook behind.
ret_series = ret_df.groupby(level=0).sum().dropna()

# The context manager saves and closes the workbook even if a sheet write
# raises. `sheet_name` is passed by keyword because positional use is deprecated
# in recent pandas.
with ExcelWriter('Summary_500_long_top_tmp.xlsx') as writer:
    ret_series.to_excel(writer, sheet_name='ret_series')
    pos_corr.to_excel(writer, sheet_name='pos_corr')
    turn_over_table.to_excel(writer, sheet_name='turn_over')

# Factor Performance (risk neutral)
---------------------------------

In [None]:
from alphamind.portfolio.linearbuilder import linear_build

In [None]:
# Overwrite the raw factor columns of `total_data` with the processed values
# from `normed_factor` (assigned positionally via `.values`, so row order must
# match). After this cell the raw factor values are no longer in `total_data`.
total_data[total_factors] = normed_factor.values
total_data

In [None]:
# Risk-exposure band passed to the optimiser: +/- 0.01 for every risk factor.
# It is the same for every factor, so it is built once rather than per iteration.
lbound_exposure = -1e-2 * np.ones(len(risk_factor_cols))
ubound_exposure = 1e-2 * np.ones(len(risk_factor_cols))


def get_benchmark_match_pos(x, name):
    """Build one date's portfolio for factor `name` with `linear_build`.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single date; must contain column `name`, the benchmark
        weight column (`index_components_name`) and all `risk_factor_cols`.
    name : str
        Factor column used as the expected-return vector.

    Returns
    -------
    pandas.Series
        The weights returned by `linear_build` when its status is 'optimal';
        otherwise equal weights `1 / len(x)` as a fallback.
    """
    er = x[name].values
    # Benchmark weights are stored multiplied by 100 (presumably percent).
    bm = x[index_components_name].values / 100.
    # Per-stock weight bounds: no shorting, at most benchmark weight + 0.01.
    lbound = 0.
    ubound = 0.01 + bm
    risk_exposure = x[risk_factor_cols].values

    status, _, ret = linear_build(er,
                                  lbound=lbound,
                                  ubound=ubound,
                                  risk_exposure=risk_exposure,
                                  bm=bm,
                                  risk_target=(lbound_exposure, ubound_exposure),
                                  solver='GLPK')

    if status != 'optimal':
        # No optimal solution for this date: fall back to equal weights.
        return pd.Series(np.ones(len(er)) / len(er))
    return pd.Series(ret)


marke_netural_pos = {}
for i, name in enumerate(total_factors):
    # `Series.append` was removed in pandas 2.0; build the column list directly.
    look_into = list(risk_factor_cols) + [index_components_name, 'Date', name]
    daily_groups = total_data[look_into].groupby('Date')
    # Concatenate the per-date weight vectors in date order. Unlike
    # `groupby.apply(...).values`, this stays one-dimensional even when every
    # date has the same number of rows (apply would then return a wide frame).
    res = np.concatenate([get_benchmark_match_pos(day, name).values for _, day in daily_groups])
    marke_netural_pos[name] = res
    print('{0}: Factor {1} is finished'.format(i, name))

In [None]:
pos_df = pd.DataFrame(marke_netural_pos, index=total_data.Date)

In [None]:
ret_mat = (pos_df.values - total_data[[index_components_name]].values / 100.) * total_data[['dret']].values
ret_df = pd.DataFrame(ret_mat, columns=pos_df.columns, index=normed_factor.index)

In [None]:
ret_df[prod_factor_cols].groupby(level=0).sum().cumsum().plot(figsize=(16, 8))

In [None]:
ret_df[prod_factor_cols].groupby(level=0).sum()[-90:].cumsum().plot(figsize=(16, 8))

In [None]:
pos_corr = pos_df.corr()
pos_corr.loc[prod_factor_cols.tolist(), prod_factor_cols.tolist()]

In [None]:
turn_over_table = {}
pos_df['Code'] = total_data.Code.values
pos_df.reset_index(inplace=True)

for name in total_factors:
    pos_series = pos_df[['Date', 'Code', name]]
    pivot_position = pos_series.pivot_table(name, index='Date', columns='Code').fillna(0.)
    turn_over_series = pivot_position.diff().abs().sum(axis=1)
    turn_over_table[name] = turn_over_series.values
    
turn_over_table = pd.DataFrame(turn_over_table, index=pos_df.Date.unique())
turn_over_table = turn_over_table[total_factors]
turn_over_table

In [None]:
pos_corr = pos_corr.loc[total_factors, total_factors]

writer = ExcelWriter('Summary_500_risk_neutral_tmp.xlsx')
ret_series = ret_df.groupby(level=0).sum().dropna()
ret_series.to_excel(writer, 'ret_series')
pos_corr.to_excel(writer, 'pos_corr')
turn_over_table.to_excel(writer, 'turn_over')
writer.close()

In [None]:
turn_over_table

# Raw Product Factor 
-----------------------------------------

In [None]:
risk_factor_values = total_data[risk_factor_cols].values
index_components_values = total_data[[index_components_name]].values / 100.

In [None]:
for i, name in enumerate(total_factors):
    pos_values = pos_df[[name]].values
    risk_values = (pos_values - index_components_values) * risk_factor_values
    risk_tables = pd.DataFrame(risk_values, columns=risk_factor_cols, index=total_data.Date)
    aggregated_risk = risk_tables.groupby(level=0).sum()
    print('{0}: Factor {1}, {2}, {3}'.format(i, name, aggregated_risk.min(),aggregated_risk.max()))

In [None]:
aggregated_risk.max()