In [None]:
%pylab inline
import pandas as pd
import sqlalchemy
import numpy as np

In [None]:
# All inputs are stored in a single HDF5 file; each object is read by its key.
HDF_PATH = 'multifactor.hdf'

# Data tables that are merged on Date/Code further down.
factor_data = pd.read_hdf(HDF_PATH, key='factor_data')
index_components = pd.read_hdf(HDF_PATH, key='index_components')
index_data = pd.read_hdf(HDF_PATH, key='index_data')
prod_500 = pd.read_hdf(HDF_PATH, key='prod_500')
risk_factor_500 = pd.read_hdf(HDF_PATH, key='risk_factor_500')
trade_data = pd.read_hdf(HDF_PATH, key='trade_data')

# Collections of column names, used later as column selectors.
prod_factor_cols = pd.read_hdf(HDF_PATH, key='prod_factor_cols')
risk_factor_cols = pd.read_hdf(HDF_PATH, key='risk_factor_cols')
common_factor_cols = pd.read_hdf(HDF_PATH, key='common_factor_cols')

# Data Merging
---------------------

In [None]:
# Remove two columns from the risk-factor table and two from the factor table.
# `drop(..., errors='ignore')` returns a new frame and is a no-op when the
# columns are already gone, so this cell can be re-run without the KeyError
# that `del frame[col]` raises on a second execution.
risk_factor_500 = risk_factor_500.drop(columns=['Bank', 'NonBankFinancial'], errors='ignore')
# 申万二级行业 / 申万三级行业: Shenwan level-2 / level-3 industry classification.
factor_data = factor_data.drop(columns=['申万二级行业', '申万三级行业'], errors='ignore')

In [None]:
# Column of `index_components` that is merged into the data set, and the
# column of `index_data` used to compute benchmark returns.
index_components_name = '500Weight'
benchmark = 'zz500'

# Remove the entries labelled 25 and 26 from the risk-factor column list.
# `errors='ignore'` makes the cell re-runnable: the original `del` raised a
# KeyError once the labels were already gone.
# NOTE(review): assumes `risk_factor_cols` is a label-indexed Series and that
# labels 25/26 are the 'Bank' and 'NonBankFinancial' entries removed from
# `risk_factor_500` - confirm against the stored data.
risk_factor_cols = risk_factor_cols.drop([25, 26], errors='ignore')

In [None]:
# Number of rows by which returns are shifted forward (`shift(-decay)`) when
# building the `dret` and `dret_b` columns below.
decay = 2

In [None]:
# Forward return per security: within each Code, take the Return found
# `decay` rows later.
# NOTE(review): relies on rows being ordered by date within each Code - confirm.
returns_by_code = trade_data['Return'].groupby(trade_data['Code'])
trade_data['dret'] = returns_by_code.shift(-decay)

In [None]:
# Simple row-over-row return of the benchmark column.
benchmark_level = index_data[benchmark]
index_data['ret'] = benchmark_level / benchmark_level.shift(1) - 1.

# Benchmark return shifted forward by the same `decay` used for `dret`.
index_data['dret_b'] = index_data['ret'].shift(-decay)

In [None]:
# Assemble one table keyed by (Date, Code).
merge_keys = ['Date', 'Code']

# Product-factor columns plus the merge keys. Built as a plain list because
# `Series.append` is no longer available in pandas 2.x.
prod_cols = [*prod_factor_cols, *merge_keys]

total_data = pd.merge(factor_data, prod_500[prod_cols], on=merge_keys)
total_data = pd.merge(total_data, index_data[['Date', 'dret_b']], on='Date', how='left')

# Discard every row that still has a missing value at this point.
total_data = total_data.dropna()

# Left merge: rows without a matching index-component entry get NaN in the
# weight column, which is then set to 0. After the dropna above, that column
# is the only place NaNs can occur.
total_data = pd.merge(
    total_data,
    index_components[['Date', 'Code', index_components_name]],
    on=merge_keys,
    how='left',
)
total_data = total_data.fillna(0)

# Inner merges: only (Date, Code) pairs present in the risk-factor and trade
# tables are kept.
total_data = pd.merge(total_data, risk_factor_500, on=merge_keys)
total_data = pd.merge(total_data, trade_data[['Date', 'Code', 'dret']], on=merge_keys)

In [None]:
# Keep only the rows whose index-component value is non-zero.
has_index_weight = total_data[index_components_name] != 0
total_data = total_data[has_index_weight]

# Data Processing
---------------------------

In [None]:
from alphamind.data.standardize import standardize
from alphamind.data.neutralize import neutralize
from alphamind.data.winsorize import winsorize_normal

In [None]:
# Full list of factor columns: common factors followed by product factors.
# `pd.concat` replaces `Series.append`, which was removed in pandas 2.0.
total_factors = pd.concat([common_factor_cols, prod_factor_cols])

all_factors = total_data[total_factors]
risk_factors = total_data[risk_factor_cols]

# Integer group label per row, taken from Date, for the grouped operations
# below. The builtin `int` replaces the `np.int` alias removed in NumPy 1.24;
# both select the same dtype.
groups = total_data.Date.values.astype(int)

In [None]:
%%time
# Processing chain, each step applied with the same `groups` labels:
# winsorize -> standardize -> neutralize against the risk factors.
winsorized_factors = winsorize_normal(all_factors.values, groups=groups)
standardized_factors = standardize(winsorized_factors, groups=groups)
factor_processed = neutralize(risk_factors.values, standardized_factors, groups=groups)

In [None]:
# Wrap the processed array in a frame: one column per factor, indexed by Date.
normed_factor = pd.DataFrame(
    data=factor_processed,
    index=total_data.Date,
    columns=total_factors,
)

# Factor Performance (Long top)
---------------------------------

In [None]:
from alphamind.portfolio.rankbuilder import rank_build

In [None]:
%%time
# Rank cut-off passed to `rank_build`, also used to scale its output.
use_rank = 100

# NOTE(review): `rank_build` presumably flags the top `use_rank` names per
# group for each factor - confirm against alphamind.
pos_data = rank_build(normed_factor.values, use_rank, groups)

# Same labels as `normed_factor`, with values divided by `use_rank`.
pos_df = pd.DataFrame(
    pos_data,
    index=normed_factor.index,
    columns=normed_factor.columns,
) / use_rank

In [None]:
# Index-component value scaled by 1/100, kept as a single column so it
# broadcasts across all factor columns.
index_weight = total_data[[index_components_name]].values / 100.
forward_ret = total_data[['dret']].values

# Per-row contribution: (position minus scaled index weight) times forward return.
ret_mat = (pos_df.values - index_weight) * forward_ret
ret_df = pd.DataFrame(ret_mat, index=normed_factor.index, columns=normed_factor.columns)

In [None]:
# Sum contributions per date, then rank factors by the absolute value of
# their total over the last 90 dates and keep the names of the first 10.
daily_factor_ret = ret_df.groupby(level=0).sum()
recent_abs_total = daily_factor_ret[-90:].sum().abs()
top_factors = recent_abs_total.sort_values(ascending=False)[:10].index

In [None]:
# Cumulative per-date return of the selected top factors over the last 180 dates.
# A title is set so the figure can be told apart from the other windows; the
# trailing `;` suppresses the Axes repr.
ret_df[top_factors].groupby(level=0).sum()[-180:].cumsum().plot(
    figsize=(16, 8),
    title='Top factors: cumulative return, last 180 dates',
);

In [None]:
# Cumulative per-date return of the selected top factors over the last 90 dates.
# A title is set so the figure can be told apart from the other windows; the
# trailing `;` suppresses the Axes repr.
ret_df[top_factors].groupby(level=0).sum()[-90:].cumsum().plot(
    figsize=(16, 8),
    title='Top factors: cumulative return, last 90 dates',
);

In [None]:
# Cumulative per-date return of the selected top factors over the last 60 dates.
# A title is set so the figure can be told apart from the other windows; the
# trailing `;` suppresses the Axes repr.
ret_df[top_factors].groupby(level=0).sum()[-60:].cumsum().plot(
    figsize=(16, 8),
    title='Top factors: cumulative return, last 60 dates',
);

In [None]:
# Cumulative per-date return of the selected top factors over the full sample.
# A title is set so the figure can be told apart from the windowed plots; the
# trailing `;` suppresses the Axes repr.
ret_df[top_factors].groupby(level=0).sum().cumsum().plot(
    figsize=(16, 8),
    title='Top factors: cumulative return, full sample',
);

In [None]:
# Cumulative per-date return of the product factors over the full sample.
# A title is set so the figure stands on its own; the trailing `;` suppresses
# the Axes repr.
ret_df[prod_factor_cols].groupby(level=0).sum().cumsum().plot(
    figsize=(16, 8),
    title='Product factors: cumulative return, full sample',
);

In [None]:
# Cumulative per-date return of the product factors over the last 90 dates.
# A title is set so the figure stands on its own; the trailing `;` suppresses
# the Axes repr.
ret_df[prod_factor_cols].groupby(level=0).sum()[-90:].cumsum().plot(
    figsize=(16, 8),
    title='Product factors: cumulative return, last 90 dates',
);

In [None]:
# Pairwise correlation between the factor position columns
# (Pearson is the pandas default, spelled out here).
pos_corr = pos_df.corr(method='pearson')

In [None]:
# Show the correlation sub-matrix restricted to the product factors.
prod_factor_names = prod_factor_cols.tolist()
pos_corr.loc[prod_factor_names, prod_factor_names]

# Save Data
------------------------

In [None]:
from pandas import ExcelWriter

In [None]:
# Per-date factor returns (rows sharing a date summed); rows with NaN dropped.
ret_series = ret_df.groupby(level=0).sum().dropna()

# The context manager closes the workbook even if one of the writes raises;
# the original left the writer open in that case. Sheet names are passed by
# keyword, since positional use is deprecated in newer pandas.
with ExcelWriter('Summary_500.xlsx') as writer:
    ret_series.to_excel(writer, sheet_name='ret_series')
    pos_corr.to_excel(writer, sheet_name='pos_corr')

# Raw Product Factor 
-----------------------------------------

In [None]:
# Same rank-based construction as for the processed factors, but fed with the
# unprocessed product-factor columns of `total_data`.
raw_prod_factors = total_data[prod_factor_cols].values
pos_data = rank_build(raw_prod_factors, use_rank, groups)
pos_df = pd.DataFrame(
    pos_data,
    index=total_data.Date,
    columns=prod_factor_cols,
) / use_rank

In [None]:
# Index-component value scaled by 1/100, kept as a single column so it
# broadcasts across all factor columns.
index_weight = total_data[[index_components_name]].values / 100.
forward_ret = total_data[['dret']].values

# Per-row contribution: (position minus scaled index weight) times forward return.
ret_mat = (pos_df.values - index_weight) * forward_ret
ret_df = pd.DataFrame(ret_mat, index=pos_df.index, columns=pos_df.columns)

In [None]:
# Cumulative per-date return of every column in `ret_df` over the last 90 dates.
# A title is set so the figure stands on its own; the trailing `;` suppresses
# the Axes repr.
ret_df.groupby(level=0).sum()[-90:].cumsum().plot(
    figsize=(16, 8),
    title='Raw product factors: cumulative return, last 90 dates',
);

In [None]:
# Number of rows in the merged data set.
total_data.shape[0]