In [None]:
%pylab inline
import os

import numpy as np
import pandas as pd
import sqlalchemy

In [None]:
# The connection URL is read from the environment so that credentials are never
# stored in the notebook. Expected form:
#   mysql+mysqldb://<user>:<password>@<host>/multifactor?charset=utf8
# NOTE(review): the password previously committed in this cell should be rotated.
engine = sqlalchemy.create_engine(os.environ['MULTIFACTOR_DB_URL'])

In [None]:
# --- Factor universe ---------------------------------------------------------
common_factors = ['EPSAfterNonRecurring', 'DivP']
common_500 = []
prod_factor = ['CFinc1', 'BDTO', 'RVOL', 'CHV']
total_factors = [*common_factors, *common_500, *prod_factor]

# Six raw constants (same count as total_factors); the weights are their
# reciprocals rescaled so that they sum to 1.
factor_weights = 1. / np.array([15.44, 32.72, 49.90, 115.27, 97.76, 280.89])
factor_weights /= factor_weights.sum()

# --- Risk-model columns read from the risk_factor_500 table ------------------
risk_factor_500 = [
    'CommunicationsAndTransportation', 'LeisureServices', 'MultiMedia',
    'PublicUtility', 'Agriculture', 'ChemicalIndustry',
    'MedicationAndBio', 'CommercialTrade', 'DefenseIndustry',
    'HouseholdAppliances', 'ConstructionAndMaterial', 'BuildingDecoration',
    'RealEstate', 'DiversifiedMetal', 'Machinary',
    'MotorVehicle', 'ElectronicIndustry', 'ElectricalEquip',
    'TextileAndGarment', 'Synthetics', 'Computer',
    'LightManufacturing', 'Telecoms', 'ExtractiveIndustry',
    'Metal', 'FoodAndBeverage',
    'Size',
]

# --- Column names for the benchmark weight and the return series -------------
index_components = '500Weight'
return_data = 'D1LogReturn'

In [None]:
# Common factors plus the 申万一级行业 (Shenwan level-1 industry) column, one row per Date/Code.
commo_factors_df = pd.read_sql(
    sql='select Date, Code, 申万一级行业, {0} from factor_data'.format(','.join(common_factors)),
    con=engine,
)

In [None]:
# Production factors listed in `prod_factor`, read from the prod_500 table.
prod_factors_df = pd.read_sql(
    sql='select Date, Code, {0} from prod_500'.format(','.join(prod_factor)),
    con=engine,
)

In [None]:
# Build the whole column list in one join so that an empty `common_500`
# (its current value) still yields valid SQL ("select Date, Code from ...")
# instead of a dangling comma before "from".
common_500_df = pd.read_sql(
    'select {0} from common_500'.format(', '.join(['Date', 'Code'] + common_500)),
    engine,
)

In [None]:
# Risk-model exposures (the columns named in `risk_factor_500`).
risk_factor_df = pd.read_sql(
    sql='select Date, Code, {0} from risk_factor_500'.format(','.join(risk_factor_500)),
    con=engine,
)

In [None]:
# Benchmark weight column (`index_components`) per Date/Code.
index_components_df = pd.read_sql(
    sql='select Date, Code, {0} from index_components'.format(index_components),
    con=engine,
)

In [None]:
# Return column (`return_data`) per Date/Code.
return_df = pd.read_sql(
    sql='select Date, Code, {0} from return_500'.format(return_data),
    con=engine,
)

# Merge Data
-------------------------

In [None]:
%%time
#total_data = pd.merge(commo_factors_df, common_500_df, on=['Date', 'Code'])
total_data = pd.merge(commo_factors_df, prod_factors_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, risk_factor_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, index_components_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, return_df, on=['Date', 'Code'])

In [None]:
# Keep only rows whose `index_components` value is non-zero.
# .copy() makes the result an independent frame: later cells add columns to
# total_data, which on a boolean-mask slice triggers SettingWithCopyWarning.
total_data = total_data[total_data[index_components] != 0].copy()

In [None]:
# Row count after the benchmark filter.
total_data.shape[0]

# Process Data
--------------------------------

In [None]:
from alphamind.data.standardize import standardize
from alphamind.data.neutralize import neutralize
from alphamind.data.winsorize import winsorize_normal

In [None]:
# Add a constant 'Market' column and treat it as one more risk factor.
risk_factors_names = risk_factor_500 + ['Market']
total_data['Market'] = 1.

all_factors = total_data[total_factors]
risk_factors = total_data[risk_factors_names]
# Integer group id per row, derived from Date. `np.int` was removed in
# NumPy 1.24 (it was only an alias of the builtin int); int64 is explicit
# and does not depend on the platform's C long width.
groups = total_data.Date.values.astype(np.int64)

In [None]:
%%time
factor_processed = neutralize(risk_factors.values,
                              standardize(winsorize_normal(all_factors.values, groups=groups),
                                          groups=groups),
                              groups=groups)

In [None]:
# Wrap the processed factor matrix in a frame indexed by Date.
normed_factor = pd.DataFrame(data=factor_processed, index=total_data.Date, columns=total_factors)

In [None]:
# Peek at the last five processed rows.
normed_factor.tail(5)

In [None]:
# Weighted combination of every factor column except the last one,
# using the matching leading entries of factor_weights.
res = normed_factor.iloc[:, :-1].dot(factor_weights[:-1])

In [None]:
# Assign by position, not by label: `res` and `normed_factor` are indexed by
# Date (with repeated labels) while total_data keeps its own row index, so
# label alignment would raise on the duplicate index or fill NaN.
total_data['res'] = np.asarray(res)
total_data[total_factors] = normed_factor.values

In [None]:
# Peek at the last five rows, now including the processed factors and 'res'.
total_data.tail(5)

# Factor Performance (Long Top)
---------------------------------

In [None]:
from alphamind.portfolio.rankbuilder import rank_build

In [None]:
# Every single factor plus the combined 'res' signal.
factor_names = [*total_factors, 'res']

In [None]:
%%time
use_rank = 100
pos_data = rank_build(total_data[factor_names].values, use_rank, groups)
pos_df = pd.DataFrame(pos_data, columns=factor_names, index=normed_factor.index) / use_rank

In [None]:
# Per-row contribution: (position - benchmark weight / 100) * return.
# * Only the factor columns of pos_df are used, so the cell stays re-runnable
#   after later cells add Code / Ind / bm columns to pos_df.
# * The return column comes from the `return_data` setting instead of a
#   second hard-coded copy of its name.
ret_mat = (pos_df[factor_names].values - total_data[[index_components]].values / 100.) * total_data[[return_data]].values
ret_df = pd.DataFrame(ret_mat, columns=factor_names, index=normed_factor.index)

In [None]:
# Daily totals per factor (sum over all rows sharing a Date); last five dates.
ret_df.groupby(level=0).sum().tail(5)

In [None]:
# Cumulative daily 'res' contribution over the last 500 dates.
ret_df['res'].groupby(level=0).sum().iloc[-500:].cumsum().plot(figsize=(16, 8))

In [None]:
# Attach identifiers and the benchmark weight (scaled by 1/100) by position.
pos_df['Code'] = total_data['Code'].values
pos_df['Ind'] = total_data['申万一级行业'].values
pos_df['bm'] = total_data[index_components].div(100.).values

In [None]:
# Export positions dated 2017-01-01 or later.
pos_df.loc['2017-01-01':].to_csv(path_or_buf='aggregated_pos.csv')

In [None]:
# Daily turnover per factor portfolio: sum over codes of |weight_t - weight_{t-1}|.
pos_df['Code'] = total_data.Code.values
# Move Date from the index into a column exactly once; the guard keeps this
# cell re-runnable (an unconditional in-place reset would stack an extra
# 'index' column on every re-run).
if 'Date' not in pos_df.columns:
    pos_df = pos_df.reset_index()

turn_over_table = {}
for name in total_factors + ['res']:
    pos_series = pos_df[['Date', 'Code', name]]
    # Date x Code weight matrix; a code absent on a date counts as weight 0.
    pivot_position = pos_series.pivot_table(name, index='Date', columns='Code').fillna(0.)
    # Keep the Series with the pivot's own (sorted) Date index rather than bare
    # values, so every turnover figure stays attached to its date even when the
    # rows of pos_df are not in date order. The first date has no predecessor;
    # its all-NaN diff row sums to 0.
    turn_over_table[name] = pivot_position.diff().abs().sum(axis=1)

turn_over_table = pd.DataFrame(turn_over_table)[total_factors + ['res']]

In [None]:
# Daily return net of cost: turnover is charged at 0.0015 per unit traded.
ret_tc_long_top_df = ret_df.groupby(level=0).sum().sub(turn_over_table.mul(0.0015))

In [None]:
# Cumulative net return of every factor over the last 30 dates.
ret_tc_long_top_df[total_factors + ['res']].iloc[-30:].cumsum().plot(figsize=(16, 8))

In [None]:
# Same 30-date window, combined 'res' signal only.
ret_tc_long_top_df['res'].iloc[-30:].cumsum().plot(figsize=(16, 8))

# Factor Performance (risk neutral)
---------------------------------

In [None]:
import alphamind.portfolio.linearbuilder as lb
import importlib
# Re-execute the already-imported module so that edits to its source are
# picked up without restarting the kernel.
importlib.reload(lb)

In [None]:
marke_netural_pos = {}

# Bounds handed to linear_build as `risk_target`.
# NOTE(review): presumably the allowed deviation of each risk exposure from
# the benchmark's exposure - confirm against alphamind.
lbound_exposure = -0.01
ubound_exposure = 0.01


def get_benchmark_match_pos(x, name):
    """Solve one date's portfolio for factor `name`.

    Parameters
    ----------
    x : pd.DataFrame
        Rows of a single date; must contain `name`, the `index_components`
        column and every column in `risk_factors_names`.
    name : str
        Column used as the expected-return vector.

    Returns
    -------
    pd.Series
        One weight per row of `x`, in row order. If the solver status is not
        'optimal', equal weights (1 / number of rows) are returned instead and
        the status is printed.
    """
    er = x[name].values
    bm = x[index_components].values / 100.
    lbound = 0.
    ubound = 0.01 + bm  # per-name cap: benchmark weight plus 0.01
    risk_exposure = x[risk_factors_names].values

    status, value, ret = lb.linear_build(er,
                                         lbound=lbound,
                                         ubound=ubound,
                                         risk_exposure=risk_exposure,
                                         bm=bm,
                                         risk_target=(lbound_exposure, ubound_exposure),
                                         solver=None)

    if status != 'optimal':
        # Report only failures; printing every status floods the cell output.
        print('{0}: solver status {1!r}, falling back to equal weights'.format(name, status))
        return pd.Series(np.ones(len(er)) / len(er))
    return pd.Series(ret)


# {date: positional row numbers in total_data}. Writing each date's solution
# back to exactly these rows keeps positions aligned with total_data even when
# its rows are not sorted by Date (flattening groupby.apply output silently
# assumes that they are).
date_rows = total_data.groupby('Date').indices

for i, name in enumerate(total_factors + ['res']):
    look_into = risk_factors_names + [index_components, 'Date', name]
    factor_frame = total_data[look_into]

    # NaN marks rows that belong to no date group (e.g. a missing Date).
    positions = np.full(len(total_data), np.nan)
    for rows in date_rows.values():
        positions[rows] = get_benchmark_match_pos(factor_frame.iloc[rows], name=name).values

    # A dedicated name avoids overwriting the global `res` factor series.
    marke_netural_pos[name] = positions
    print('{0}: Factor {1} is finished'.format(i, name))

In [None]:
# One column of weights per factor, indexed by Date.
pos_df = pd.DataFrame(data=marke_netural_pos, index=total_data.Date)

In [None]:
# Per-row contribution: (position - benchmark weight / 100) * return.
# * Factor columns are selected explicitly, so the cell stays re-runnable after
#   Code / Ind / bm are added to pos_df and values always match their labels.
# * The return column comes from the `return_data` setting instead of a
#   second hard-coded copy of its name.
ret_mat = (pos_df[total_factors + ['res']].values - total_data[[index_components]].values / 100.) * total_data[[return_data]].values
ret_df = pd.DataFrame(ret_mat, columns=total_factors + ['res'], index=normed_factor.index)

In [None]:
# Cumulative daily contribution of every factor over the last 500 dates.
ret_df[total_factors + ['res']].groupby(level=0).sum().iloc[-500:].cumsum().plot(figsize=(16, 8))

In [None]:
# Daily totals per factor; last five dates.
ret_df[total_factors + ['res']].groupby(level=0).sum().tail(5)

In [None]:
# Attach identifiers and the benchmark weight (scaled by 1/100) by position,
# then export positions dated 2017-01-01 or later.
pos_df['Code'] = total_data['Code'].values
pos_df['Ind'] = total_data['申万一级行业'].values
pos_df['bm'] = total_data[index_components].div(100.).values
pos_df.loc['2017-01-01':].to_csv(path_or_buf='aggregated_pos_risk_neutral.csv')

In [None]:
# Cumulative daily 'res' contribution over the whole sample.
daily_res = ret_df['res'].groupby(level=0).sum()
daily_res.cumsum().plot(figsize=(16, 8))

In [None]:
# Same curve restricted to the last 500 dates.
ret_df['res'].groupby(level=0).sum().iloc[-500:].cumsum().plot(figsize=(16, 8))

In [None]:
# Correlation of positions from 2017-01-01 onward. Only numeric columns are
# passed to corr(): pos_df also carries Code / Ind, and DataFrame.corr() in
# pandas >= 2.0 no longer drops non-numeric columns silently.
pos_df.loc['2017-01-01':].select_dtypes(include=[np.number]).corr()

In [None]:
# Full-sample correlation of positions. Only numeric columns are passed to
# corr(): pos_df also carries Code / Ind, and DataFrame.corr() in
# pandas >= 2.0 no longer drops non-numeric columns silently.
pos_df.select_dtypes(include=[np.number]).corr()

In [None]:
# Daily turnover per risk-neutral portfolio: sum over codes of |weight_t - weight_{t-1}|.
pos_df['Code'] = total_data.Code.values
# Move Date from the index into a column exactly once; the guard keeps this
# cell re-runnable (an unconditional in-place reset would stack an extra
# 'index' column on every re-run).
if 'Date' not in pos_df.columns:
    pos_df = pos_df.reset_index()

turn_over_table = {}
for name in total_factors + ['res']:
    pos_series = pos_df[['Date', 'Code', name]]
    # Date x Code weight matrix; a code absent on a date counts as weight 0.
    pivot_position = pos_series.pivot_table(name, index='Date', columns='Code').fillna(0.)
    # Keep the Series with the pivot's own (sorted) Date index rather than bare
    # values, so every turnover figure stays attached to its date even when the
    # rows of pos_df are not in date order. The first date has no predecessor;
    # its all-NaN diff row sums to 0.
    turn_over_table[name] = pivot_position.diff().abs().sum(axis=1)

turn_over_table = pd.DataFrame(turn_over_table)[total_factors + ['res']]

In [None]:
# Daily return net of cost: turnover is charged at 0.0015 per unit traded.
ret_tc_risk_neutral_df = ret_df.groupby(level=0).sum().sub(turn_over_table.mul(0.0015))

In [None]:
# Cumulative net return of every factor over the whole sample.
net_by_factor = ret_tc_risk_neutral_df[total_factors + ['res']]
net_by_factor.cumsum().plot(figsize=(16, 8))

In [None]:
# Cumulative net return of the combined 'res' signal, last 500 dates.
ret_tc_risk_neutral_df['res'].iloc[-500:].cumsum().plot(figsize=(16, 8))

# Comparison
--------------------

In [None]:
# Standard deviation of daily net returns, risk-neutral build, last 500 dates.
ret_tc_risk_neutral_df.iloc[-500:][total_factors + ['res']].std()

In [None]:
# Standard deviation of daily net returns, long-top build, last 500 dates.
ret_tc_long_top_df.iloc[-500:][total_factors + ['res']].std()

In [None]:
# Mean daily net return, risk-neutral build, last 500 dates.
ret_tc_risk_neutral_df.iloc[-500:][total_factors + ['res']].mean()

In [None]:
# Mean daily net return, long-top build, last 500 dates.
ret_tc_long_top_df.iloc[-500:][total_factors + ['res']].mean()

In [None]:
# Last 50 dates of net (after-cost) risk-neutral returns.
ret_tc_risk_neutral_df.iloc[-50:]

In [None]:
# Last 50 dates of gross (before-cost) daily returns, for comparison.
ret_df.groupby(level=0).sum().iloc[-50:]