In [None]:
# NOTE(review): `%pylab inline` star-imports numpy/matplotlib names (e.g. `hstack`) into the
# notebook namespace; prefer explicit `np.` / `plt.` references in new code.
%pylab inline
import matplotlib.pyplot as plt
# Apply the ggplot style to the figures below and list the styles that are installed.
plt.style.use('ggplot')
print(plt.style.available)

In [None]:
import os

import numpy as np
import pandas as pd
import sqlalchemy

import alphamind.data.neutralize as ne
import alphamind.data.standardize as st
import alphamind.data.winsorize as ws
import alphamind.portfolio.percentbuilder as pb
import alphamind.portfolio.rankbuilder as rb

In [None]:
%%time
risk_factor_table = 'risk_factor_500'
benchmark = 'zz500'
factor = 'ROEAfterNonRecurring' # 'DROEAfterNonRecurring'

conn = sqlalchemy.create_engine('mysql+pymysql://sa:we083826@10.63.6.176:3306/multifactor?charset=utf8')
raw_df = pd.read_sql('select factor_data.{0},  factor_data.申万一级行业, trade_data.Return as dailyReturn, {1}.*, 1 as Market '
                     'from factor_data, trade_data, {1} '
                     'where factor_data.Date = {1}.Date and factor_data.Code = {1}.Code '
                     'and factor_data.Date = trade_data.Date and factor_data.Code = trade_data.Code;'.format(factor,
                                                                                                             risk_factor_table),
                     conn)

In [None]:
# Remove the `Bank` and `NonBankFinancial` columns so they are excluded from everything downstream.
raw_df = raw_df.drop(columns=['Bank', 'NonBankFinancial'])

In [None]:
# Every column from the sixth one onward is used as a risk-factor column.
# NOTE(review): this positional slice depends on the column order of `raw_df` — confirm
# that the first five columns are exactly the non-risk columns.
risk_facto_cols = raw_df.columns[5:]
risk_facto_cols

In [None]:
# Work on a copy so that `raw_df` keeps the data exactly as loaded.
df = raw_df.copy()

In [None]:
# Next-day return of each stock: the following row's `dailyReturn` within the same `Code`.
# The query has no ORDER BY, so sort by date before shifting; the assignment re-aligns on
# the index, which leaves the row order of `df` untouched.
df['d1ret'] = df.sort_values('Date').groupby('Code')['dailyReturn'].shift(-1)

In [None]:
# Read the benchmark column (named by `benchmark`) and its dates from `index_data`.
benchmark_data = pd.read_sql('select {0}, Date from index_data'.format(benchmark), conn)

In [None]:
# The query does not specify an order, and the shifts below are only meaningful on
# chronologically ordered rows, so sort by date first.
benchmark_data = benchmark_data.sort_values('Date').reset_index(drop=True)
# Simple daily return of the benchmark ...
benchmark_data['ret'] = benchmark_data[benchmark] / benchmark_data[benchmark].shift(1) - 1.
# ... and the return realised on the following row (next-day benchmark return).
benchmark_data['d1ret_b'] = benchmark_data['ret'].shift(-1)

In [None]:
# Attach the next-day benchmark return to every stock row of the same date
# (inner join: rows whose date is missing on either side are dropped).
df = pd.merge(df, benchmark_data[['Date', 'd1ret_b']], on='Date', how='inner')

In [None]:
# Drop every row that still contains a missing value, then show the remaining shape.
df.dropna(inplace=True)
df.shape

In [None]:
# Encode the industry labels as integer codes, numbered in order of first appearance.
# The original labels are kept so they can be restored for reporting later on.
old_ind_values = df['申万一级行业'].copy()

ind_list = df['申万一级行业'].unique()
ind_dict = {ind: i for i, ind in enumerate(ind_list)}

# Assign the mapped column back explicitly: `replace(..., inplace=True)` on a column
# selection is not guaranteed to write through to `df`.
df['申万一级行业'] = df['申万一级行业'].map(ind_dict)

# Factor Data Preprocessing (Winsorize -> Standardize -> Neutralize)
-----------------------------------------------------------------------

In [None]:
# Independent working copy for the factor analysis; `df` itself is reused further down.
total_data = df.copy()

In [None]:
# Raw factor values as an (n, 1) column. `reshape` returns a new view instead of rewriting
# the shape of the array that backs the DataFrame column.
y = total_data[factor].values.reshape(-1, 1)
# Integer date key used as the group label (`np.int` no longer exists in NumPy; the
# builtin `int` is the drop-in replacement).
groups = total_data.Date.values.astype(int)

In [None]:
%%time
# Pipeline: winsorize the raw factor, standardize the result, then neutralize it against the
# risk-factor columns. Every step receives the same `groups` labels. Stored as column `res`.
total_data['res'] = ne.neutralize(total_data[risk_facto_cols].values,
                                  st.standardize(ws.winsorize_normal(y, groups=groups),
                                                 groups=groups),
                                  groups)

In [None]:
# Compare the raw factor with its processed value on the last few rows.
total_data[[factor, 'res', 'Date', 'Code']].tail()

# Factor Performance (long_short)
------------------------------------------------------------------------------------

In [None]:
%%time
total_data['pos_long_short'] = total_data.res.groupby(groups).apply(lambda x: x / np.abs(x).sum())

In [None]:
# Inspect the long/short weights next to the returns they will be applied to.
total_data[['pos_long_short', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()

In [None]:
# Per-date return of the long/short weights: next-day excess return over the benchmark,
# multiplied by the weight and summed over all rows of the date.
ret_series = (total_data.d1ret.sub(total_data.d1ret_b)
              .mul(total_data.pos_long_short)
              .groupby(total_data.Date)
              .sum())

In [None]:
# Cumulative (summed) return over the whole sample.
ret_series.cumsum().plot(figsize=(14,7))

In [None]:
# Same cumulative curve, restricted to the last 60 dates.
ret_series.cumsum().iloc[-60:].plot(figsize=(14, 7))

In [None]:
# Net (signed) weight per date for the first few dates.
total_data.pos_long_short.groupby(total_data.Date).sum().head()

# Factor Performance (Long Only - Top 100 Equal Weighted)
------------------------------------------------------------------------------------

In [None]:
%%time
# Number of names to hold; the output of `rb.rank_build` is divided by it to get the weights.
use_rank = 100
# NOTE(review): presumably `rank_build` flags the top `use_rank` values of each group with 1,
# which would make the weights equal and sum to one per date — confirm against alphamind.
total_data['pos_100'] = rb.rank_build(total_data.res.values, use_rank, groups) / use_rank

In [None]:
# Inspect the top-100 weights next to the returns they will be applied to.
total_data[['pos_100', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()

In [None]:
# Per-date return of the top-100 weights: next-day excess return over the benchmark,
# multiplied by the weight and summed over all rows of the date.
ret_series = (total_data.d1ret.sub(total_data.d1ret_b)
              .mul(total_data.pos_100)
              .groupby(total_data.Date)
              .sum())

In [None]:
# Cumulative (summed) return over the whole sample.
ret_series.cumsum().plot(figsize=(14,7))

In [None]:
# Same cumulative curve, restricted to the last 60 dates.
ret_series.cumsum().iloc[-60:].plot(figsize=(14, 7))

In [None]:
# Total weight per date for the first few dates.
total_data.pos_100.groupby(total_data.Date).sum().head()

# Factor Performance (Long Only - Top 100 Uniformly Distributed In Each Sector Equal Weighted)
-------------------------------------------------

In [None]:
%%time
factor_data_values = total_data[['Date', 'res', '申万一级行业']]

def get_percent_pos(x):
    res_values = x.res.values
    #percent = 115. / len(res_values)
    ind_values = x['申万一级行业'].values.astype(int)
    final_choosed = pb.percent_build(res_values, 0.1, ind_values)
    return pd.Series(final_choosed / final_choosed.sum())

total_data['pos_100_uind'] = factor_data_values.groupby('Date').apply(get_percent_pos).values

In [None]:
# Inspect the industry-balanced weights next to the returns they will be applied to.
total_data[['pos_100_uind', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()

In [None]:
# Per-date return of the industry-balanced weights: next-day excess return over the
# benchmark, multiplied by the weight and summed over all rows of the date.
ret_series = (total_data.d1ret.sub(total_data.d1ret_b)
              .mul(total_data.pos_100_uind)
              .groupby(total_data.Date)
              .sum())

In [None]:
# Cumulative (summed) return over the whole sample.
ret_series.cumsum().plot(figsize=(14,7))

In [None]:
# Same cumulative curve, restricted to the last 60 dates.
ret_series.cumsum().iloc[-60:].plot(figsize=(14, 7))

In [None]:
# Total weight per date for the first few dates.
total_data.pos_100_uind.groupby(total_data.Date).sum().head()

# Position Comparison
----------------------------

In [None]:
# Take an explicit copy so that the assignment below writes to an independent frame
# instead of a slice of `total_data` (avoids SettingWithCopyWarning / lost writes).
pos_table = total_data[['Date', '申万一级行业', 'pos_long_short', 'pos_100', 'pos_100_uind']].copy()
# Put the original industry labels back in place of the integer codes (positional: relies
# on `total_data` having the same row order as `old_ind_values`).
pos_table['申万一级行业'] = old_ind_values.values

In [None]:
# Sum the three position columns per (Date, industry), then move the industry level back
# into a regular column so that only Date remains in the index.
aggregated_pos_table = pos_table.groupby(['Date', '申万一级行业']).sum()
aggregated_pos_table.reset_index(level=1, inplace=True)

In [None]:
# Plot the aggregated positions over time for the industry labelled '申万化工'.
aggregated_pos_table.loc[aggregated_pos_table['申万一级行业'] == '申万化工',:].plot(figsize=(16,7))

In [None]:
# Correlate the three position columns only: the frame also holds the industry label
# (strings), which `corr()` cannot process on current pandas versions.
aggregated_pos_table[['pos_long_short', 'pos_100', 'pos_100_uind']].corr()

# Risk Exposure (Long Only - Top 100)
-------------------------------------

In [None]:
excess_return = (total_data.d1ret - total_data.d1ret_b).values
excess_return.shape = -1, 1
pos_series = total_data.pos_100.values
pos_series.shape = -1, 1

In [None]:
to_explain = total_data.pos_100.values * excess_return
depends_pos = total_data[risk_facto_cols].values
depends = depends_pos * excess_return

In [None]:
%%time
# Neutralize `to_explain` against `depends` within each `groups` label. With both output
# flags set, the call returns the residual plus a dict that is read below through its
# 'explained' and 'exposure' keys.
idiosyncratic, other_stats = ne.neutralize(depends, to_explain, groups, output_exposure=True, output_explained=True)

In [None]:
# Unpack the two extra outputs of the neutralization.
systemetic = other_stats['explained']
exposure = other_stats['exposure']

In [None]:
# Residual column followed by the explained part of every risk factor (first slice of the
# last axis). `np.hstack` is referenced explicitly: a bare `hstack` only exists when
# `%pylab` has populated the namespace.
analyis_table = np.hstack((idiosyncratic, systemetic[:, :, 0]))

In [None]:
# Column labels: the residual part first, then one column per risk factor.
cols = ['idiosyncratic'] + list(risk_facto_cols)
# Wrap the stacked array in a frame indexed by the date of each row.
analyis_table = pd.DataFrame(analyis_table, index=total_data.Date, columns=cols)

In [None]:
# Sum every column per date, then parse the date index using the YYYYMMDD format.
aggregated_bars = analyis_table.groupby(level=0).sum()
aggregated_bars.index = pd.to_datetime(aggregated_bars.index, format='%Y%m%d')

In [None]:
# The ten columns with the largest absolute total over the sample ...
top_sources = aggregated_bars.sum().abs().sort_values(ascending=False).index[:10]
# ... and a bar chart of the signed totals of all columns, largest first.
aggregated_bars.sum().sort_values(ascending=False).plot(kind='bar', figsize=(16, 8))

In [None]:
# Cumulative contribution over time of the ten largest sources.
aggregated_bars[top_sources].cumsum().plot(figsize=(14, 7))

In [None]:
# One row per observation, one column per risk factor, indexed by date.
exposure_table = pd.DataFrame(exposure[:, :, 0], columns=risk_facto_cols, index=total_data.Date)
# Keep the first row of each date.
# NOTE(review): presumably the exposure is identical for all rows of a date — confirm.
exposure_table = exposure_table.groupby(level=0).first()

In [None]:
# Exposure over time for the top sources; 'idiosyncratic' is removed because it is not a
# column of `exposure_table`.
exposure_table[top_sources.difference(['idiosyncratic'])].plot(figsize=(14,7))
# Legend column count is derived from the number of top sources after the first one.
plt.legend(loc='upper center', ncol=len(top_sources[1:]) // 3)

# Risk Exposure (Long Only - Top 100 Uniformly Distributed)
-------------------------------------

In [None]:
excess_return = (total_data.d1ret - total_data.d1ret_b).values
excess_return.shape = -1, 1
pos_series = total_data.pos_100_uind.values
pos_series.shape = -1, 1

In [None]:
to_explain = total_data.pos_100_uind.values * excess_return
depends_pos = total_data[risk_facto_cols].values
depends = depends_pos * excess_return

In [None]:
%%time
# Neutralize `to_explain` against `depends` within each `groups` label. With both output
# flags set, the call returns the residual plus a dict that is read below through its
# 'explained' and 'exposure' keys.
idiosyncratic, other_stats = ne.neutralize(depends, to_explain, groups, output_exposure=True, output_explained=True)

In [None]:
# Unpack the two extra outputs of the neutralization.
systemetic = other_stats['explained']
exposure = other_stats['exposure']

In [None]:
# Residual column followed by the explained part of every risk factor (first slice of the
# last axis). `np.hstack` is referenced explicitly: a bare `hstack` only exists when
# `%pylab` has populated the namespace.
analyis_table = np.hstack((idiosyncratic, systemetic[:, :, 0]))

In [None]:
# Column labels: the residual part first, then one column per risk factor.
cols = ['idiosyncratic'] + list(risk_facto_cols)
# Wrap the stacked array in a frame indexed by the date of each row.
analyis_table = pd.DataFrame(analyis_table, index=total_data.Date, columns=cols)

In [None]:
# Sum every column per date, then parse the date index using the YYYYMMDD format.
aggregated_bars = analyis_table.groupby(level=0).sum()
aggregated_bars.index = pd.to_datetime(aggregated_bars.index, format='%Y%m%d')

In [None]:
# The ten columns with the largest absolute total over the sample ...
top_sources = aggregated_bars.sum().abs().sort_values(ascending=False).index[:10]
# ... and a bar chart of the signed totals of all columns, largest first.
aggregated_bars.sum().sort_values(ascending=False).plot(kind='bar', figsize=(16, 8))

In [None]:
# Cumulative contribution over time of the ten largest sources.
aggregated_bars[top_sources].cumsum().plot(figsize=(14, 7))

In [None]:
# One row per observation, one column per risk factor, indexed by date.
exposure_table = pd.DataFrame(exposure[:, :, 0], columns=risk_facto_cols, index=total_data.Date)
# Keep the first row of each date.
# NOTE(review): presumably the exposure is identical for all rows of a date — confirm.
exposure_table = exposure_table.groupby(level=0).first()

In [None]:
# Exposure over time for the top sources; 'idiosyncratic' is removed because it is not a
# column of `exposure_table`.
exposure_table[top_sources.difference(['idiosyncratic'])].plot(figsize=(14,7))
# Legend column count is derived from the number of top sources after the first one.
plt.legend(loc='upper center', ncol=len(top_sources[1:]) // 3)

# Risk Exposure for Historical Position
-----------------------------------------------------------------------

In [None]:
# Start again from a fresh copy of `df` for the historical-portfolio analysis (this
# replaces the `total_data` used so far) and record its distinct stock codes and dates.
total_data = df.copy()
unique_code = total_data.Code.unique()
unique_date = total_data.Date.unique()

In [None]:
# Load the historical portfolio from `portfolio.csv` (path relative to the working
# directory) and parse its YYYYMMDD `Date` column into datetimes.
hist_data = pd.read_csv('portfolio.csv')
hist_data.Date = pd.to_datetime(hist_data.Date.astype('str'), format='%Y%m%d')

In [None]:
def func(x):
    """Re-index the holdings of one date onto the full stock universe.

    `x` is the slice of the portfolio frame for a single date. It is indexed by its
    `Code` column and expanded to exactly one row per entry of the notebook-level
    `unique_code`, in that order; codes that do not occur in `x` come back as
    all-NaN rows.
    """
    # `.ix` has been removed from pandas; `reindex` gives the same "missing label -> NaN
    # row" result. `rename_axis` keeps the index named 'Code' so that a later
    # `reset_index()` turns it back into a `Code` column.
    return x.set_index('Code').reindex(unique_code).rename_axis('Code')

# Expand every date with `func`, keep only the weight column and turn the resulting index
# levels back into ordinary columns.
hist_data = hist_data.groupby('Date').apply(func)['Alpha_Trading'].reset_index()
# Rows without a recorded weight get a zero weight.
hist_data['Alpha_Trading'] = hist_data['Alpha_Trading'].fillna(0.)

In [None]:
# Attach the historical weights to the factor data; the left join keeps every row of
# `total_data` and leaves the weight empty where the portfolio has no matching record.
# NOTE(review): `hist_data.Date` was parsed to datetime when it was loaded; the join only
# matches if `total_data.Date` has the same dtype — confirm.
hist_data = pd.merge(total_data, hist_data, on=['Date', 'Code'], how='left')

In [None]:
# Keep only rows dated 2015-01-09 or later and renumber the index from zero.
hist_data = hist_data[hist_data.Date >= '2015-01-09'].reset_index(drop=True)

In [None]:
# Carry the last available weight of each stock forward over the rows that have none.
# `ffill()` replaces `fillna(method='pad')`, whose `method` argument is deprecated in pandas.
hist_data['Alpha_Trading'] = hist_data['Alpha_Trading'].groupby(hist_data.Code).ffill()

In [None]:
# Drop the rows that still contain a missing value (e.g. no weight to carry forward).
hist_data.dropna(inplace=True)

In [None]:
# Next-day excess return over the benchmark, the raw next-day return, and an integer date
# key used as the group label. All three are 1-D arrays with one entry per row.
excess_return = (hist_data.d1ret - hist_data.d1ret_b).values
raw_returns = hist_data.d1ret.values
groups = hist_data.Date.values.astype(int)

In [None]:
# Historical weight times excess return for every row, shaped as an (n, 1) column.
to_explain = (hist_data.Alpha_Trading.values * excess_return).reshape(-1, 1)
# Risk-factor values scaled row by row with the same excess return.
depends_pos = hist_data[risk_facto_cols].values
depends = depends_pos * excess_return[:, np.newaxis]

In [None]:
# Sum the weighted excess returns per date (one-column frame).
ret_series = pd.DataFrame(to_explain).groupby(hist_data.Date).sum()

In [None]:
# Cumulative (summed) return of the historical portfolio.
ret_series.cumsum().plot(figsize=(14, 7))

In [None]:
%%time
# Neutralize `to_explain` against `depends` within each `groups` label. With both output
# flags set, the call returns the residual plus a dict that is read below through its
# 'explained' and 'exposure' keys.
idiosyncratic, other_stats = ne.neutralize(depends, to_explain, groups, output_exposure=True, output_explained=True)

In [None]:
# Unpack the two extra outputs of the neutralization.
systemetic = other_stats['explained']
exposure = other_stats['exposure']

In [None]:
# Residual column followed by the explained part of every risk factor (first slice of the
# last axis). `np.hstack` is referenced explicitly: a bare `hstack` only exists when
# `%pylab` has populated the namespace.
analyis_table = np.hstack((idiosyncratic, systemetic[:, :, 0]))

In [None]:
# Column labels: the residual part first, then one column per risk factor.
cols = ['idiosyncratic'] + list(risk_facto_cols)
# Wrap the stacked array in a frame indexed by the date of each row.
analyis_table = pd.DataFrame(analyis_table, index=hist_data.Date, columns=cols)

In [None]:
# Sum every column per date.
aggregated_bars = analyis_table.groupby(level=0).sum()

In [None]:
# The ten columns with the largest absolute total over the sample ...
top_sources = aggregated_bars.sum().abs().sort_values(ascending=False).index[:10]
# ... and a bar chart of the signed totals of all columns, largest first.
aggregated_bars.sum().sort_values(ascending=False).plot(kind='bar', figsize=(16, 8))

In [None]:
# Cumulative contribution over time of the ten largest sources.
aggregated_bars[top_sources].cumsum().plot(figsize=(14, 7))

In [None]:
# One row per observation, one column per risk factor, indexed by date.
exposure_table = pd.DataFrame(exposure[:, :, 0], columns=risk_facto_cols, index=hist_data.Date)
# Keep the first row of each date.
# NOTE(review): presumably the exposure is identical for all rows of a date — confirm.
exposure_table = exposure_table.groupby(level=0).first()

In [None]:
# Exposure over time for the top sources; 'idiosyncratic' is removed because it is not a
# column of `exposure_table`.
exposure_table[top_sources.difference(['idiosyncratic'])].plot(figsize=(14,7))
# Legend column count is derived from the number of top sources after the first one.
plt.legend(loc='upper center', ncol=len(top_sources[1:]) // 3)

# Clean up
--------------------------