In [None]:
%pylab inline
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')
# List the style sheets available in this matplotlib installation.
print(plt.style.available)

In [None]:
import os

import numpy as np
import pandas as pd
import sqlalchemy

import alphamind.data.neutralize as ne
import alphamind.data.standardize as st
import alphamind.data.winsorize as ws
import alphamind.portfolio.rankbuilder as rb

In [None]:
%%time
# Run configuration: risk-factor table, benchmark index column and the factor under study.
risk_factor_table = 'risk_factor_500'
benchmark = 'zz500'
factor = 'ROEAfterNonRecurring' # 'DROEAfterNonRecurring'

# The SQLAlchemy connection URL (which embeds the database password) is taken from the
# environment so that no credential is stored in the notebook.  Example:
#   export MULTIFACTOR_DB_URL='mysql+mysqldb://<user>:<password>@localhost:3306/multifactor?charset=utf8'
db_url = os.environ.get('MULTIFACTOR_DB_URL')
if not db_url:
    raise RuntimeError('Set the MULTIFACTOR_DB_URL environment variable to the SQLAlchemy '
                       'connection URL of the multifactor database.')
conn = sqlalchemy.create_engine(db_url)

# Join factor values, daily returns and the risk-factor table on (Date, Code); the query
# also appends a constant `Market` column of ones.  Only the notebook constants above are
# interpolated into the SQL text (table/column identifiers cannot be bound as parameters).
query = ('select factor_data.{0},  trade_data.Return as dailyReturn, {1}.*, 1 as Market '
         'from factor_data, trade_data, {1} '
         'where factor_data.Date = {1}.Date and factor_data.Code = {1}.Code '
         'and factor_data.Date = trade_data.Date and factor_data.Code = trade_data.Code;'
         ).format(factor, risk_factor_table)
df = pd.read_sql(query, conn)

In [None]:
# Remove the `Bank` and `NonBankFinancial` columns so they are not part of the columns
# selected from `df` afterwards (presumably industry dummies of the risk table — confirm).
# `errors='ignore'` keeps the cell re-runnable: a second execution is a no-op instead of
# raising KeyError the way `del df['Bank']` does once the column is gone.
df = df.drop(['Bank', 'NonBankFinancial'], axis=1, errors='ignore')

In [None]:
# Every column from the fifth one onwards is used as a risk-factor column.
# NOTE(review): the selection is positional — it presumably relies on the first four
# columns being the factor, dailyReturn, Date and Code; confirm against the risk table.
risk_facto_cols = df.columns[4:]
# Display the selected column names.
risk_facto_cols

In [None]:
# Next-day return of each stock: shift `dailyReturn` one row back within every `Code`.
# The SQL join that produced `df` has no ORDER BY, so row order is not guaranteed; sort by
# (Code, Date) before shifting.  The shifted series keeps the original row labels, so the
# assignment aligns it back onto `df` without changing the frame's row order.
df['d1ret'] = (df.sort_values(['Code', 'Date'])
                 .groupby('Code')['dailyReturn']
                 .shift(-1))

In [None]:
# Load the benchmark index column together with its dates.  The query carries no ORDER BY,
# while the return calculation that follows uses row-wise `shift`, so put the rows in
# ascending date order explicitly and rebuild a clean 0..n-1 index.
benchmark_data = (pd.read_sql('select {0}, Date from index_data'.format(benchmark), conn)
                    .sort_values('Date')
                    .reset_index(drop=True))

In [None]:
# Simple return of the benchmark versus the previous row ...
benchmark_data['ret'] = benchmark_data[benchmark].div(benchmark_data[benchmark].shift(1)) - 1.
# ... and the same return pulled one row forward (the following row's return).
benchmark_data['d1ret_b'] = benchmark_data['ret'].shift(-1)

In [None]:
# Attach the benchmark's forward return to every stock row of the same date
# (inner join: rows whose date is missing on either side are dropped).
df = df.merge(benchmark_data[['Date', 'd1ret_b']], how='inner', on='Date')

In [None]:
# Keep only rows without any missing value, then show the remaining (rows, columns).
df = df.dropna()
df.shape

# Factor Data Preprocessing (Winsorize -> Standardize -> Neutralize)
-----------------------------------------------------------------------

In [None]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
total_data = df.copy(deep=True)

In [None]:
# Raw factor values as an (n, 1) column vector.  `reshape` returns a new array object
# instead of assigning `.shape` in place on the array handed out by `.values`, which
# could alter the array backing the DataFrame column and is discouraged by NumPy.
y = total_data[factor].values.reshape(-1, 1)
# Integer date key of every row; used as the grouping key for the per-date operations.
groups = total_data.Date.values.astype(int)

In [None]:
%%time
# Pre-processing pipeline, one step per line, each applied per date group:
# winsorize the raw factor, standardize it, then neutralize it against the risk columns.
factor_winsorized = ws.winsorize_normal(y, groups=groups)
factor_standardized = st.standardize(factor_winsorized, groups=groups)
risk_values = total_data[risk_facto_cols].values
total_data['res'] = ne.neutralize(risk_values, factor_standardized, groups)

In [None]:
# Peek at the last rows of the risk-factor columns.
total_data.loc[:, risk_facto_cols].tail()

In [None]:
# Raw factor next to its processed value (`res`) for the last rows.
total_data.loc[:, [factor, 'res', 'Date', 'Code']].tail()

# Factor Performance (Long-Short)
------------------------------------------------------------------------------------

In [None]:
%%time
# Long-short weights: scale `res` within every date so that the absolute weights of a
# date sum to one.  `transform('sum')` broadcasts each date's sum of |res| back onto the
# original rows, so the division is a plain index-aligned operation and does not depend
# on how `groupby.apply` labels a like-indexed result (that differs across pandas versions).
abs_sum_by_date = total_data.res.abs().groupby(groups).transform('sum')
total_data['pos'] = total_data.res / abs_sum_by_date

In [None]:
# Inspect the weights next to the returns they will be applied to.
total_data.loc[:, ['pos', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()

In [None]:
# Daily portfolio return: weight times (stock forward return - benchmark forward return),
# summed over all rows of each date.
ret_series = (total_data.pos
              .mul(total_data.d1ret.sub(total_data.d1ret_b))
              .groupby(total_data.Date)
              .sum())
# Turn the YYYYMMDD date keys into a proper datetime index for plotting.
ret_series.index = pd.to_datetime(ret_series.index, format='%Y%m%d')

In [None]:
# Cumulative (summed) daily return over the whole sample.
ret_series.cumsum().plot(figsize=(14, 7))

In [None]:
# Same cumulative curve, zoomed in on the last 20 observations.
ret_series.cumsum().iloc[-20:].plot(figsize=(14, 7))

In [None]:
# Sanity check: sum of absolute weights per date, first few dates.
total_data.pos.abs().groupby(groups).sum().head()

# Factor Performance (Long Only - Top 100 equal weighted)
------------------------------------------------------------------------------------

In [None]:
%%time
# Number of names passed to the rank builder; also the divisor that turns its output
# into weights.
use_rank = 100
# NOTE(review): presumably `rank_build` flags the top `use_rank` names per date — confirm.
ranked = rb.rank_build(total_data['res'].values, use_rank, groups)
total_data['pos'] = ranked / use_rank

In [None]:
# Inspect the long-only weights next to the returns they will be applied to.
total_data.loc[:, ['pos', 'res', 'dailyReturn', 'd1ret', 'd1ret_b', 'Code', 'Date']].tail()

In [None]:
# Daily portfolio return: weight times (stock forward return - benchmark forward return),
# summed per date key.
ret_series = (total_data.pos
              .mul(total_data.d1ret.sub(total_data.d1ret_b))
              .groupby(groups)
              .sum())
# Turn the YYYYMMDD date keys into a proper datetime index for plotting.
ret_series.index = pd.to_datetime(ret_series.index, format='%Y%m%d')

In [None]:
# Cumulative (summed) daily return of the long-only portfolio over the whole sample.
ret_series.cumsum().plot(figsize=(14, 7))

In [None]:
# Same cumulative curve, zoomed in on the last 20 observations.
ret_series.cumsum().iloc[-20:].plot(figsize=(14, 7))

In [None]:
# Sanity check: total weight per date, first few dates.
total_data['pos'].groupby(groups).sum().head()

# Risk Exposure
-------------------------------------

In [None]:
# Per-row active return (stock forward return minus benchmark forward return),
# reshaped in place into an (n, 1) column vector.
excess_return = (total_data.d1ret - total_data.d1ret_b).values
excess_return.shape = -1, 1
# Position weights, likewise reshaped in place into an (n, 1) column vector.
# NOTE(review): the shape is assigned on the array returned by `.values`; depending on the
# pandas version that may be the very array backing `total_data.pos`.  Confirm nothing
# depends on that aliasing before switching to a non-mutating `reshape`.
pos_series = total_data.pos.values
pos_series.shape = -1, 1

In [None]:
# Return contribution of every row: weight * active return, shape (n, 1).
# Use the (n, 1) `pos_series` vector prepared alongside `excess_return`.  Re-reading
# `total_data.pos.values` only yields an (n, 1) array if pandas hands back the exact array
# object that was reshaped in place; a 1-D (n,) array multiplied by the (n, 1)
# `excess_return` would instead broadcast to an (n, n) matrix.
to_explain = pos_series * excess_return
# Risk-factor values per row, each scaled by that row's active return (shape (n, k)).
depends_pos = total_data[risk_facto_cols].values
depends = depends_pos * excess_return

In [None]:
%%time
# Regress the row contributions on the scaled risk columns per date group; besides the
# residual, request the per-factor exposures and explained parts as additional outputs.
idiosyncratic, other_stats = ne.neutralize(
    depends,
    to_explain,
    groups,
    output_exposure=True,
    output_explained=True,
)

In [None]:
# Pull the two extra outputs of the neutralization out of the stats mapping.
exposure = other_stats['exposure']
systemetic = other_stats['explained']

In [None]:
# Put the residual column and the explained part of every risk column (first slice of the
# last axis) side by side.  `np.hstack` is referenced through the explicit NumPy import; a
# bare `hstack` only exists when `%pylab` has injected NumPy's names into the namespace.
analyis_table = np.hstack((idiosyncratic, systemetic[:, :, 0]))

In [None]:
# Column labels: the residual first, followed by every risk column in order.
cols = ['idiosyncratic'] + list(risk_facto_cols)
# Wrap the stacked array in a frame indexed by each row's date key.
analyis_table = pd.DataFrame(data=analyis_table, index=groups, columns=cols)

In [None]:
# Sum the row-level contributions of each date (the index holds the date key).
aggregated_bars = analyis_table.groupby(level=0).sum()
# Replace the YYYYMMDD keys by a datetime index.
bar_dates = pd.to_datetime(aggregated_bars.index, format='%Y%m%d')
aggregated_bars.index = bar_dates

In [None]:
# Whole-sample contribution of every source (column).
total_by_source = aggregated_bars.sum()
# The ten sources with the largest absolute total contribution.
top_sources = total_by_source.abs().sort_values(ascending=False).index[:10]
# Bar chart of all sources, largest signed contribution first.
total_by_source.sort_values(ascending=False).plot(kind='bar', figsize=(16, 8))

In [None]:
# Cumulative contribution over time of the top sources.
aggregated_bars.loc[:, top_sources].cumsum().plot(figsize=(14, 7))

In [None]:
# Exposure of every risk column per row (first slice of the last axis), reduced to one
# row per date by taking the first row of each date group.
exposure_table = (pd.DataFrame(exposure[:, :, 0], index=groups, columns=risk_facto_cols)
                    .groupby(level=0)
                    .first())
# Replace the YYYYMMDD keys by a datetime index.
exposure_table.index = pd.to_datetime(exposure_table.index, format='%Y%m%d')

In [None]:
# Plot the exposure history of the top sources; 'idiosyncratic' is excluded because the
# exposure table only has the risk columns.
exposure_cols = top_sources.difference(['idiosyncratic'])
exposure_table[exposure_cols].plot(figsize=(14, 7))
# Legend laid out in columns above the centre of the axes.
plt.legend(loc='upper center', ncol=len(top_sources[1:]) // 3)

# Clean up
-----------------------

In [None]:
# Drop the references to the two large frames so their memory can be reclaimed.
del df, total_data

In [None]:
# Force a garbage-collection pass now that the large frames have been deleted.
import gc
gc.collect()