In [55]:

import pandas as pd 
import glob
import os 
from typing import List
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots
import datetime


In [56]:
# Utility functions

# Connection settings for the factor database.
# The URL can be overridden through the QUANT_FACTOR_DB_URL environment variable;
# the literal below is only the fallback used when that variable is not set.
# NOTE(review): the fallback still embeds a user name and password in the notebook --
# remove it once every environment that runs this notebook exports the variable.
url = os.environ.get(
    "QUANT_FACTOR_DB_URL",
    "postgresql://quant_factor:quant_factor@192.168.1.156:5432/quant_dev",
)

# Keyword arguments forwarded unchanged to sqlalchemy.create_engine.
kwargs = dict(
    pool_size=1,
    max_overflow=-1,
    isolation_level="AUTOCOMMIT",
    pool_pre_ping=True,
    pool_recycle=600,
    echo=False,
)

def creating_engine():
    """Create a SQLAlchemy engine from the module-level ``url`` and ``kwargs``.

    A progress message is printed before the engine is built.

    Returns:
        The object returned by ``create_engine(url, **kwargs)``.
    """
    print("Creating sqlalchemy engine")
    return create_engine(url, **kwargs)

# Engine built once when this cell runs; the read_* helpers defined below open
# their connections from it via ``engine.connect()``.
engine = creating_engine()

def read_factor_formula_ratios():
    """Load the factor formula metadata table.

    Selects ``name``, ``is_active``, ``smb_positive`` and ``pillar`` from
    ``factor.factor_formula_ratios`` through the module-level ``engine``.

    Returns:
        A DataFrame indexed by ``name``.
    """
    sql = """SELECT name, is_active, smb_positive , pillar FROM factor.factor_formula_ratios;"""
    with engine.connect() as conn:
        print("Reading table")
        formula_frame = pd.read_sql(sql, conn, index_col=['name'])
    return formula_frame

def read_table_all_from_factor_processed_ratio_20_pct():
    """Load every row of ``factor.factor_processed_premium_20_pct``.

    Returns:
        A DataFrame indexed by ``testing_period``, ``field``, ``group``,
        ``weeks_to_expire`` and ``average_days``.
    """
    index_columns = ['testing_period', 'field', 'group', 'weeks_to_expire', 'average_days']
    sql = 'SELECT * FROM factor.factor_processed_premium_20_pct;'
    with engine.connect() as conn:
        print("Reading table all")
        premium_frame = pd.read_sql(sql, conn, index_col=index_columns)
    return premium_frame
    
def read_table_all_from_factor_processed_ratio_20_pct_with_specified_market(market: str = 'USD'):
    """Load the rows of ``factor.factor_processed_premium_20_pct`` for one market.

    The market code is upper-cased and sent to the database as a bound
    parameter instead of being pasted into the SQL text, so a value containing
    quotes can neither break nor alter the statement.

    Args:
        market: Value matched against the ``"group"`` column (case-insensitive
            on the caller's side because it is upper-cased here).

    Returns:
        A DataFrame indexed by ``testing_period``, ``field`` and ``group`` with
        the ``weeks_to_expire``, ``average_days`` and ``updated`` columns removed.

    Raises:
        KeyError: if any of the three dropped columns is missing from the result.
    """
    # Local import: this is the only helper that needs a bound-parameter statement.
    from sqlalchemy import text

    market_code = market.upper()
    query = text('SELECT * FROM factor.factor_processed_premium_20_pct WHERE "group" = :market;')
    with engine.connect() as connection:
        print(f"Reading table all with market = {market_code}")
        df_20_pct_specified_mkt = pd.read_sql(
            query,
            connection,
            params={'market': market_code},
            index_col=['testing_period', 'field', 'group'],
        )
    return df_20_pct_specified_mkt.drop(columns=['weeks_to_expire', 'average_days', 'updated'])

def merge_factor_ratio_with_smb_and_retain_active_factor(df_unprocessed: pd.DataFrame = None):
    """Keep only active factors and attach their ``smb_positive`` / ``pillar`` metadata.

    A factor is retained when its row in the formula table has ``is_active``
    equal to True and a non-null ``smb_positive``.

    Args:
        df_unprocessed: Frame whose index has a ``field`` level and which,
            after ``reset_index``, exposes ``testing_period``, ``field`` and
            ``group`` columns.

    Returns:
        The filtered frame left-joined with the formula metadata
        (``name``, ``smb_positive``, ``pillar``) and re-indexed by
        ``testing_period``, ``field``, ``group`` and ``pillar``.
    """
    formula_table = read_factor_formula_ratios().reset_index()

    # One mask serves both the list of factor names and the metadata to merge.
    usable = formula_table['is_active'].eq(True) & formula_table['smb_positive'].notnull()
    usable_rows = formula_table.loc[usable]
    active_names = usable_rows['name'].tolist()
    metadata = usable_rows[['name', 'smb_positive', 'pillar']]

    keep_rows = df_unprocessed.index.isin(active_names, level='field')
    flattened = df_unprocessed.loc[keep_rows].reset_index()
    merged = flattened.merge(metadata, how='left', left_on=['field'], right_on=['name'])

    return merged.set_index(['testing_period', 'field', 'group', 'pillar'])

#=======================================================================================================================================================


# def data_processing_value_for_each_field_and_trading_data_and_reverse(df_processed: pd.DataFrame = None):
def categorizing_larger_between_smaller(df_reversed: pd.DataFrame = None):
    """Label every row with the bucket of its group's summed ``value``.

    Rows are grouped by ``testing_period``, ``group``, ``field`` and ``pillar``;
    the group's sum decides the label given to each of its rows:
    ``'>0.001'``, ``'<-0.001'`` or ``'[-0.001,0.001]'``.

    Args:
        df_reversed: Frame that exposes ``field``, ``testing_period``, ``group``
            and ``pillar`` after ``reset_index`` and has a ``value`` column.

    Returns:
        A Series named ``value`` holding the labels, indexed by
        ``field``, ``testing_period``, ``group`` and ``pillar``.
    """
    def _bucket_label(values):
        total = values.sum()
        if total > 0.001:
            return '>0.001'
        if total < -0.001:
            return '<-0.001'
        return '[-0.001,0.001]'

    reindexed = df_reversed.reset_index()
    reindexed = reindexed.set_index(['field', 'testing_period', 'group', 'pillar'])
    grouped_values = reindexed.groupby(['testing_period', 'group', 'field', 'pillar'])['value']
    labels = grouped_values.transform(_bucket_label)
    return labels.rename('value')

def reversing_heuristics(df_processed: pd.DataFrame = None):
    """Flip the sign of ``value`` for factors whose ``smb_positive`` is False.

    The input frame is left untouched: the previous implementation added a
    ``sign`` column to it and overwrote its ``value`` column, so calling the
    function twice on the same frame flipped the signs back.

    Args:
        df_processed: Frame with ``value`` and ``smb_positive`` columns.

    Returns:
        A new frame with the sign-adjusted ``value`` and without the
        ``smb_positive`` column (a pre-existing ``sign`` column is dropped as
        well, as before).
    """
    # Only an explicit False reverses the sign; True and missing values keep it.
    sign = np.where(df_processed['smb_positive'] == False, -1, 1)
    df_reversed = df_processed.assign(value=df_processed['value'] * sign)
    return df_reversed.drop(columns=['sign', 'smb_positive'], errors='ignore')




Creating sqlalchemy engine


In [57]:
# Market comparison functions
def market_comparison_data_processing(df_all: pd.DataFrame = None):
    """Prepare the bucket labels used by the market comparison charts.

    Runs, in order: the active-factor merge, the sign reversal and the
    bucket categorisation.

    Args:
        df_all: Raw premium frame passed to the merge step.

    Returns:
        The output of ``categorizing_larger_between_smaller``.
    """
    active_with_metadata = merge_factor_ratio_with_smb_and_retain_active_factor(df_all)
    sign_adjusted = reversing_heuristics(df_processed=active_with_metadata)
    return categorizing_larger_between_smaller(sign_adjusted)


def for_each_pillar_compare_different_markets(df_grouped: pd.DataFrame = None):
    """Write one stacked-bar HTML page per pillar and period, comparing markets.

    For every pillar and each of three periods (before 2008, 2008-2019, 2019 to
    today) a grid of subplots is produced, one subplot per field. Each subplot
    stacks, per market, the percentage of observations falling in the
    ``'<-0.001'``, ``'[-0.001,0.001]'`` and ``'>0.001'`` buckets.

    Changes compared with the previous version:
      * subplots now wrap after 10 columns, matching the 10-column grid (the
        old wrap after 9 shifted every title from the 10th field onwards);
      * all three buckets are padded with 0 for markets that have no
        observation, so the bars always line up with the market labels;
      * the grid grows beyond 3 rows when a pillar has more than 30 fields;
      * the output directory is created when missing;
      * the per-period shares are computed once instead of once per pillar.

    Args:
        df_grouped: Bucket labels named ``value`` and indexed by ``field``,
            ``testing_period``, ``group`` and ``pillar`` (the index order
            produced by ``categorizing_larger_between_smaller``).

    Returns:
        None. Files are written to ``./market_comparison/``.
    """
    n_cols = 10
    markets = ['CNY', 'EUR', 'HKD', 'USD']
    index_cols = ['field', 'testing_period', 'group', 'pillar']
    buckets = [('<-0.001', 'red'), ('[-0.001,0.001]', 'blue'), ('>0.001', 'yellow')]

    # Level 3 of the index is ``pillar``.
    pillars = sorted(df_grouped.index.levels[3])
    if not pillars:
        return

    start_2008 = datetime.date(2008, 1, 1)
    start_2019 = datetime.date(2019, 1, 1)
    today = datetime.datetime.today().date()
    # (exclusive lower bound or None, exclusive upper bound, title suffix)
    # NOTE(review): both bounds are strict, so rows dated exactly 2008-01-01 or
    # 2019-01-01 belong to no period -- confirm that this is intended.
    periods = [
        (None, start_2008, 'before 2008'),
        (start_2008, start_2019, 'between 2008 and 2019'),
        (start_2019, today, 'after 2019'),
    ]

    # The shares do not depend on the pillar, so compute them once per period.
    flat = df_grouped.reset_index()
    shares_by_period = []
    for lower, upper, label in periods:
        in_period = flat['testing_period'] < upper
        if lower is not None:
            in_period = in_period & (flat['testing_period'] > lower)
        df_period = flat.loc[in_period].set_index(index_cols)
        # normalize=True gives the share of each bucket within (field, group, pillar)
        df_percent = df_period.groupby(['field', 'group', 'pillar']).value_counts(normalize=True).rename('value')
        shares_by_period.append((upper, label, df_percent))

    os.makedirs('./market_comparison', exist_ok=True)

    for pillar in pillars:
        for upper, label, df_percent in shares_by_period:
            in_pillar = df_percent.loc[df_percent.index.isin([pillar], level='pillar')]
            fields_in_pillar = sorted(in_pillar.index.get_level_values('field').drop_duplicates())

            n_rows = max(3, -(-len(fields_in_pillar) // n_cols))  # ceil division, at least 3 rows
            fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=fields_in_pillar)

            for position, field in enumerate(fields_in_pillar):
                # Row-major placement, the same order make_subplots assigns titles in.
                row, col = divmod(position, n_cols)
                of_field = in_pillar.loc[in_pillar.index.isin([field], level='field')]

                for bucket, color in buckets:
                    shares = of_field.loc[of_field.index.isin([bucket], level='value')] * 100
                    # Align on the market axis; a market without this bucket gets 0.
                    shares = shares.droplevel(['field', 'pillar', 'value']).reindex(markets, fill_value=0)
                    fig.add_trace(
                        go.Bar(
                            x=markets,
                            y=shares.tolist(),
                            name=bucket,
                            showlegend=(position == 0),  # one legend entry per bucket
                            legendgroup=bucket,
                            marker_color=color,
                        ),
                        row=row + 1,
                        col=col + 1,
                    )

            fig.update_layout(barmode='stack')
            fig.update_layout(height=2000, width=2100, title_text=f"Factor premium for different markets for pillar {pillar.upper()} "+label)
            # fig.show()
            fig.write_html(f"./market_comparison/market_comparison_{pillar}_{upper}.html")

    


In [58]:
# Periods comparison functions
def period_comparison_data_processing(df_all: pd.DataFrame = None):
    """Prepare the bucket labels used by the period comparison charts.

    Applies the active-factor merge, then the sign reversal, then the
    bucket categorisation.

    Args:
        df_all: Raw premium frame passed to the merge step.

    Returns:
        The output of ``categorizing_larger_between_smaller``.
    """
    merged = merge_factor_ratio_with_smb_and_retain_active_factor(df_all)
    return categorizing_larger_between_smaller(reversing_heuristics(df_processed=merged))


def for_each_pillar_compare_different_periods(df_grouped: pd.DataFrame = None):
    """Write one stacked-bar HTML page per pillar and market, comparing periods.

    For every pillar and market a grid of subplots is produced, one subplot per
    field. Each subplot stacks, for the three periods (before 2008, 2008-2019,
    2019 to today), the share of observations in the ``'<-0.001'``,
    ``'[-0.001,0.001]'`` and ``'>0.001'`` buckets.

    Changes compared with the previous version:
      * a bucket that never occurs in a period now contributes 0 for all three
        buckets (previously only the middle bucket was guarded and the other
        two raised on ``series[0]``);
      * no positional ``series[0]`` lookups on a labelled index;
      * the grid grows beyond 3 rows when there are more than 30 fields;
      * the output directory is created when missing.

    Args:
        df_grouped: Bucket labels named ``value`` and indexed by ``field``,
            ``testing_period``, ``group`` and ``pillar`` (the index order
            produced by ``categorizing_larger_between_smaller``).

    Returns:
        None. Files are written to ``./period_comparison/``.
    """
    n_cols = 10
    buckets = [('<-0.001', 'red'), ('[-0.001,0.001]', 'blue'), ('>0.001', 'yellow')]
    period_labels = ['Before 2008', 'Between 2008 and 2019', 'After 2019']

    start_2008 = datetime.date(2008, 1, 1)
    start_2019 = datetime.date(2019, 1, 1)
    today = datetime.datetime.today().date()
    # (exclusive lower bound or None, exclusive upper bound), same order as period_labels
    # NOTE(review): both bounds are strict, so rows dated exactly 2008-01-01 or
    # 2019-01-01 belong to no period -- confirm that this is intended.
    bounds = [(None, start_2008), (start_2008, start_2019), (start_2019, today)]

    # Index levels: 2 is ``group`` (market), 3 is ``pillar``.
    pillars = sorted(df_grouped.index.levels[3])
    markets = sorted(df_grouped.index.levels[2])

    os.makedirs('./period_comparison', exist_ok=True)

    for pillar in pillars:
        df_pillar = df_grouped.loc[df_grouped.index.isin([pillar], level='pillar')]
        for market in markets:
            flat = df_pillar.loc[df_pillar.index.isin([market], level='group')].reset_index()
            fields_in_pillar = sorted(flat['field'].drop_duplicates().tolist())

            n_rows = max(3, -(-len(fields_in_pillar) // n_cols))  # ceil division, at least 3 rows
            fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=fields_in_pillar)

            for position, field in enumerate(fields_in_pillar):
                row, col = divmod(position, n_cols)
                of_field = flat.loc[flat['field'] == field]

                # Share of each bucket within every period for this field.
                shares_per_period = []
                for lower, upper in bounds:
                    in_period = of_field['testing_period'] < upper
                    if lower is not None:
                        in_period = in_period & (of_field['testing_period'] > lower)
                    shares_per_period.append(of_field.loc[in_period, 'value'].value_counts(normalize=True))

                for bucket, color in buckets:
                    # .get(..., 0): a bucket absent from a period has a share of 0
                    heights = [float(shares.get(bucket, 0)) for shares in shares_per_period]
                    fig.add_trace(
                        go.Bar(
                            x=period_labels,
                            y=heights,
                            name=bucket,
                            showlegend=(position == 0),  # one legend entry per bucket
                            legendgroup=bucket,
                            marker_color=color,
                        ),
                        row=row + 1,
                        col=col + 1,
                    )

            fig.update_layout(barmode='stack')
            fig.update_layout(height=2000, width=2100, title_text=f"Factor premium for different year periods for pillar {pillar.upper()} for market {market.upper()}")
            # fig.show()
            fig.write_html(f"./period_comparison/periods_comparison_{pillar}_{market}.html")



In [125]:
# Graph plotting functions

def plot_histogram(data: pd.DataFrame = None, bin: int = 20, title: str = None, x_title: str = None, mean: pd.DataFrame = None, median: pd.DataFrame =None):
    """Write a per-market probability histogram with mean/median marker lines.

    The histogram is coloured by the ``group`` column. For each of the markets
    HKD, USD, CNY and EUR a dashed vertical line is added at the market's mean
    (green) and median (red); the legend entry carries the numeric value.

    Changes compared with the previous version:
      * the marker lines are drawn on a hidden overlay axis fixed to [0, 1], so
        they always span the full plot height. Previously their y coordinates
        were the min/max of ``data['value']`` -- an x-axis quantity plotted on
        the probability axis;
      * a market absent from ``mean`` / ``median`` is skipped instead of
        raising ``IndexError``;
      * the output directory is created when missing.

    Args:
        data: Frame with a ``group`` column and the column named by ``x_title``.
        bin: Number of bins passed to ``px.histogram`` as ``nbins``.
        title: Figure title, also used as the output file name.
        x_title: Name of the column plotted on the x axis.
        mean: Frame indexed by ``group``; the first column holds the mean.
        median: Frame indexed by ``group``; the first column holds the median.

    Returns:
        None. The figure is written to ``./normal_distribution/{title}.html``.
    """
    fig = px.histogram(data, nbins=bin, title=title, x=x_title, histnorm='probability', color='group')

    # Same trace order as before: all means first, then all medians.
    markets = ['HKD', 'USD', 'CNY', 'EUR']
    for label, stats, color in (('mean', mean, 'green'), ('median', median, 'red')):
        for market in markets:
            rows = stats.loc[stats.index.isin([market], level='group')]
            if rows.empty:
                continue
            level = rows.values[0][0]
            fig.add_trace(go.Scatter(x=[level, level],
                                     y=[0, 1],
                                     yaxis='y2',
                                     mode='lines',
                                     line=dict(color=color, width=1, dash='dash'),
                                     name=f'{market}_{label}_{level}'))

    # Hidden secondary axis: keeps the marker lines from rescaling the histogram.
    fig.update_layout(yaxis2=dict(overlaying='y', range=[0, 1], visible=False, fixedrange=True))

    os.makedirs('./normal_distribution', exist_ok=True)
    fig.write_html(f'./normal_distribution/{title}.html')

In [60]:
# Data preparation: load each market separately, then stack them into one frame.
df_usd, df_hkd, df_cny, df_eur = (
    read_table_all_from_factor_processed_ratio_20_pct_with_specified_market(market_code)
    for market_code in ('USD', 'HKD', 'CNY', 'EUR')
)
df_all = pd.concat([df_usd, df_hkd, df_cny, df_eur])

Reading table all with market = USD
Reading table all with market = HKD
Reading table all with market = CNY
Reading table all with market = EUR


In [61]:
# Period comparison
# Build the bucket labels from df_all, then write one HTML page per pillar and
# market under ./period_comparison/.
df_reversed_and_categorized_period = period_comparison_data_processing(df_all)
for_each_pillar_compare_different_periods(df_reversed_and_categorized_period)

Reading table


In [62]:

# Market comparison
# Build the bucket labels from df_all, then write one HTML page per pillar and
# period under ./market_comparison/.
df_reversed_and_categorized_market = market_comparison_data_processing(df_all)
for_each_pillar_compare_different_markets(df_reversed_and_categorized_market)


Reading table


In [136]:
# Plotting histogram distribution
def plot_histogram_for_different_markets_between(field:str = None, start_date: datetime.date = datetime.date(2000,1,1), end_date: datetime.date = datetime.date(2023,1,1)):
    """Plot the per-market histogram of one field's sign-adjusted values.

    Reads the module-level ``df_all``, keeps active factors, applies the sign
    reversal, and restricts the rows to ``field`` with ``testing_period``
    strictly between ``start_date`` and ``end_date``. The per-market mean and
    median are computed and everything is handed to ``plot_histogram`` with
    400 bins.

    Args:
        field: Factor name to plot.
        start_date: Exclusive lower bound on ``testing_period``.
        end_date: Exclusive upper bound on ``testing_period``.

    Returns:
        None.
    """
    merged = merge_factor_ratio_with_smb_and_retain_active_factor(df_all)
    flat = reversing_heuristics(merged).reset_index()

    matches_field = flat['field'] == field
    after_start = flat['testing_period'] > start_date
    before_end = flat['testing_period'] < end_date
    df = flat.loc[matches_field & after_start & before_end][['value', 'group']]

    by_group = df.set_index('group').groupby('group')
    mean = by_group.mean()
    median = by_group.median()

    chart_title = f'Histogram_for_{field}_between_{start_date}_and_{end_date}'
    plot_histogram(df, title=chart_title, bin=400, x_title='value', mean=mean, median=median)


In [141]:
# Histogram of the 'ebitda_to_ev' factor for testing periods strictly between
# 2019-01-01 and 2023-01-01; the page is written under ./normal_distribution/.
plot_histogram_for_different_markets_between('ebitda_to_ev', start_date=datetime.date(2019,1,1), end_date=datetime.date(2023,1,1))

Reading table
