### **This Jupyter notebook analyses factor premiums. Specifically, it plots the distribution of positive and negative factor premiums, median heatmaps, etc., to help decide whether a factor's heuristic should be reversed. Please refer to the Factor model Confluence page for background on the heuristics.**

In [1]:
# Block 1 import
import pandas as pd 
import glob
import os 
from typing import List
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots
import datetime
import numpy as np
from typing import List
import openpyxl
from openpyxl.styles import Color
from openpyxl.formatting.rule import ColorScale, FormatObject, ColorScaleRule
from openpyxl.formatting.rule import Rule
import itertools


#### The utility functions (Block 2) and the data preparation function (Block 3) are required by every other block in this notebook. Please run these two blocks before running any of the others.

In [2]:
# Block 2 Utility functions
# Connection URL for the PostgreSQL database queried by this notebook.
# The QUANT_FACTOR_DB_URL environment variable takes precedence so that credentials
# do not have to live in the notebook; the literal is kept only as a fallback so
# existing setups keep working.
# NOTE(review): the fallback still embeds a username/password -- consider rotating it
# and removing the literal once the environment variable is set everywhere.
url = os.environ.get(
    "QUANT_FACTOR_DB_URL",
    "postgresql://quant_factor:quant_factor@192.168.1.156:5432/quant_dev",
)
# Keyword arguments forwarded unchanged to sqlalchemy.create_engine.
kwargs = dict(pool_size=1, max_overflow=-1, isolation_level="AUTOCOMMIT", pool_pre_ping=True, pool_recycle=600,
                      echo=False)

def creating_engine():
    """Build a SQLAlchemy engine from the ``url`` and ``kwargs`` defined in this cell.

    A short message is printed so it is visible in the cell output when the
    engine gets (re)created.

    Returns:
        engine (sqlalchemy.engine.Engine): engine used to open connections for queries
    """
    print("Creating sqlalchemy engine")
    return create_engine(url, **kwargs)

# Shared engine used by the query helpers in this notebook; created when this cell runs.
engine = creating_engine()

def read_factor_formula_ratios():
    """Load the factor metadata from the ``factor.factor_formula_ratios`` table.

    Only the ``name``, ``is_active``, ``smb_positive`` and ``pillar`` columns are selected.

    Returns:
        df (pd.DataFrame): Dataframe with 'name' as index
    """
    query = "SELECT name, is_active, smb_positive , pillar FROM factor.factor_formula_ratios;"
    with engine.connect() as connection:
        return pd.read_sql(query, connection, index_col=['name'])

def read_table_all_from_factor_processed_premium_20_pct(weeks_to_expire:int = 8):
    """Read all columns of ``factor.factor_processed_premium_20_pct`` for one ``weeks_to_expire`` value, across every market.

    Args:
        weeks_to_expire (int): Weeks to expire for factor premium. Defaults to 8.

    Returns:
        df_20_pct (pd.DataFrame): Dataframe with 'testing_period','field','group','weeks_to_expire','average_days' as index.

    Raises:
        ValueError / TypeError: if ``weeks_to_expire`` cannot be converted to an integer.
    """
    # The filter needs a WHERE clause and real interpolation; int() guarantees that only
    # a number can end up in the SQL text.
    query = f"SELECT * FROM factor.factor_processed_premium_20_pct WHERE weeks_to_expire = {int(weeks_to_expire)};"
    with engine.connect() as connection:
        df_20_pct = pd.read_sql(query,connection, index_col = ['testing_period','field','group','weeks_to_expire','average_days'])
    return df_20_pct
    
def read_table_all_from_factor_processed_premium_20_pct_with_specified_market(market: str = 'USD', weeks_to_expire:int = 8):
    """Read ``factor.factor_processed_premium_20_pct`` for a single market and ``weeks_to_expire`` value.

    The market code is upper-cased before it is used in the query. The
    'weeks_to_expire', 'average_days' and 'updated' columns are dropped from the result.

    Args:
        market (str, optional): Market for factor premiums. Defaults to 'USD'.
        weeks_to_expire (int): Weeks to expire for factor premium. Defaults to 8.

    Returns:
        df_20_pct_specified_mkt (pd.DataFrame): Dataframe with 'testing_period','field','group' as index and the remaining table columns as columns.
    """
    unwanted_columns = ['weeks_to_expire','average_days', 'updated']
    query = f"""SELECT * FROM factor.factor_processed_premium_20_pct WHERE "group"='{market.upper()}' AND weeks_to_expire = {weeks_to_expire};"""
    with engine.connect() as connection:
        print(f"Reading table all with market = {market.upper()}")
        raw = pd.read_sql(query, connection, index_col=['testing_period','field','group'])
        return raw.drop(columns=unwanted_columns)

def merge_factor_premium_with_smb_and_retain_active_factor(df_unprocessed: pd.DataFrame = None):
    """Attach ``smb_positive`` and ``pillar`` to factor premiums and keep only usable factors.

    A factor is kept when it is flagged ``is_active`` and its ``smb_positive`` is not null.
    Steps include:

        1. [Read] factor formula ratios table for merging
        2. [Filter and merge] drop premiums of factors that are not kept, then left-merge the
           'name', 'smb_positive' and 'pillar' columns onto the premiums via 'field' == 'name'
        3. [Multi-indexing] index the merged table by (testing_period, field, group, pillar)

    Args:
        df_unprocessed (pd.DataFrame, optional): Factor premiums whose index contains a 'field'
            level and which provides 'testing_period' and 'group' after ``reset_index``.
            Defaults to None, but a dataframe is required for the function to work.

    Returns:
        df_processed (pd.DataFrame): processed dataframe
    """
    # 1. [Read]
    formula = read_factor_formula_ratios().reset_index()

    # 2. [Filter and merge]
    usable = (formula['is_active'] == True) & formula['smb_positive'].notnull()
    usable_formula = formula.loc[usable]
    active_factor = usable_formula['name'].tolist()
    formula_merge = usable_formula[['name', 'smb_positive', 'pillar']]

    keep_rows = df_unprocessed.index.isin(active_factor, level='field')
    merged = (
        df_unprocessed.loc[keep_rows]
        .reset_index()
        .merge(formula_merge, how='left', left_on=['field'], right_on=['name'])
    )

    # 3. [Multi-indexing]
    return merged.set_index(['testing_period', 'field', 'group', 'pillar'])

def categorizing_larger_between_smaller(df_reversed: pd.DataFrame = None):
    """Label every factor premium as '>0.001', '<-0.001' or '[-0.001,0.001]'.

    The input is re-indexed by (field, testing_period, group, pillar). The 'value' column is
    then summed within each (testing_period, group, field, pillar) group and every row of
    the group receives the label matching that sum.

    Args:
        df_reversed (pd.DataFrame, optional): Dataframe with factor premium reversed. Defaults to None.

    Returns:
        df_categorized: Series named 'value' holding the category label of each row
    """
    def _bucket(values):
        # Strict comparisons: a sum of exactly +/-0.001 falls in the middle bucket.
        total = values.sum()
        if total > 0.001:
            return '>0.001'
        if total < -0.001:
            return '<-0.001'
        return '[-0.001,0.001]'

    reindexed = df_reversed.reset_index().set_index(['field','testing_period','group', 'pillar'])
    values_per_key = reindexed.groupby(['testing_period','group', 'field', 'pillar'])['value']
    return values_per_key.transform(_bucket).rename('value')

def reversing_heuristics(df_processed: pd.DataFrame = None):
    """Flip the sign of factor premiums whose ``smb_positive`` flag is False.

    The input dataframe is left untouched, so re-running a cell that calls this function
    does not flip the signs a second time. Steps include:

        1. [Reverse] the sign of the 'value' column according to the smb_positive column. Specifically, if smb_positive is False, the value is multiplied by -1. (smb_positive is a bad name. This should be translated into 'Should be small minus big?' If False, then we should flip the sign because our model is currently assuming small minus big method.)

    Args:
        df_processed (pd.DataFrame, optional): Dataframe with a 'value' column and a 'smb_positive' column. Defaults to None.

    Returns:
        df_reversed (pd.DataFrame): New dataframe with the signs reversed according to the smb_positive column and the 'smb_positive' column removed
    """
    # 1.[Reverse]
    # `== False` (rather than `~`) is deliberate: a null smb_positive is not equal to
    # False and therefore keeps its sign.
    sign = np.where(df_processed['smb_positive'] == False, -1, 1)
    # drop() returns a new frame, so the assignment below never writes into the caller's data.
    df_reversed = df_processed.drop(columns=['smb_positive'])
    df_reversed['value'] = df_processed['value'] * sign
    return df_reversed




Creating sqlalchemy engine


In [3]:
# Block 3 Data preparation 

def data_preparation(weeks_to_expire:int = 8):
    """Load the processed factor premiums of the USD, HKD, CNY and EUR markets and stack them.

    Args:
        weeks_to_expire (int, optional): Specified weeks to expire. Defaults to 8.

    Returns:
        df_all (pd.DataFrame): Factor processed premiums for all markets, concatenated in the order USD, HKD, CNY, EUR
    """
    per_market = [
        read_table_all_from_factor_processed_premium_20_pct_with_specified_market(market, weeks_to_expire)
        for market in ('USD', 'HKD', 'CNY', 'EUR')
    ]
    return pd.concat(per_market)

#### The two blocks below contain the functions that generate the Plotly graphs used to analyse the share of factor premiums that are >0.001, <-0.001, or in between, across different periods and markets.

In [13]:
# Block 4 Data processing before period and market comparisons
def data_processing_for_comparison(df_all: pd.DataFrame = None):
    """Turn the concatenated raw premiums of all markets into per-row category labels.

    The three processing steps are chained:
    1. [Merge and filter] input dataframe with factor formula ratios and filter out inactive factors
    2. [Reverse] factor premiums according to smb_positive column
    3. [Categorise] factor premium values into three, namely >0.001, <-0.001 and between 0.001 and -0.001

    Args:
        df_all (pd.DataFrame, optional): Raw factor premiums for all markets. Must provide 'testing_period', 'field', 'group' and 'value' for subsequent processing. Defaults to None.

    Returns:
        df_reversed_and_categorized: Result of ``categorizing_larger_between_smaller`` applied to the reversed premiums.
    """
    return categorizing_larger_between_smaller(            # 3. [Categorise]
        reversing_heuristics(                              # 2. [Reverse]
            df_processed=merge_factor_premium_with_smb_and_retain_active_factor(df_all)  # 1. [Merge and filter]
        )
    )



In [14]:
# Block 5 Comparing different periods and different markets for >0.001, <-0.001 and between distribution

def _market_category_percentages(field_rows, category, markets):
    """For each market in ``markets``, return the percentage of that market's rows in ``field_rows`` labelled ``category`` (0.0 when the market has no rows)."""
    percentages = []
    for market in markets:
        labels = field_rows.loc[field_rows['group'] == market, 'value']
        percentages.append(float((labels == category).mean() * 100) if len(labels) else 0.0)
    return percentages


def for_each_pillar_compare_different_markets(df_grouped: pd.DataFrame = None):
    """Function for generating plotly graphs that indicate the percentage of factor premiums that are between 0.001 and -0.001, below -0.001 and larger than 0.001, comparing markets for each pillar and each time period. Steps include:

    1. [Get] pillars to be looped through
    2. [Loop pillar] to get the rows that are within that pillar
    3. [Loop period] to get the rows that are within that period (before 2008, between 2008 and 2019, after 2019)
    4. [Get fields] that are unique within those rows for looping
    5. [Loop field] to get the rows that are within that field
        6. [For each field], plot a subplot with one stacked bar per market (CNY, EUR, HKD, USD) showing the percentage of each of the three categories; a market or category without observations is plotted as 0
        7. [Aggregate] the subplots (one per field, 10 per row) for each time period
        8. [Write] each plot to ./market_comparison/market_comparison_{pillar}_{period end date}.html

    Args:
        df_grouped (pd.DataFrame, optional): Category labels ('>0.001', '<-0.001' or '[-0.001,0.001]') stored as 'value' and indexed by (field, testing_period, group, pillar), in that level order. Defaults to None.
    """
    market_labels = ['CNY', 'EUR', 'HKD', 'USD']
    # (category label, bar colour), in stacking order
    categories = [('<-0.001', 'red'), ('[-0.001,0.001]', 'blue'), ('>0.001', 'yellow')]
    n_cols = 10
    output_dir = './market_comparison'

    # 1. [Get] pillars to be looped through (index level 3 is 'pillar')
    pillars = sorted(df_grouped.index.levels[3])

    # (exclusive lower bound, exclusive upper bound, title text). Both bounds are strict,
    # so observations dated exactly on 2008-01-01 or 2019-01-01 belong to no period.
    periods = [
        (None, datetime.date(2008, 1, 1), 'before 2008'),
        (datetime.date(2008, 1, 1), datetime.date(2019, 1, 1), 'between 2008 and 2019'),
        (datetime.date(2019, 1, 1), datetime.datetime.today().date(), 'after 2019'),
    ]

    # write_html does not create missing folders
    os.makedirs(output_dir, exist_ok=True)
    df_flat = df_grouped.reset_index()

    # 2. [Loop pillar]
    for pillar in pillars:
        pillar_rows = df_flat.loc[df_flat['pillar'] == pillar]

        # 3. [Loop period]
        for lower, upper, text in periods:
            in_period = pillar_rows['testing_period'] < upper
            if lower is not None:
                in_period = in_period & (pillar_rows['testing_period'] > lower)
            period_rows = pillar_rows.loc[in_period]

            # 4. [Get fields]
            fields_in_pillar = sorted(period_rows['field'].unique())

            # Keep the historical 3x10 grid, but grow it when there are more than 30 fields.
            n_rows = max(3, -(-len(fields_in_pillar) // n_cols))
            fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=fields_in_pillar)

            # 5. [Loop field]
            for position, field in enumerate(fields_in_pillar):
                field_rows = period_rows.loc[period_rows['field'] == field]
                # Subplot titles are laid out row by row, so the traces must follow the same order.
                grid_row, grid_col = divmod(position, n_cols)

                # 6. [For each field]
                for category, colour in categories:
                    fig.add_trace(
                        go.Bar(
                            x=market_labels,
                            y=_market_category_percentages(field_rows, category, market_labels),
                            name=category,
                            showlegend=(position == 0),  # one legend entry per category is enough
                            legendgroup=category,
                            marker_color=colour,
                        ),
                        row=grid_row + 1,
                        col=grid_col + 1,
                    )

            # 7. [Aggregate]
            fig.update_layout(barmode='stack', height=2000, width=2100, title_text=f"Factor premium for different markets for pillar {pillar.upper()} "+text)

            # 8. [Write]
            # fig.show()
            fig.write_html(f"{output_dir}/market_comparison_{pillar}_{upper}.html")


def _period_category_shares(field_rows, category, period_bounds):
    """For each (lower, upper) pair in ``period_bounds``, return the fraction of rows of ``field_rows`` in that period labelled ``category`` (0.0 when the period has no rows)."""
    shares = []
    for lower, upper in period_bounds:
        in_period = field_rows['testing_period'] < upper
        if lower is not None:
            in_period = in_period & (field_rows['testing_period'] > lower)
        labels = field_rows.loc[in_period, 'value']
        shares.append(float((labels == category).mean()) if len(labels) else 0.0)
    return shares


def for_each_pillar_compare_different_periods(df_grouped: pd.DataFrame = None):
    """Function for generating plotly graphs that indicate the share of factor premiums that are between 0.001 and -0.001, below -0.001 and larger than 0.001, comparing time periods for each pillar and each market. Steps include:

    1. [Get] pillars and markets to be looped through
    2. [Loop pillar] to get the rows that are within that pillar
    3. [Loop market] to get the rows that are within that market
    For each market we generate one plot with subplots in it by the steps below
        4. [Get fields] that are unique within those rows for looping
        5. [Loop field] to get the rows that are within that field
        6. [For each field]
            a. [Get different periods] (before 2008, between 2008 and 2019, after 2019)
            b. [Calculate share] of the three categories for each period; a period or category without observations is plotted as 0
            c. [Subplot] with one stacked bar per period
        7. [Aggregate] the subplots (one per field, 10 per row)
        8. [Write] each plot to ./period_comparison/periods_comparison_{pillar}_{market}.html

    Args:
        df_grouped (pd.DataFrame, optional): Category labels ('>0.001', '<-0.001' or '[-0.001,0.001]') stored as 'value' and indexed by (field, testing_period, group, pillar), in that level order. Defaults to None.
    """
    period_labels = ['Before 2008', 'Between 2008 and 2019', 'After 2019']
    # (exclusive lower bound, exclusive upper bound) matching period_labels. Both bounds are
    # strict, so observations dated exactly on 2008-01-01 or 2019-01-01 belong to no period.
    period_bounds = [
        (None, datetime.date(2008, 1, 1)),
        (datetime.date(2008, 1, 1), datetime.date(2019, 1, 1)),
        (datetime.date(2019, 1, 1), datetime.datetime.today().date()),
    ]
    # (category label, bar colour), in stacking order
    categories = [('<-0.001', 'red'), ('[-0.001,0.001]', 'blue'), ('>0.001', 'yellow')]
    n_cols = 10
    output_dir = './period_comparison'

    # 1. [Get] (index level 3 is 'pillar', level 2 is 'group')
    pillars = sorted(df_grouped.index.levels[3])
    markets = sorted(df_grouped.index.levels[2])

    # write_html does not create missing folders
    os.makedirs(output_dir, exist_ok=True)
    df_flat = df_grouped.reset_index()

    # 2. [Loop pillar]
    for pillar in pillars:
        pillar_rows = df_flat.loc[df_flat['pillar'] == pillar]

        # 3. [Loop market]
        for market in markets:
            market_rows = pillar_rows.loc[pillar_rows['group'] == market]

            # 4. [Get fields]
            fields_in_pillar = sorted(market_rows['field'].unique())

            # Keep the historical 3x10 grid, but grow it when there are more than 30 fields.
            n_rows = max(3, -(-len(fields_in_pillar) // n_cols))
            fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=fields_in_pillar)

            # 5. [Loop field]
            for position, field in enumerate(fields_in_pillar):
                field_rows = market_rows.loc[market_rows['field'] == field]
                # Subplot titles are laid out row by row, so the traces must follow the same order.
                grid_row, grid_col = divmod(position, n_cols)

                # 6. [For each field]
                # NOTE: the bar heights are fractions in [0, 1] here, not values scaled to 0-100.
                for category, colour in categories:
                    fig.add_trace(
                        go.Bar(
                            x=period_labels,
                            y=_period_category_shares(field_rows, category, period_bounds),
                            name=category,
                            showlegend=(position == 0),  # one legend entry per category is enough
                            legendgroup=category,
                            marker_color=colour,
                        ),
                        row=grid_row + 1,
                        col=grid_col + 1,
                    )

            # 7. [Aggregate]
            fig.update_layout(barmode='stack', height=2000, width=2100, title_text=f"Factor premium for different year periods for pillar {pillar.upper()} for market {market.upper()}")

            # 8. [Write]
            fig.write_html(f"{output_dir}/periods_comparison_{pillar}_{market}.html")


    


#### The two blocks below contain the functions that plot histograms of probability vs. value for the different markets. Block 7 holds the main function, which plots the histogram for a specified field, start_date and end_date; it relies on the utility function in Block 6, so run Block 6 before Block 7.

In [11]:
# Block 6 Utility function for plotting histogram
def plot_histogram(data: pd.DataFrame = None, bin: int = 20, title: str = None, x_title: str = None, mean: pd.DataFrame = None, median: pd.DataFrame =None):
    """This function plots a probability histogram of values, coloured by market ('group'), with dashed lines marking the mean (green) and median (red) of each market, and writes it to ./normal_distribution/{title}.html.

    Marker lines are added for HKD, USD, CNY and EUR, in that order; a market that is
    absent from ``mean`` / ``median`` is skipped.

    Args:
        data (pd.DataFrame, optional): Dataframe with a 'value' column and a 'group' column. Defaults to None.
        bin (int, optional): Passed to plotly as ``nbins`` for the value axis (x-axis). Defaults to 20.
        title (str, optional): Title for the histogram; also used as the output file name. Defaults to None.
        x_title (str, optional): Name of dataframe column whose values are to be plotted on the x-axis. Defaults to None.
        mean (pd.DataFrame, optional): Mean for each market, indexed by 'group'; the first column is used. Defaults to None.
        median (pd.DataFrame, optional): Median for each market, indexed by 'group'; the first column is used. Defaults to None.
    """
    fig = px.histogram(data, nbins=bin, title=title, x=x_title, histnorm='probability', color='group')

    # NOTE(review): the marker lines span [min(value), max(value)] on the y-axis, although
    # the y-axis of the histogram is a probability -- confirm this extent is intended.
    dmax = data['value'].max()
    dmin = data['value'].min()

    for statistic, per_market, colour in (('mean', mean, 'green'), ('median', median, 'red')):
        for market in ('HKD', 'USD', 'CNY', 'EUR'):
            matches = per_market.loc[per_market.index.isin([market], level='group')].values
            if len(matches) == 0:
                # nothing to mark for a market without observations
                continue
            position = matches[0][0]
            fig.add_trace(go.Scatter(x=[position, position],
                                 y=[dmin, dmax],
                                 mode='lines',
                                 line=dict(color=colour, width=1, dash='dash'),
                                 name=f'{market}_{statistic}_{position}'))

    # write_html does not create missing folders
    os.makedirs('./normal_distribution', exist_ok=True)
    fig.write_html(f'./normal_distribution/{title}.html')

In [9]:
# Block 7 Plotting histogram distribution for probability vs value for different markets
def plot_histogram_for_different_markets_between(df_reversed: pd.DataFrame= None, field:str = 'ebitda_ev', start_date: datetime.date = datetime.date(2000,1,1), end_date: datetime.date = datetime.date(2023,1,1)):
    """Plot the value distribution of one factor field per market for a given time window.

    The premiums of ``field`` dated strictly between ``start_date`` and ``end_date`` are
    selected, their per-market mean and median are formatted to three significant digits
    (as strings), and everything is handed to ``plot_histogram`` with 400 bins.

    Args:
        df_reversed (pd.DataFrame, optional): Reversed Dataframe providing 'field', 'testing_period', 'group' and 'value' after ``reset_index``.
        field (str, optional): Factor field to be plotted. Defaults to 'ebitda_ev'.
        start_date (datetime.date, optional): Starting date of the factor premiums (exclusive). Defaults to datetime.date(2000,1,1).
        end_date (datetime.date, optional): Ending date of the factor premiums (exclusive). Defaults to datetime.date(2023,1,1).
    """
    def _three_significant_digits(number):
        return np.format_float_positional(number, precision=3, unique=False, fractional=False, trim='k')

    flat = df_reversed.reset_index()
    wanted = (flat['field'] == field) & (flat['testing_period'] > start_date) & (flat['testing_period'] < end_date)
    df = flat.loc[wanted, ['value', 'group']]

    values_per_market = df.groupby('group')
    mean = values_per_market.mean()
    median = values_per_market.median()

    mean['value'] = mean['value'].apply(_three_significant_digits)
    median['value'] = median['value'].apply(_three_significant_digits)

    plot_histogram(df, title=f'Histogram_for_{field}_between_{start_date}_and_{end_date}', bin=400, x_title='value', mean=mean, median=median)


#### Block 8 below contains the functions that generate the median heatmap.

In [5]:
# Block 8 median heatmap production function

def excel_writer(file_name:str = None):
    """Load the workbook stored at ``file_name`` with openpyxl and return the workbook object."""
    return openpyxl.load_workbook(file_name)

def excel_conditional_formatting(data, ws, num_fields):
    """Label the market columns of a heatmap sheet and colour each market block. Steps include:

        1. [Rewrite] market names into cells B2, E2, H2 and K2, and set columns A..M to width 5
        2. [Cell position] range for conditional formatting to be applied. Every market block is three columns wide, starts at row 5 and ends at row 5+num_fields-1.
        3. [Maximum finding] for each market in the data dataframe for setting the color scale of conditional formatting.
        4. [Apply] a three-point color scale to each market block: the market maximum maps to red (FF0000), 0 to white (FFFFFF) and minus the maximum to green (228B22).

    Args:
        data (pd.DataFrame): Dataframe for finding the maximum value for each market; its columns must carry a 'group' level with the labels 'aHKD', 'bCNY', 'cUSD' and 'dEUR'
        ws (worksheet object): openpyxl worksheet object
        num_fields (int): Number of fields for calculating the position of the final row
    """
    # 1. [Rewrite]
    for cell, market_name in (('B2', 'HKD'), ('E2', 'CNY'), ('H2', 'USD'), ('K2', 'EUR')):
        ws[cell] = market_name

    for column_letter in 'ABCDEFGHIJKLM':
        ws.column_dimensions[column_letter].width = 5

    # 2. [Cell position]
    last_row = 5 + num_fields - 1
    currencies = [
        ('HKD', f"B5:D{last_row}"),
        ('CNY', f"E5:G{last_row}"),
        ('USD', f"H5:J{last_row}"),
        ('EUR', f"K5:M{last_row}"),
    ]

    # 3. [Maximum finding]
    # The prefixed labels (aHKD, ...) are mapped back to plain market codes for the lookup below.
    plain_names = {'aHKD': 'HKD', 'bCNY': 'CNY', 'cUSD': 'USD', 'dEUR': 'EUR'}
    per_market_maximum = data.T.groupby('group').max()
    max_of_all = per_market_maximum.max(axis=1).rename(index=plain_names)

    # 4. [Apply]
    for market_name, cell_range in currencies:
        maximum = max_of_all[market_name]
        rule = ColorScaleRule(
            start_type='num', start_value=maximum, start_color='FF0000',
            mid_type='num', mid_value=0, mid_color='FFFFFF',
            end_type='num', end_value=-maximum, end_color='228B22',
        )
        ws.conditional_formatting.add(cell_range, rule)

def change_name(df):
    """Prefix the market codes in the 'group' column with a letter (aHKD, bCNY, cUSD, dEUR) for desired positioning.

    The column is rewritten on the dataframe that was passed in, and that same dataframe is returned.
    """
    prefixed_codes = {'HKD': 'aHKD', 'CNY': 'bCNY', 'USD': 'cUSD', 'EUR': 'dEUR'}
    df['group'] = df['group'].replace(prefixed_codes)
    return df

def heatmap_median_for_different_markets(file_name:str = None, df_reversed: pd.DataFrame = None, pillar:str = None, sheetname:str = None):
    """Plotting heuristic research heatmap for specified pillar. Steps include:

    1. [Partition] dataframe into three time periods
    For each time period
    2. [Calculate median] per market and field, and round numbers to three significant figures
    3. [Write] dataframe into xlsx file
    4. [Conditional formatting] for each market

    Args:
        file_name (str, optional): File path to write the xlsx file. The file must already exist because it is
            opened in append mode. Defaults to None.
        df_reversed (pd.DataFrame, optional): Dataframe with factor premium value reversed. After reset_index() it must
            provide the columns 'pillar', 'field', 'group', 'testing_period' and 'value'. Defaults to None.
        pillar (str, optional): Specified pillar. Defaults to None.
        sheetname (str, optional): The sheetname to write the dataframe into. An existing sheet with this name is replaced.
    """

    df = change_name(df_reversed.reset_index())
    df = df.loc[df['pillar']==pillar]

    # 1. [Partition] dataframe into three time periods
    year_ranges = [(datetime.date(1998,1,1),datetime.date(2008,1,1),'between_1998_2008'), (datetime.date(2008,1,1), datetime.date(2019,1,1), 'between_2008_2019'), (datetime.date(2019,1,1), datetime.date(2023,1,1), 'between_2019_2022')]

    df_list = []

    # 2. [Calculate median] and round numbers to three significant figures
    for start, end, period in year_ranges:
        # Half-open interval [start, end): a row dated exactly on a boundary (e.g. 2008-01-01) belongs to exactly
        # one period instead of being dropped from all of them.
        in_period = (df['testing_period']>=start)&(df['testing_period']<end)

        # Group directly on the columns. The former set_index('group','field') passed 'field' positionally as
        # `drop`, which pandas >= 2.0 rejects, and the index was not needed for the groupby anyway.
        medians = df.loc[in_period].groupby(['group','field'])['value'].median().to_frame()
        medians['period'] = period

        # fractional=False makes precision count significant digits; the strings are cast back to float below
        medians['value'] = medians['value'].apply(
            lambda x: np.format_float_positional(x, precision=3, unique=False, fractional=False, trim='k'))
        df_list.append(medians)

    # One row per field, one column per (market, period); sorting the columns puts the markets in the
    # aHKD, bCNY, cUSD, dEUR order set up by change_name.
    data = pd.concat(df_list).reset_index().pivot(index=['field'],columns=['group','period'],values=['value'])
    data = data.T.reset_index().set_index(['level_0','group','period']).sort_index().T
    data = data.astype(float)

    # The formatted range has to span the rows that are actually written
    num_fields = len(data.index)

    # 3. [Write] dataframe into xlsx file
    with pd.ExcelWriter(file_name, mode='a',if_sheet_exists='replace', engine='openpyxl') as writer:
        data.to_excel(writer, sheet_name=sheetname)

    # excel_writer is defined elsewhere in this notebook; presumably it loads the workbook with openpyxl -
    # the result is indexed by sheet name and saved/closed below.
    workbook = excel_writer(file_name)
    try:
        ws = workbook[sheetname]

        # 4. [Conditional formatting]
        excel_conditional_formatting(data, ws, num_fields)
        workbook.save(file_name)
    finally:
        workbook.close()

def median_heat_map_for_all_pillars_and_all_weeks():
    """This function generates median heat map for all pillars and all weeks. Steps include:
        1. [Create] the directory median_color_map and an empty xlsx file called 'median_heatmap.xlsx' in it, if they do not exist, for writing the data
        2. [Preparing iteration list] for all pillars for different weeks
        3. [Loop each weeks_to_expire]
        4. [Data preparation] Specifically, generate reversed-heuristic dataframe for specified weeks_to_expire. This is done once per weeks_to_expire and shared by all pillars.
        5. [Generate heatmap] for each pillar and write into separate sheet named '<pillar>_<weeks_to_expire>_weeks'
    """
    file_to_write = './median_color_map/median_heatmap.xlsx'

    # 1. [Create]
    if not os.path.isfile(file_to_write):
        # Saving the workbook fails if the target directory is missing, so create it first
        os.makedirs(os.path.dirname(file_to_write), exist_ok=True)
        wb = openpyxl.Workbook()
        wb.save(file_to_write)

    # 2. [Preparing iteration list]
    weeks_to_expire_list = [2,4,8,26,52]
    pillars = ['quality','value','momentum']

    # 3. [Loop each weeks_to_expire]
    for weeks_to_expire in weeks_to_expire_list:

        # 4. [Data preparation]
        # None of these calls takes the pillar, so they run once per weeks_to_expire rather than once per sheet
        df_all = data_preparation(weeks_to_expire)
        df_processed = merge_factor_premium_with_smb_and_retain_active_factor(df_all)
        df_reversed = reversing_heuristics(df_processed=df_processed)

        # 5. [Generate heatmap]
        for pillar in pillars:
            heatmap_median_for_different_markets(file_to_write, df_reversed, pillar, f"{pillar}_{weeks_to_expire}_weeks")



# **Demonstration and example**

### *Period* comparison for >0.001, <-0.001 and between
We first prepare the `df_reversed` needed for the specified `weeks_to_expire`. Then we build the reversed and categorized (i.e. >0.001, <-0.001 and between) factor premium and run the plot-generating function from **block 5**.

In [None]:
# Period comparison
# The argument 8 is the weeks_to_expire value handed to data_preparation.
df_all = data_preparation(8)
df_processed = merge_factor_premium_with_smb_and_retain_active_factor(df_all)
df_reversed = reversing_heuristics(df_processed=df_processed)
# NOTE(review): df_reversed is built above but is not used in this cell - the call below receives df_all.
# Confirm that data_processing_for_comparison is meant to take df_all rather than df_reversed.
df_reversed_and_categorized = data_processing_for_comparison(df_all)
for_each_pillar_compare_different_periods(df_reversed_and_categorized)

### *Market* comparison for >0.001, <-0.001 and between.

In [16]:
# Market comparison
# Uses df_reversed_and_categorized created in the period-comparison cell, so that cell has to be run first.
for_each_pillar_compare_different_markets(df_reversed_and_categorized)


### Histogram for specified field and start_date/end_date for different markets. 
We first prepare the `df_reversed` needed for the specified `weeks_to_expire`, then run the histogram-plotting function from block 7.

In [None]:
# Plotting histogram for different markets for different fields
# Rebuild df_reversed for weeks_to_expire = 8 so this cell does not depend on the comparison cells.
df_all = data_preparation(8)
df_processed = merge_factor_premium_with_smb_and_retain_active_factor(df_all)
df_reversed = reversing_heuristics(df_processed=df_processed)

# One histogram call per field, both with start_date 2019-01-01 and end_date 2023-01-01.
plot_histogram_for_different_markets_between(df_reversed,'market_cap_usd', start_date=datetime.date(2019,1,1), end_date=datetime.date(2023,1,1))

plot_histogram_for_different_markets_between(df_reversed,'stock_return_r12_7', start_date=datetime.date(2019,1,1), end_date=datetime.date(2023,1,1))


#### To generate median heatmaps for all pillars and all weeks, we run the function in block 8

In [None]:
# Preparing heatmap for all pillars and all weeks
# Writes to ./median_color_map/median_heatmap.xlsx, one sheet per pillar / weeks_to_expire combination.
median_heat_map_for_all_pillars_and_all_weeks()