In [30]:
from __future__ import annotations

import os
import json
import pickle

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from matplotlib import pyplot as plt
from scipy import stats
from dotenv import load_dotenv
from collections import defaultdict

In [44]:
load_dotenv(r"C:\Users\by003457\workspace\perfectdays\.env")

CAPIQ_DATA_DIR = os.environ["CAPIQ_DATA_DIR"]
compinfo_file = os.path.join(CAPIQ_DATA_DIR, "comp_naics_code_common_stock_kr.parquet")
df_compinfo = pd.read_parquet(compinfo_file)

# Configure directories
NOTEBOOK_DIR = Path.cwd()
DEFAULT_RESULTS_DIR = Path(r"C:\Users\by003457\Downloads\strategy01_aggressive")
analysis_output_dir = (DEFAULT_RESULTS_DIR / "analysis_outputs").resolve()
analysis_output_dir.mkdir(parents=True, exist_ok=True)

results_dir = DEFAULT_RESULTS_DIR

In [45]:
RESULT_FILE_PATTERNS = {
    "simulation": "simulation_results_*.pkl",
    "holdings": "shares_owned_*.pkl",
    "revenue": "revenue_records_*.pkl",
}

def discover_result_files(root: Path) -> pd.DataFrame:
    records: list[dict[str, Any]] = []
    for kind, pattern in RESULT_FILE_PATTERNS.items():
        for path in root.glob(pattern):
            prefix = path.stem.split("_")[-1]
            records.append({
                "kind": kind,
                "prefix": prefix,
                "path": path.resolve(),
                "modified": pd.Timestamp(path.stat().st_mtime, unit="s"),
                "size_kb": path.stat().st_size / 1024,
            })
    return pd.DataFrame(records).sort_values(["prefix", "kind"])

discovered_files = discover_result_files(results_dir)



def load_pickle(path: Path) -> Any:
    with open(path, "rb") as fh:
        return pickle.load(fh)

def flatten_simulation(simulation: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    records: list[dict[str, Any]] = []
    for ticker, rows in simulation.items():
        for row in rows:
            record = row.copy()
            record["ticker"] = ticker
            records.append(record)
    return pd.DataFrame(records)

def flatten_holdings(holdings: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    records: list[dict[str, Any]] = []
    for ticker, rows in holdings.items():
        for row in rows:
            record = row.copy()
            record["ticker"] = ticker
            records.append(record)
    return pd.DataFrame(records)

def flatten_revenue(revenue: dict[str, list[dict[str, Any]]]) -> pd.DataFrame:
    records: list[dict[str, Any]] = []
    for ticker, rows in revenue.items():
        for row in rows:
            record = row.copy()
            record["ticker"] = ticker
            records.append(record)
    return pd.DataFrame(records)

if discovered_files.empty:
    simulation_df = pd.DataFrame()
    holdings_df = pd.DataFrame()
    revenue_df = pd.DataFrame()
else:
    available_prefixes = discovered_files["prefix"].unique().tolist()
    target_prefix = available_prefixes[0]

    paths = {
        row.kind: row.path for row in discovered_files.itertuples() if row.prefix == target_prefix
    }

    simulation_df = flatten_simulation(load_pickle(paths["simulation"])) if "simulation" in paths else pd.DataFrame()
    holdings_df = flatten_holdings(load_pickle(paths["holdings"])) if "holdings" in paths else pd.DataFrame()
    revenue_df = flatten_revenue(load_pickle(paths["revenue"])) if "revenue" in paths else pd.DataFrame()


In [46]:
revenue_df.head(2)

Unnamed: 0,ticker,sell_date,revenue
0,A018500,2015-02-05,109880
1,A018500,2015-02-12,-53470


In [47]:
holdings_df.head(2)

Unnamed: 0,ticker,shares,buy_price,buy_date,sold_price,sold_date
0,A013520,3683,5428.0,2017-04-19,,NaT
1,A013520,-3683,,2017-04-19,5145.0,2017-05-18


In [None]:
def prepare_simulation(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    df = df.copy()
    df["simuldate"] = pd.to_datetime(df["simuldate"], errors="coerce")
    numeric_fields = ["invest_amount", "shares_bought", "vwap_stability", "popularity"]
    for field in numeric_fields:
        if field in df.columns:
            df[field] = pd.to_numeric(df[field], errors="coerce")
    df.dropna(subset=["simuldate", "ticker"], inplace=True)
    df.sort_values(["ticker", "simuldate"], inplace=True)
    df["invest_amount"].fillna(0, inplace=True) if "invest_amount" in df else None
    df["shares_bought"].fillna(0, inplace=True) if "shares_bought" in df else None
    df["invest_flag"].fillna("no_action", inplace=True) if "invest_flag" in df else None
    return df

def prepare_holdings(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    df = df.copy()
    for col in ["buy_date", "sold_date"]:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    for col in ["shares", "buy_price", "sold_price"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    df.dropna(subset=["ticker", "buy_date", "shares"], inplace=True)
    df["holding_days"] = (df["sold_date"] - df["buy_date"]).dt.days
    return df

def prepare_revenue(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    df = df.copy()
    if "sell_date" in df.columns:
        df["sell_date"] = pd.to_datetime(df["sell_date"], errors="coerce")
    for col in ["revenue", "shares_sold", "sold_price", "bought_price"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    df.dropna(subset=["ticker", "sell_date", "revenue"], inplace=True)
    df["profit_pct"] = np.where(
    (df["bought_price"] > 0),
        (df["sold_price"] - df["bought_price"]) / df["bought_price"],
        np.nan,
    ) if "bought_price" in df.columns and "sold_price" in df.columns else np.nan
    return df

simulation_df = prepare_simulation(simulation_df)
holdings_df = prepare_holdings(holdings_df)
revenue_df = prepare_revenue(revenue_df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["invest_amount"].fillna(0, inplace=True) if "invest_amount" in df else None
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["shares_bought"].fillna(0, inplace=True) if "shares_bought" in df else None
The behavior will change in pandas 3.0. This inplace method will never wor

In [43]:
simulation_df[simulation_df.invest_amount > 0].head()

Unnamed: 0,simuldate,ticker,popularity,vwap_stability,investment_signal,invest_amount,shares_bought,invest_flag
1438673,2015-10-05,A000040,True,True,True,19972030.0,871.0,investing
1439364,2018-08-20,A000040,True,True,True,19991652.0,2643.0,investing
1439650,2019-10-22,A000040,True,True,True,19991400.0,3029.0,investing
1439655,2019-10-29,A000040,True,True,True,19988320.0,3047.0,investing
1440183,2022-01-04,A000040,True,True,True,19994613.0,6739.0,investing


In [40]:
holdings_df.head()

Unnamed: 0,ticker,shares,buy_price,buy_date,sold_price,sold_date,holding_days
0,A013520,3683,5428.0,2017-04-19,,NaT,
1,A013520,-3683,,2017-04-19,5145.0,2017-05-18,29.0
2,A013520,6078,3290.0,2018-09-18,,NaT,
3,A013520,-6078,,2018-09-18,3242.0,2018-09-21,3.0
4,A013520,6619,3021.0,2019-03-06,,NaT,


In [38]:
revenue_df

Unnamed: 0,ticker,sell_date,revenue,profit_pct
0,A018500,2015-02-05,109880,
1,A018500,2015-02-12,-53470,
2,A018500,2015-03-30,154620,
3,A018500,2015-10-05,392080,
4,A018500,2015-10-12,239775,
...,...,...,...,...
22037,A272450,2025-09-26,-704410,
22038,A069460,2025-09-26,721629,
22039,A014470,2025-09-29,-157560,
22040,A008700,2025-09-29,108101,


In [35]:
df_compinfo2 = df_compinfo[['tickerSymbol','companyName', 'indu_desc','desc_1' ,'desc_2','desc_3','desc_4', 'desc_5']].drop_duplicates(subset=['tickerSymbol'], keep='first').copy()
revenue_df2 = revenue_df.merge(df_compinfo2, left_on='ticker', right_on='tickerSymbol', how='left').copy()
comp_rev_df = revenue_df2.groupby('ticker').revenue.sum().sort_values(ascending=False)
comp_rev_df = comp_rev_df.to_frame().merge(df_compinfo2, left_index=True, right_on='tickerSymbol', how='left')


In [36]:
comp_rev_df.head(2)

Unnamed: 0,revenue,tickerSymbol,companyName,indu_desc,desc_1,desc_2,desc_3,desc_4,desc_5
2519,24063390,A000760,"Rifa Co.,Ltd.",Chemical Manufacturing,Manufacturing,Chemical Manufacturing,,,
2507,16400865,A004090,Korea Petroleum Industries Company,"Asphalt Paving, Roofing, and Saturated Materia...",Manufacturing,Petroleum and Coal Products Manufacturing,Petroleum and Coal Products Manufacturing,"Asphalt Paving, Roofing, and Saturated Materia...",


In [37]:
revenue_df2.head(2)

Unnamed: 0,ticker,sell_date,revenue,profit_pct,tickerSymbol,companyName,indu_desc,desc_1,desc_2,desc_3,desc_4,desc_5
0,A018500,2015-02-05,109880,,A018500,"Dongwon Metal Co., Ltd.",Motor Vehicle Parts Manufacturing,Manufacturing,Transportation Equipment Manufacturing,Motor Vehicle Parts Manufacturing,,
1,A018500,2015-02-12,-53470,,A018500,"Dongwon Metal Co., Ltd.",Motor Vehicle Parts Manufacturing,Manufacturing,Transportation Equipment Manufacturing,Motor Vehicle Parts Manufacturing,,


In [19]:
# make the bins of profit_pct absolute instead of relative
bins = [-np.inf, -0.2, -0.05, 0.05, 0.2, np.inf]
labels = ['p_neg20below', 'p_neg5to-20', 'p_neg5to5', 'p_5to20', 'p_above20']
revenue_df2['profit_bin'] = pd.cut(revenue_df2['profit_pct'], bins=bins, labels=labels)
revenue_df2[['ticker', 'sell_date', 'revenue', 'profit_pct', 'profit_bin']].head(10)

Unnamed: 0,ticker,sell_date,revenue,profit_pct,profit_bin
0,A015230,2015-02-09,-48776,-0.00488,p_neg5to5
1,A015230,2015-02-16,876680,0.087694,p_5to20
2,A094800,2015-03-09,12845,0.001285,p_neg5to5
3,A094800,2015-03-16,90160,0.009021,p_neg5to5
4,A094800,2015-05-19,0,0.0,p_neg5to5
5,A094800,2015-10-26,114880,0.011494,p_neg5to5
6,A094800,2016-11-07,-57375,-0.005741,p_neg5to5
7,A094800,2016-11-21,-305370,-0.030552,p_neg5to5
8,A094800,2017-04-05,0,0.0,p_neg5to5
9,A094800,2017-04-11,25890,0.002591,p_neg5to5


In [22]:
# count profit_bin occurrences by ticker
profit_bin_counts = revenue_df2.groupby(['ticker', 'profit_bin']).size().unstack(fill_value=0)
profit_bin_counts['total_trades'] = profit_bin_counts.sum(axis=1)
profit_bin_counts.head(2)

  profit_bin_counts = revenue_df2.groupby(['ticker', 'profit_bin']).size().unstack(fill_value=0)


profit_bin,p_neg20below,p_neg5to-20,p_neg5to5,p_5to20,p_above20,total_trades
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A000040,0,0,1,0,0,1
A000060,0,0,5,0,0,5


In [25]:
# merge profit_bin_counts back to comp_rev_df
comp_rev_df = comp_rev_df.merge(profit_bin_counts, left_on='tickerSymbol', right_index=True, how='left')

In [28]:
# calculate p_above20 / total_trades and sort by it
comp_rev_df['p_above20_ratio'] = comp_rev_df.apply(
    lambda row: row['p_above20'] / row['total_trades'] if row['total_trades'] > 0 else 0,
    axis=1)
comp_rev_df.sort_values(by='p_above20_ratio', ascending=False).head(10)

Unnamed: 0,revenue,tickerSymbol,companyName,indu_desc,desc_1,desc_2,desc_3,desc_4,desc_5,p_above20_ratio,p_neg20below,p_neg5to-20,p_neg5to5,p_5to20,p_above20,total_trades
209,6829060,A900270,Heng Sheng Holding Group Limited,"Doll, Toy, and Game Manufacturing",Manufacturing,Miscellaneous Manufacturing,Other Miscellaneous Manufacturing,"Doll, Toy, and Game Manufacturing","Doll, Toy, and Game Manufacturing",1.0,0,0,0,0,1,1
2519,8410800,A000760,"Rifa Co.,Ltd.",Chemical Manufacturing,Manufacturing,Chemical Manufacturing,,,,1.0,0,0,0,0,1,1
1729,3636760,A008020,"Kyungnam Energy Co.,Ltd",Natural Gas Distribution,Utilities,Utilities,Natural Gas Distribution,Natural Gas Distribution,Natural Gas Distribution,0.333333,0,0,2,0,1,3
74,4500515,A256630,"NH Special Purpose Acquisition 10 Co., Ltd.",Other Financial Vehicles,Finance and Insurance,"Funds, Trusts, and Other Financial Vehicles",Other Investment Pools and Funds,Other Financial Vehicles,Other Financial Vehicles,0.071429,0,0,13,0,1,14
25,2520620,A328380,Mirae Asset Daewoo Special Purpose Acquisition...,Other Financial Vehicles,Finance and Insurance,"Funds, Trusts, and Other Financial Vehicles",Other Investment Pools and Funds,Other Financial Vehicles,Other Financial Vehicles,0.035714,0,0,27,0,1,28
405,13473610,A331380,"FOCUS AI Co., Ltd.",Audio and Video Equipment Manufacturing,Manufacturing,Computer and Electronic Product Manufacturing,Audio and Video Equipment Manufacturing,Audio and Video Equipment Manufacturing,Audio and Video Equipment Manufacturing,0.035714,0,0,27,0,1,28
2241,2181590,A001070,"Taihan Textile Co., Ltd.",Textile Mills,Manufacturing,Textile Mills,,,,0.0,0,0,20,2,0,22
2415,1813950,A002000,"Saint-Gobain Korea Holdings Co., Ltd.",Flat Glass Manufacturing,Manufacturing,Nonmetallic Mineral Product Manufacturing,Glass and Glass Product Manufacturing,Glass and Glass Product Manufacturing,Flat Glass Manufacturing,0.0,0,0,3,1,0,4
1451,1529840,A003100,"SUN KWANG CO.,Ltd.",Water Transportation,Transportation and Warehousing,Water Transportation,,,,0.0,0,0,4,1,0,5
2164,1371580,A293940,"Shinhan Alpha REIT Co., Ltd.",Land Subdivision,Construction,Heavy and Civil Engineering Construction (eff ...,Land Subdivision (eff from 6/15/2002),Land Subdivision (eff from 6/15/2002),Land Subdivision (eff from 6/15/2002),0.0,0,0,17,0,0,17


In [29]:
# save comp_rev_df to xlsx to DEFAULT_RESULTS_DIR
comp_rev_df.to_excel(analysis_output_dir / "company_revenue_analysis.xlsx", index=False)