In [1]:
import os

import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [3]:
# Project locations on Google Drive. `data_dir` keeps its trailing slash
# because later cells build file paths by plain string concatenation.
folder = '/content/drive/MyDrive/Quantitative Investment Portfolio/'
data_dir = '/content/drive/MyDrive/Quantitative Investment Portfolio/Data/'

# Mount Drive into the Colab VM so the paths above resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the factor panel stored as parquet in the project's data folder,
# then preview its first two rows as the cell output.
data = pd.read_parquet(
    data_dir + 'standardized_factors.parquet'
)
data.head(n=2)

Unnamed: 0,permno,yyyymm,hsiccd,ind,s,year,month,zret,zlnP,zlnsize,...,zRD,zReturnSkew3F,zREV6,zRIVolSpread,zShareIss1Y,zSmileSlope,zSurpriseRD,zDebtIssuance,zTK,zmarket_age
0,10001,199001,4925.0,31,2,1990,1,0.0,0.0,0.0,...,,0.0,,,0.0,,,0.0,0.0,0.0
1,10001,199002,4925.0,31,2,1990,2,0.0,0.0,0.0,...,,0.0,,,0.0,,,0.0,0.0,0.0


In [5]:
# === KNN implementation - very hard due to the size of the data === #

In [6]:
# === MICE forest: build the training sample for the imputation kernel === #
# Identifier columns are cast to categoricals on `data` itself, so frames
# derived from `data` in later cells carry the same dtypes.
data['permno'] = data['permno'].astype('category')
data['ind'] = data['ind'].astype('category')

# Keep only the rows flagged s == 0, drop the columns the imputer should not
# see ('date' may be absent, hence errors='ignore'), and draw a 1% subsample.
# The subsample is seeded so that it -- and every model fitted on it -- is
# repeatable after a kernel restart (same seed value as the imputation kernel).
D = (
    data[data['s'] == 0]
    .drop(columns=['date', 'hsiccd', 's'], errors='ignore')
    .sample(frac=0.01, random_state=104)
    .reset_index(drop=True)
)

In [7]:
!pip install dask[dataframe]
!pip install dask-expr
!pip install miceforest
import dask
import miceforest as mf

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.21-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.20-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.19-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.18-py3-none-any.whl.metadata (2.6 kB)
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16
Collecting miceforest
  Downloading miceforest-6.0.3-py3-none-any.whl.metadata (35 kB)
Downloading miceforest-6.0.3-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/

In [8]:
# Create the kernel on the training sample and run a first, small MICE pass
# so there is a fitted kernel to tune against.
kernel = mf.ImputationKernel(D, random_state=104)
kernel.mice(iterations=2, n_estimators=10)

# Search LightGBM parameters; the result is keyed by imputed variable, each
# value being that variable's parameter dict.
optimal_params = kernel.tune_parameters(
    dataset=0,
    use_gbdt=True,
    num_iterations=25,
    random_state=1,
    verbose=False,
)

# Upper bounds applied to the tuned values to keep the final fit small.
# The caps are applied on the dict itself: a round-trip through a DataFrame
# would union the keys of all variables and insert NaN for any parameter that
# a given variable does not have.
# NOTE(review): min() leaves a non-positive max_depth (LightGBM's "no limit")
# untouched -- confirm the tuner only proposes positive depths.
# TODO: the original author also considered capping n_estimators.
PARAM_CAPS = {'num_iterations': 3, 'num_leaves': 15, 'max_depth': 4}
optimal_params = {
    variable: {
        name: min(value, PARAM_CAPS[name]) if name in PARAM_CAPS else value
        for name, value in params.items()
    }
    for variable, params in optimal_params.items()
}

# Final MICE iteration using the capped per-variable parameters.
kernel.mice(iterations=1, variable_parameters=optimal_params)

In [9]:
# Apply the fitted kernel to the full panel (every value of `s`), after
# dropping the same columns that were dropped from the training sample.
# NOTE(review): the saved output of this cell is a ValueError ("not enough
# values to unpack"), yet later cells still use D4 -- so D4 presumably came
# from an earlier run. Re-run from a fresh kernel to confirm this cell works.
D2 = data.drop(columns=['date', 'hsiccd', 's'], errors='ignore').reset_index(drop=True)

D3 = kernel.impute_new_data(D2)
D4 = D3.complete_data(0)

# Number of cells still missing after imputation.
print(D4.isna().sum().sum())

# Zero-fill whatever is left, but only in numeric columns: `permno` and `ind`
# are categoricals, and filling a categorical with a value that is not one of
# its categories raises in pandas. The fill stays in place to avoid copying
# the full panel.
numeric_cols = D4.select_dtypes(include='number').columns
D4.fillna({col: 0 for col in numeric_cols}, inplace=True)

ValueError: not enough values to unpack (expected 3, got 1)

In [16]:
# Year ranges of the three samples ([first_year, last_year] per key).
data_divison = {
    'insample': [1993, 2013],
    'outsample': [2014, 2024],
    'presample': [1990, 1992],
}

# Rebuild the sample flag from the calendar year in a single pass:
#   2 -> year at or before the presample end
#   1 -> year at or after the outsample start
#   0 -> everything else
# The presample condition is listed first so it wins if both ever matched,
# exactly as the last-write-wins order of sequential assignments would.
D4['s'] = np.select(
    [
        D4['year'] <= data_divison['presample'][1],
        D4['year'] >= data_divison['outsample'][0],
    ],
    [2, 1],
    default=0,
)

In [17]:
# Persist the imputed panel. The target folder is created first so that a
# missing 'imputed/' directory cannot make the write fail after the
# imputation has already been computed.
os.makedirs(data_dir + 'imputed/', exist_ok=True)
D4.to_parquet(data_dir + 'imputed/mice.parquet')

In [18]:
# Display the capped per-variable parameter dict that was passed to kernel.mice.
optimal_params

{'zChNAnalyst': {'boosting': 'gbdt',
  'data_sample_strategy': 'bagging',
  'num_iterations': 3,
  'max_depth': 2,
  'num_leaves': 15,
  'min_data_in_leaf': 1,
  'min_sum_hessian_in_leaf': 0.01,
  'min_gain_to_split': 0.0,
  'bagging_fraction': 0.478996862504547,
  'feature_fraction_bynode': 0.9621005771354517,
  'bagging_freq': 1,
  'verbosity': -1,
  'learning_rate': 0.02,
  'objective': 'regression'},
 'zCPVolSpread': {'boosting': 'gbdt',
  'data_sample_strategy': 'bagging',
  'num_iterations': 3,
  'max_depth': 3,
  'num_leaves': 15,
  'min_data_in_leaf': 6,
  'min_sum_hessian_in_leaf': 0.01,
  'min_gain_to_split': 0.0,
  'bagging_fraction': 0.3589978047277139,
  'feature_fraction_bynode': 0.2170257149064499,
  'bagging_freq': 1,
  'verbosity': -1,
  'learning_rate': 0.02,
  'objective': 'regression'},
 'zRIVolSpread': {'boosting': 'gbdt',
  'data_sample_strategy': 'bagging',
  'num_iterations': 3,
  'max_depth': 4,
  'num_leaves': 11,
  'min_data_in_leaf': 1,
  'min_sum_hessian_in

In [None]:
# Same inspection as the earlier display cell; this cell has not been executed (In [None]).
optimal_params