<a href="https://colab.research.google.com/github/adamDucken/adamDucken/blob/main/research_cpi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm
!pip install optuna
!pip install fredapi

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import lightgbm as lgb
import numpy as np, pandas as pd
import sklearn
from fredapi import Fred

# 1. Data gathering, cleaning, engineering

In [None]:
# Read the FRED API key from the FRED_API_KEY environment variable, or prompt
# for it interactively, so the secret is never stored in the notebook itself.
# NOTE(review): the key that was previously committed here should be revoked.
import os
from getpass import getpass

fred_key = os.environ.get('FRED_API_KEY') or getpass('FRED API key: ')
fred = Fred(api_key=fred_key)

In [None]:
def fred_to_df(series_arr:list, grpby_freq:str=None, grpby_method:str=None):
    """
    Download several FRED series and return them side by side in one DataFrame.

    Uses the module-level ``fred`` client. Each series becomes one column named
    after its series id, and the observation dates end up in a ``DATE`` column.

    Parameters
    ----------
    series_arr : list
        FRED series ids, e.g. ['GDP', 'FEDFUNDS'].
    grpby_freq : str, optional
        Resampling rule (see pandas.DataFrame.resample). Only used, and then
        required, when the series do not all report the same frequency.
    grpby_method : str, optional
        Aggregation applied to each resampling bin (e.g. 'mean'). Only used,
        and then required, when the series do not all report the same frequency.

    Returns
    -------
    pd.DataFrame
        One row per date (per resampling bin if resampled), with a ``DATE``
        column followed by one column per series.

    Raises
    ------
    Warning
        If the series have different frequencies and ``grpby_freq`` or
        ``grpby_method`` is missing.

    Example usage: series_arr = ['FEDFUNDS', 'CPIAUCSL', 'UNRATE', 'GDP']
    df = fred_to_df(series_arr, grpby_freq='Y', grpby_method='mean')
    """
    frames = []
    frequencies = []

    for series_id in series_arr:
        # One single-column frame per series, plus the frequency FRED reports for it
        frames.append(pd.DataFrame(fred.get_series(series_id=series_id), columns=[series_id]))
        frequencies.append(fred.get_series_info(series_id)['frequency'])

    same = all(f == frequencies[0] for f in frequencies)

    # Resampling needs both the rule and the aggregation; fail with a clear
    # message instead of erroring later inside resample().apply(None).
    if not same and (grpby_freq is None or grpby_method is None):
        raise Warning("Your series data is not the same frequency, please provide groupby frequency and groupby method")

    # Concatenate once instead of growing the frame inside the loop
    df = pd.concat(frames, axis=1) if frames else pd.DataFrame()
    df = df.reset_index().rename(columns={'index': 'DATE'})
    if not same:
        df = df.set_index('DATE').resample(grpby_freq).apply(grpby_method).reset_index()
    return df

In [None]:
# FRED series ids that are passed to fred_to_df in the data-gathering step below.
series_arr = ['CPIAUCSL','MICH','PPIACO','DTB3','DGS1','DFF','DGS20','DPRIME','PI','DSPI','CPIENGSL','USALOLITONOSTSAM','PCEPILFE','UNRATE']

## 1.1 Data gathering

In [None]:
# Download every series; fred_to_df resamples to the given frequency with the
# mean only when the series report different frequencies.
fred_df= fred_to_df(series_arr, grpby_freq='M', grpby_method='mean')
# Keep the raw download in fred_df and work on a copy.
df = fred_df.copy()

In [None]:
# Index by date so that shift() below moves values along the time axis.
df = df.set_index('DATE')
# Price indices that enter the feature set only through their lagged values.
target_col = ['CPIAUCSL','PCEPILFE','CPIENGSL','PPIACO']

for col in target_col:
    for lag in range(1, 2):  # range(1, 2) yields only lag 1, so one one-row lag per series; widen the range for more lags
        df[f'{col}_lag{lag}'] = df[col].shift(lag)
# Drop the unlagged originals so only the *_lag columns remain.
df = df.drop(columns = target_col)
df

Unnamed: 0_level_0,MICH,DTB3,DGS1,DFF,DGS20,DPRIME,PI,DSPI,USALOLITONOSTSAM,UNRATE,CPIAUCSL_lag1,PCEPILFE_lag1,CPIENGSL_lag1,PPIACO_lag1
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1913-01-31,,,,,,,,,,,,,,
1913-02-28,,,,,,,,,,,,,,12.100
1913-03-31,,,,,,,,,,,,,,12.000
1913-04-30,,,,,,,,,,,,,,12.000
1913-05-31,,,,,,,,,,,,,,12.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-31,2.9,5.239000,4.992500,5.33,4.463000,8.5,23746.6,20779.7,,3.8,311.054,121.224,284.175,254.926
2024-04-30,3.2,5.241364,5.135000,5.33,4.767727,8.5,23809.6,20831.7,,3.9,312.230,121.629,287.399,254.963
2024-05-31,3.3,5.252727,5.159545,5.33,4.711364,8.5,23923.7,20925.6,,4.0,313.207,121.944,290.631,256.772
2024-06-30,,5.244211,5.110526,5.33,4.541053,8.5,,,,4.1,313.225,122.045,284.742,255.094


## 1.2 Feature engineering

In [None]:
# Ratio and spread features derived from the lagged price indices and the
# income, rate and unemployment columns. Every right-hand side is evaluated on
# the existing columns before any new column is attached.
df = df.assign(
    CPI_PI=df['CPIAUCSL_lag1'] / df['PI'],
    CPI_DSPI=df['CPIAUCSL_lag1'] / df['DSPI'],
    UNRATE_PI=df['UNRATE'] / df['PI'],
    UNRATE_DSPI=df['UNRATE'] / df['DSPI'],
    DGS20_CPI=df['DGS20'] / df['CPIAUCSL_lag1'],
    DFF_CPI=df['DFF'] / df['CPIAUCSL_lag1'],
    CPIENGSL_CPI=df['CPIENGSL_lag1'] / df['CPIAUCSL_lag1'],
    PI_PCEPILFE=df['PI'] / df['PCEPILFE_lag1'],
    DSPI_PCEPILFE=df['DSPI'] / df['PCEPILFE_lag1'],
    # Yield-curve spreads (differences, not ratios)
    DGS20_DGS1=df['DGS20'] - df['DGS1'],
    DGS1_DTB3=df['DGS1'] - df['DTB3'],
)

In [None]:
# Keep only the rows in which every column has a value.
df = df.dropna()

In [None]:
# Show every column when a DataFrame is displayed (global pandas option).
pd.set_option('display.max_columns', None)
df

Unnamed: 0_level_0,MICH,DTB3,DGS1,DFF,DGS20,DPRIME,PI,DSPI,USALOLITONOSTSAM,UNRATE,CPIAUCSL_lag1,PCEPILFE_lag1,CPIENGSL_lag1,PPIACO_lag1,CPI_PI,CPI_DSPI,UNRATE_PI,UNRATE_DSPI,DGS20_CPI,DFF_CPI,CPIENGSL_CPI,PI_PCEPILFE,DSPI_PCEPILFE,DGS20_DGS1,DGS1_DTB3
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1978-01-31,5.2,6.440476,7.283333,6.703548,8.139524,7.931818,1760.1,1547.4,101.41070,6.4,62.300,31.227,51.600,66.200,0.035396,0.040261,0.003636,0.004136,0.130650,0.107601,0.828250,56.364684,49.553271,0.856190,0.842857
1978-02-28,6.4,6.445556,7.337778,6.784286,8.216111,8.000000,1776.2,1563.8,101.44650,6.3,62.700,31.414,51.100,66.800,0.035300,0.040095,0.003547,0.004029,0.131038,0.108202,0.814992,56.541669,49.780353,0.878333,0.892222
1978-03-31,6.3,6.293636,7.310455,6.793226,8.206364,8.000000,1797.9,1587.1,101.55220,6.3,63.000,31.535,50.600,67.500,0.035041,0.039695,0.003504,0.003970,0.130260,0.107829,0.803175,57.012843,50.328207,0.895909,1.016818
1978-04-30,6.7,6.286000,7.454500,6.888667,8.324000,8.000000,1821.9,1604.6,101.70130,6.1,63.400,31.706,51.000,68.100,0.034799,0.039511,0.003348,0.003802,0.131293,0.108654,0.804416,57.462310,50.608718,0.869500,1.168500
1978-05-31,6.9,6.408095,7.820000,7.360000,8.441429,8.250000,1837.0,1615.1,101.85210,6.0,63.900,31.911,51.400,69.000,0.034785,0.039564,0.003266,0.003715,0.132104,0.115180,0.804382,57.566356,50.612641,0.621429,1.411905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30,3.2,5.319500,5.436000,5.330000,4.653000,8.500000,23176.2,20392.5,99.48235,3.8,306.187,119.449,288.392,257.680,0.013211,0.015015,0.000164,0.000186,0.015197,0.017408,0.941882,194.025902,170.721396,-0.783000,0.116500
2023-10-31,4.2,5.335238,5.424762,5.330000,5.129524,8.500000,23189.4,20403.8,99.53435,3.8,307.288,119.842,291.710,258.934,0.013251,0.015060,0.000164,0.000186,0.016693,0.017345,0.949305,193.499775,170.255837,-0.295238,0.089524
2023-11-30,4.5,5.270000,5.281429,5.330000,4.842381,8.500000,23241.5,20449.8,99.60859,3.7,307.531,120.015,285.488,255.192,0.013232,0.015038,0.000159,0.000181,0.015746,0.017332,0.928323,193.654960,170.393701,-0.439048,0.011429
2023-12-31,3.1,5.241500,4.959000,5.330000,4.320000,8.500000,23311.6,20511.5,99.71788,3.7,308.024,120.122,281.042,252.856,0.013213,0.015017,0.000159,0.000180,0.014025,0.017304,0.912403,194.066033,170.755565,-0.639000,-0.282500


In [None]:
# Columns already expressed in percent: these get a first difference instead
# of a percentage change.
percentage_columns = ['MICH','UNRATE', 'DTB3', 'DGS1', 'DFF', 'DGS20', 'DPRIME']

rate_cols = [name for name in df.columns if name in percentage_columns]
level_cols = [name for name in df.columns if name not in percentage_columns and name != 'DATE']

df_pct_change = df.copy()
# First difference for the percent-quoted series
df_pct_change[rate_cols] = df[rate_cols].diff()
# Percentage change for everything else
# NOTE(review): this includes the spread columns (DGS20_DGS1, DGS1_DTB3), whose
# percentage change becomes very large when the spread is near zero.
df_pct_change[level_cols] = df[level_cols].pct_change()

# The first row holds NaN after diff()/pct_change(), so remove it
df_pct_change = df_pct_change.dropna()

In [None]:
# Work on a copy so df_pct_change is left untouched and this cell gives the
# same result every time it is run.
X = df_pct_change.copy()

# Represent each row by its month ('yyyy-mm') with a monthly PeriodIndex
X.index = pd.to_datetime(X.index).to_period('M')

# Keep the `df` name pointing at the same frame, as before
df = X
X

Unnamed: 0_level_0,MICH,DTB3,DGS1,DFF,DGS20,DPRIME,PI,DSPI,USALOLITONOSTSAM,UNRATE,CPIAUCSL_lag1,PCEPILFE_lag1,CPIENGSL_lag1,PPIACO_lag1,CPI_PI,CPI_DSPI,UNRATE_PI,UNRATE_DSPI,DGS20_CPI,DFF_CPI,CPIENGSL_CPI,PI_PCEPILFE,DSPI_PCEPILFE,DGS20_DGS1,DGS1_DTB3
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1978-02,1.2,0.005079,0.054444,0.080737,0.076587,0.068182,0.009147,0.010598,0.000353,-0.1,0.006421,0.005988,-0.009690,0.009063,-0.002702,-0.004134,-0.024548,-0.025948,0.002970,0.005588,-0.016008,0.003140,0.004583,0.025862,0.058569
1978-03,-0.1,-0.151919,-0.027323,0.008940,-0.009747,0.000000,0.012217,0.014900,0.001042,0.0,0.004785,0.003852,-0.009785,0.010479,-0.007343,-0.009966,-0.012070,-0.014681,-0.005943,-0.003450,-0.014500,0.008333,0.011005,0.020010,0.139647
1978-04,0.4,-0.007636,0.144045,0.095441,0.117636,0.000000,0.013349,0.011026,0.001468,-0.2,0.006349,0.005423,0.007905,0.008889,-0.006907,-0.004626,-0.044501,-0.042306,0.007935,0.007652,0.001546,0.007884,0.005574,-0.029477,0.149173
1978-05,0.2,0.122095,0.365500,0.471333,0.117429,0.250000,0.008288,0.006544,0.001483,-0.1,0.007886,0.006466,0.007843,0.013216,-0.000398,0.001334,-0.024479,-0.022788,0.006172,0.060061,-0.000043,0.001811,0.000078,-0.285304,0.208305
1978-06,-0.4,0.325996,0.274545,0.236667,0.085390,0.386364,0.010234,0.006563,0.001119,-0.1,0.009390,0.005891,0.005837,0.007246,-0.000836,0.002808,-0.026628,-0.023078,0.000719,0.022554,-0.003520,0.004317,0.000668,-0.304389,-0.036440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09,-0.3,0.022978,0.068609,0.000000,0.196478,0.000000,0.003538,0.003252,0.000623,0.0,0.005118,0.000980,0.043568,0.015148,0.001575,0.001860,-0.003525,-0.003241,0.038772,-0.005092,0.038254,0.002555,0.002269,-0.140382,0.643865
2023-10,1.0,0.015738,-0.011238,0.000000,0.476524,0.000000,0.000570,0.000554,0.000523,0.0,0.003596,0.003290,0.011505,0.004867,0.003025,0.003040,-0.000569,-0.000554,0.098462,-0.003583,0.007881,-0.002712,-0.002727,-0.622940,-0.231555
2023-11,0.3,-0.065238,-0.143333,0.000000,-0.287143,0.000000,0.002247,0.002254,0.000746,-0.1,0.000791,0.001444,-0.021329,-0.014452,-0.001453,-0.001460,-0.028498,-0.028506,-0.056724,-0.000790,-0.022103,0.000802,0.000810,0.487097,-0.872340
2023-12,-1.4,-0.028500,-0.322429,0.000000,-0.522381,0.000000,0.003016,0.003017,0.001097,0.0,0.001603,0.000892,-0.015573,-0.009154,-0.001409,-0.001410,-0.003007,-0.003008,-0.109305,-0.001601,-0.017149,0.002123,0.002124,0.455423,-25.718750


## 1.3 Data preparation, label creation

In [None]:
# Download the label series on its own. With a single series fred_to_df never
# resamples, so grpby_freq / grpby_method have no effect on this call.
# Note: this overwrites the fred_df created in the data-gathering step.
series = ['CPIAUCSL']
fred_df = fred_to_df(series_arr =series, grpby_freq='ME', grpby_method='mean')

In [None]:
# y refers to the same object as fred_df (no copy is made here).
y = fred_df

In [None]:
# Build the label frame from a copy of fred_df so that the downloaded frame is
# not modified and this cell can be re-run without a KeyError on 'DATE'.
y = fred_df.copy()
y['DATE'] = pd.to_datetime(y['DATE'])

# Set 'DATE' as the index
y = y.set_index('DATE')

# Represent each row by its month ('yyyy-mm') with a monthly PeriodIndex
y.index = y.index.to_period('M')

# Keep the `df` name pointing at the same frame, as before
df = y

In [None]:
# Turn the label series into its row-over-row percentage change and drop the
# first row, which has no previous value to compare against.
y = y.pct_change()
y.dropna(inplace= True)

In [None]:
# Sanity check: both indices should have the same dtype before aligning them.
X.index.dtype, y.index.dtype

(period[M], period[M])

In [None]:
# Convert both PeriodIndex objects to timestamps, in place.
# NOTE(review): this mutates X and y, so the cell presumably fails if it is
# run a second time without re-running the cells above — confirm.
X.index = X.index.to_timestamp()
y.index = y.index.to_timestamp()

# Ensure that the indices match by aligning y to X
# (.loc with X's index keeps exactly the label rows that have features)
y_aligned = y.loc[X.index]

# Verify the alignment
print(f"Shape of y: {y_aligned.shape}")
print(f"Shape of X: {X.shape}")

# Concatenate the DataFrames along columns (axis=1)
# The label column comes first, followed by all feature columns.
combined_df = pd.concat([y_aligned, X], axis=1)

Shape of y: (471, 1)
Shape of X: (471, 25)


In [None]:
# combined_df.drop(columns=['level_0','index'],inplace=True)

In [None]:
# data is another name for combined_df (label column first, then the features).
data = combined_df
data

Unnamed: 0_level_0,CPIAUCSL,MICH,DTB3,DGS1,DFF,DGS20,DPRIME,PI,DSPI,USALOLITONOSTSAM,UNRATE,CPIAUCSL_lag1,PCEPILFE_lag1,CPIENGSL_lag1,PPIACO_lag1,CPI_PI,CPI_DSPI,UNRATE_PI,UNRATE_DSPI,DGS20_CPI,DFF_CPI,CPIENGSL_CPI,PI_PCEPILFE,DSPI_PCEPILFE,DGS20_DGS1,DGS1_DTB3
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1978-02-01,0.004785,1.2,0.005079,0.054444,0.080737,0.076587,0.068182,0.009147,0.010598,0.000353,-0.1,0.006421,0.005988,-0.009690,0.009063,-0.002702,-0.004134,-0.024548,-0.025948,0.002970,0.005588,-0.016008,0.003140,0.004583,0.025862,0.058569
1978-03-01,0.006349,-0.1,-0.151919,-0.027323,0.008940,-0.009747,0.000000,0.012217,0.014900,0.001042,0.0,0.004785,0.003852,-0.009785,0.010479,-0.007343,-0.009966,-0.012070,-0.014681,-0.005943,-0.003450,-0.014500,0.008333,0.011005,0.020010,0.139647
1978-04-01,0.007886,0.4,-0.007636,0.144045,0.095441,0.117636,0.000000,0.013349,0.011026,0.001468,-0.2,0.006349,0.005423,0.007905,0.008889,-0.006907,-0.004626,-0.044501,-0.042306,0.007935,0.007652,0.001546,0.007884,0.005574,-0.029477,0.149173
1978-05-01,0.009390,0.2,0.122095,0.365500,0.471333,0.117429,0.250000,0.008288,0.006544,0.001483,-0.1,0.007886,0.006466,0.007843,0.013216,-0.000398,0.001334,-0.024479,-0.022788,0.006172,0.060061,-0.000043,0.001811,0.000078,-0.285304,0.208305
1978-06-01,0.007752,-0.4,0.325996,0.274545,0.236667,0.085390,0.386364,0.010234,0.006563,0.001119,-0.1,0.009390,0.005891,0.005837,0.007246,-0.000836,0.002808,-0.026628,-0.023078,0.000719,0.022554,-0.003520,0.004317,0.000668,-0.304389,-0.036440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-01,0.003596,-0.3,0.022978,0.068609,0.000000,0.196478,0.000000,0.003538,0.003252,0.000623,0.0,0.005118,0.000980,0.043568,0.015148,0.001575,0.001860,-0.003525,-0.003241,0.038772,-0.005092,0.038254,0.002555,0.002269,-0.140382,0.643865
2023-10-01,0.000791,1.0,0.015738,-0.011238,0.000000,0.476524,0.000000,0.000570,0.000554,0.000523,0.0,0.003596,0.003290,0.011505,0.004867,0.003025,0.003040,-0.000569,-0.000554,0.098462,-0.003583,0.007881,-0.002712,-0.002727,-0.622940,-0.231555
2023-11-01,0.001603,0.3,-0.065238,-0.143333,0.000000,-0.287143,0.000000,0.002247,0.002254,0.000746,-0.1,0.000791,0.001444,-0.021329,-0.014452,-0.001453,-0.001460,-0.028498,-0.028506,-0.056724,-0.000790,-0.022103,0.000802,0.000810,0.487097,-0.872340
2023-12-01,0.002331,-1.4,-0.028500,-0.322429,0.000000,-0.522381,0.000000,0.003016,0.003017,0.001097,0.0,0.001603,0.000892,-0.015573,-0.009154,-0.001409,-0.001410,-0.003007,-0.003008,-0.109305,-0.001601,-0.017149,0.002123,0.002124,0.455423,-25.718750


In [None]:
# Features: every column except the first one (the label).
X = data.iloc[:,1:]

In [None]:
# Label: the first column only, kept as a one-column DataFrame.
y = data.iloc[:,:1]

In [None]:
def y_to_bin(y):
    """
    Convert the 'CPIAUCSL' column into three direction classes.

    Values above zero become 2, values below zero become 0, and everything
    else (exactly zero, or NaN) becomes 1.

    The input is not modified: a labelled copy is returned. This avoids
    writing into a slice of another DataFrame, which is what triggers pandas'
    SettingWithCopyWarning.

    Parameters
    ----------
    y : pd.DataFrame
        Frame containing a numeric 'CPIAUCSL' column.

    Returns
    -------
    pd.DataFrame
        Copy of ``y`` whose 'CPIAUCSL' column holds the class labels 0, 1 or 2.
    """
    labelled = y.copy()
    change = labelled['CPIAUCSL']
    # Conditions are checked in order; rows matching neither fall back to 1
    labelled['CPIAUCSL'] = np.select([change > 0, change < 0], [2, 0], default=1)
    return labelled

In [None]:
# Replace the label values with the class codes produced by y_to_bin.
y = y_to_bin(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['CPIAUCSL'] = y['CPIAUCSL'].apply(lambda x: 2 if x > 0 else (0 if x < 0 else 1))


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# NOTE(review): train_test_split shuffles rows by default, so with this
# date-indexed data the held-out months are interleaved with training months —
# confirm that a chronological split (shuffle=False) is not what is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check of the split sizes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((376, 25), (95, 25), (376, 1), (95, 1))

# 2. Model training and optimization

In [None]:
import numpy as np
import optuna
import lightgbm as lgb
import pandas as pd
import os
import joblib
import json
from sklearn.metrics import  log_loss
class HyperparameterResearch:
    """
    Runs a sequence of Optuna studies that tune a LightGBM multiclass model.

    After each study the upper bound of a parameter's search range is widened
    when the best trials reached that bound. Every study stores its trials
    (CSV), the pickled study object and the search space (JSON) under
    ``<research_name>/study_<n>/``.

    NOTE(review): the objective reads the notebook globals X_train, X_test,
    y_train and y_test and scores each trial on X_test / y_test, so the
    hyperparameters are selected on the same data that is later reported as
    the test set — confirm whether a separate validation split is intended.
    """

    # LightGBM does not accept sampling fractions above 1.0, so widened ranges
    # for these parameters are clamped to this ceiling.
    _PARAM_UPPER_LIMITS = {"feature_fraction": 1.0, "bagging_fraction": 1.0}

    def __init__(self, research_name, base_params, num_studies, num_trials):
        """
        research_name: directory (created if missing) that receives all results.
        base_params: dict mapping parameter name -> (low, high) search bounds.
        num_studies: number of consecutive studies to run.
        num_trials: number of Optuna trials per study.
        """
        self.research_name = research_name
        self.base_params = base_params
        self.num_studies = num_studies
        self.num_trials = num_trials

        # Ensure research directory exists
        os.makedirs(research_name, exist_ok=True)

    def adjust_param_distribution(self, df_describe, original_params):
        """
        Widen the upper bound of every parameter whose best trials hit it.

        df_describe: ``DataFrame.describe()`` of the best trials (needs the
            'max' and 'std' rows for every parameter in ``original_params``).
        original_params: dict mapping parameter name -> (low, high).

        Returns a new dict; ``original_params`` is left unchanged. A bound is
        moved to ``max + std`` (rounded for integer ranges), a missing (NaN)
        std counts as 0, and known hard limits are never exceeded.
        """
        new_params = original_params.copy()
        for param, (low, high) in original_params.items():
            param_stats = df_describe[param]
            best_max = param_stats['max']
            # Also False when best_max is NaN (no trials to describe)
            if not best_max >= high:
                continue

            spread = param_stats['std']
            if pd.isna(spread):
                # describe() reports NaN std for a single row
                spread = 0.0

            new_max = best_max + spread
            if isinstance(high, int):
                # Integer ranges must stay integer for trial.suggest_int
                new_max = int(round(float(new_max)))
            else:
                new_max = float(new_max)

            limit = self._PARAM_UPPER_LIMITS.get(param)
            if limit is not None:
                new_max = min(new_max, limit)

            new_params[param] = (low, new_max)
        return new_params

    def run_study(self, study_number, params):
        """
        Run one Optuna study over ``params`` and persist its artefacts.

        Returns a DataFrame with one row per trial: 'Trial Number', 'Value'
        (the multiclass log loss) and the sampled parameter values.
        """
        def objective(trial):
            train_x, valid_x, train_y, valid_y = X_train, X_test, y_train, y_test
            dtrain = lgb.Dataset(train_x, label=train_y)

            param = {
                "objective": "multiclass",
                "metric": "multi_logloss",
                "num_class": 3,
                "seed": 42,
                "deterministic":True,
                "device_type": "cpu",
                "verbosity": -1,
                # NOTE(review): LightGBM documents is_unbalance for binary and
                # multiclassova objectives only — confirm it has any effect here.
                "is_unbalance": True,
                "boosting_type": "gbdt",
                "lambda_l1": trial.suggest_float("lambda_l1", params["lambda_l1"][0], params["lambda_l1"][1], log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", params["lambda_l2"][0], params["lambda_l2"][1], log=True),
                "num_leaves": trial.suggest_int("num_leaves", params["num_leaves"][0], params["num_leaves"][1]),
                "feature_fraction": trial.suggest_float("feature_fraction", params["feature_fraction"][0], params["feature_fraction"][1]),
                "bagging_fraction": trial.suggest_float("bagging_fraction", params["bagging_fraction"][0], params["bagging_fraction"][1]),
                "bagging_freq": trial.suggest_int("bagging_freq", params["bagging_freq"][0], params["bagging_freq"][1]),
                "min_child_samples": trial.suggest_int("min_child_samples", params["min_child_samples"][0], params["min_child_samples"][1]),
            }

            gbm = lgb.train(param, dtrain)
            # For the multiclass objective predict() returns one probability
            # per class. Log loss must be computed on these probabilities;
            # rounding them to 0/1 first discards the information it measures.
            preds = gbm.predict(valid_x)
            # labels pins the column order and covers classes absent from valid_y
            return log_loss(np.ravel(valid_y), preds, labels=[0, 1, 2])

        optuna.logging.set_verbosity(optuna.logging.WARN)
        # Named like the studies of the later research runs so that result
        # tables can tell the studies apart.
        study_name = f"{self.research_name}_study_{study_number}"
        study = optuna.create_study(direction="minimize", study_name=study_name)
        study.optimize(objective, n_trials=self.num_trials, show_progress_bar=True, n_jobs=-1)

        trial_results = []
        for trial in study.trials:
            trial_data = {
                'Trial Number': trial.number,
                'Value': trial.value
            }
            trial_data.update(trial.params)
            trial_results.append(trial_data)

        df_trials = pd.DataFrame(trial_results)
        study_dir = os.path.join(self.research_name, f'study_{study_number}')
        os.makedirs(study_dir, exist_ok=True)
        df_trials.to_csv(os.path.join(study_dir, 'trials.csv'), index=False)

        # Save the study
        joblib.dump(study, os.path.join(study_dir, 'study.pkl'))

        # Save the parameter space
        with open(os.path.join(study_dir, 'param_space.json'), 'w') as f:
            json.dump(params, f, indent=4)

        return df_trials

    def run_research(self):
        """
        Run ``num_studies`` studies in sequence.

        Between studies the best fifth of the trials (lowest 'Value') is
        summarised and used to widen the search space for the next study.
        """
        params = self.base_params
        for study_number in range(self.num_studies):
            df_trials = self.run_study(study_number, params)
            if study_number < self.num_studies - 1:
                df_best = df_trials.sort_values(by='Value').iloc[:int(len(df_trials)/5)]
                df_describe = df_best.describe()
                params = self.adjust_param_distribution(df_describe, params)

        print("Research completed and results saved.")


In [None]:
# Define the initial parameter ranges
# Each pair is the (low, high) bound that Optuna samples from.
l1_start, l1_end = 1e-8, 10.0
l2_start, l2_end = 1e-8, 10.0
num_leaves_start, num_leaves_end = 2, 256
feature_fraction_start, feature_fraction_end = 0.4, 1.0
bagging_fraction_start, bagging_fraction_end = 0.4, 1.0
bagging_freq_start, bagging_freq_end = 1, 7
min_child_samples_start, min_child_samples_end = 5, 100

# Search space keyed by parameter name, in the form HyperparameterResearch expects.
base_params = {
    "lambda_l1": (l1_start, l1_end),
    "lambda_l2": (l2_start, l2_end),
    "num_leaves": (num_leaves_start, num_leaves_end),
    "feature_fraction": (feature_fraction_start, feature_fraction_end),
    "bagging_fraction": (bagging_fraction_start, bagging_fraction_end),
    "bagging_freq": (bagging_freq_start, bagging_freq_end),
    "min_child_samples": (min_child_samples_start, min_child_samples_end),
}

# Initialize the research
# research_name is also the directory that receives the results.
research_name = "research_about_cpi_1"
num_studies = 10
num_trials = 100

research = HyperparameterResearch(research_name, base_params, num_studies, num_trials)

# Runs num_studies studies of num_trials trials each.
research.run_research()


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Research completed and results saved.


In [None]:
# Colab only: mount Google Drive so that results can be copied to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the results directory of the first research run to the mounted Drive.
!cp -r /content/research_about_cpi_1 /content/drive/MyDrive

In [None]:
# /content/drive/MyDrive/research_about_cpi_1

In [None]:
# from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, log_loss, classification_report
# import lightgbm as lgb
# from optuna.visualization import (
#     plot_optimization_history,
#     plot_param_importances,
#     plot_parallel_coordinate,
# )

# params = best_trial.params

# model = lgb.LGBMClassifier(**params, verbose=-1)

# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)
# y_prob = model.predict_proba(X_test)

# recall = recall_score(y_test, y_pred, average='macro')
# precision = precision_score(y_test, y_pred, average='macro')
# f1 = f1_score(y_test, y_pred, average='macro')
# accuracy = accuracy_score(y_test, y_pred)
# logloss = log_loss(y_test, y_prob)

# print(f"Log Loss: {logloss}")
# print(f"Accuracy: {accuracy}")
# print(f"Macro Recall: {recall}")
# print(f"Macro Precision: {precision}")
# print(f"Macro F1-score: {f1}")
# print("Classification Report:")
# print(classification_report(y_test, y_pred, target_names=['0', '1', '2']))

In [None]:
import os
import optuna
import joblib
import json
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, log_loss
from sklearn.model_selection import train_test_split
import sklearn.datasets

class HyperparameterResearch:
    """
    Runs a sequence of named Optuna studies that tune a LightGBM multiclass model.

    After each study the upper bound of a parameter's search range is widened
    when the best trials reached that bound. Every study stores its trials
    (CSV), the pickled study object and the search space (JSON) under
    ``<research_name>/study_<n>/``.

    NOTE(review): this class is defined twice in the notebook; this later
    definition replaces the earlier one once its cell has run.

    NOTE(review): the objective reads the notebook globals X_train, X_test,
    y_train and y_test and scores each trial on X_test / y_test, so the
    hyperparameters are selected on the same data that is later reported as
    the test set — confirm whether a separate validation split is intended.
    """

    # LightGBM does not accept sampling fractions above 1.0, so widened ranges
    # for these parameters are clamped to this ceiling.
    _PARAM_UPPER_LIMITS = {"feature_fraction": 1.0, "bagging_fraction": 1.0}

    def __init__(self, research_name, base_params, num_studies, num_trials):
        """
        research_name: directory (created if missing) that receives all results.
        base_params: dict mapping parameter name -> (low, high) search bounds.
        num_studies: number of consecutive studies to run.
        num_trials: number of Optuna trials per study.
        """
        self.research_name = research_name
        self.base_params = base_params
        self.num_studies = num_studies
        self.num_trials = num_trials

        # Ensure research directory exists
        os.makedirs(research_name, exist_ok=True)

    def adjust_param_distribution(self, df_describe, original_params):
        """
        Widen the upper bound of every parameter whose best trials hit it.

        df_describe: ``DataFrame.describe()`` of the best trials (needs the
            'max' and 'std' rows for every parameter in ``original_params``).
        original_params: dict mapping parameter name -> (low, high).

        Returns a new dict; ``original_params`` is left unchanged. A bound is
        moved to ``max + std`` (rounded for integer ranges), a missing (NaN)
        std counts as 0, and known hard limits are never exceeded.
        """
        new_params = original_params.copy()
        for param, (low, high) in original_params.items():
            param_stats = df_describe[param]
            best_max = param_stats['max']
            # Also False when best_max is NaN (no trials to describe)
            if not best_max >= high:
                continue

            spread = param_stats['std']
            if pd.isna(spread):
                # describe() reports NaN std for a single row
                spread = 0.0

            new_max = best_max + spread
            if isinstance(high, int):
                # Integer ranges must stay integer for trial.suggest_int
                new_max = int(round(float(new_max)))
            else:
                new_max = float(new_max)

            limit = self._PARAM_UPPER_LIMITS.get(param)
            if limit is not None:
                new_max = min(new_max, limit)

            new_params[param] = (low, new_max)
        return new_params

    def run_study(self, study_number, params):
        """
        Run one Optuna study over ``params`` and persist its artefacts.

        The study is named ``<research_name>_study_<study_number>``.
        Returns a DataFrame with one row per trial: 'Trial Number', 'Value'
        (the multiclass log loss) and the sampled parameter values.
        """
        def objective(trial):
            train_x, valid_x, train_y, valid_y = X_train, X_test, y_train, y_test
            dtrain = lgb.Dataset(train_x, label=train_y)

            param = {
                "objective": "multiclass",
                "metric": "multi_logloss",
                "num_class": 3,
                "seed": 42,
                "deterministic":True,
                "device_type": "cpu",
                "verbosity": -1,
                # NOTE(review): LightGBM documents is_unbalance for binary and
                # multiclassova objectives only — confirm it has any effect here.
                "is_unbalance": True,
                "boosting_type": "gbdt",
                "lambda_l1": trial.suggest_float("lambda_l1", params["lambda_l1"][0], params["lambda_l1"][1], log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", params["lambda_l2"][0], params["lambda_l2"][1], log=True),
                "num_leaves": trial.suggest_int("num_leaves", params["num_leaves"][0], params["num_leaves"][1]),
                "feature_fraction": trial.suggest_float("feature_fraction", params["feature_fraction"][0], params["feature_fraction"][1]),
                "bagging_fraction": trial.suggest_float("bagging_fraction", params["bagging_fraction"][0], params["bagging_fraction"][1]),
                "bagging_freq": trial.suggest_int("bagging_freq", params["bagging_freq"][0], params["bagging_freq"][1]),
                "min_child_samples": trial.suggest_int("min_child_samples", params["min_child_samples"][0], params["min_child_samples"][1]),
            }

            gbm = lgb.train(param, dtrain)
            # For the multiclass objective predict() returns one probability
            # per class. Log loss must be computed on these probabilities;
            # rounding them to 0/1 first discards the information it measures.
            preds = gbm.predict(valid_x)
            # labels pins the column order and covers classes absent from valid_y
            return log_loss(np.ravel(valid_y), preds, labels=[0, 1, 2])

        study_name = f"{self.research_name}_study_{study_number}"
        study = optuna.create_study(direction="minimize", study_name=study_name)
        study.optimize(objective, n_trials=self.num_trials, show_progress_bar=True, n_jobs=-1)

        trial_results = []
        for trial in study.trials:
            trial_data = {
                'Trial Number': trial.number,
                'Value': trial.value
            }
            trial_data.update(trial.params)
            trial_results.append(trial_data)

        df_trials = pd.DataFrame(trial_results)
        study_dir = os.path.join(self.research_name, f'study_{study_number}')
        os.makedirs(study_dir, exist_ok=True)
        df_trials.to_csv(os.path.join(study_dir, 'trials.csv'), index=False)

        # Save the study
        joblib.dump(study, os.path.join(study_dir, 'study.pkl'))

        # Save the parameter space
        with open(os.path.join(study_dir, 'param_space.json'), 'w') as f:
            json.dump(params, f, indent=4)

        return df_trials

    def run_research(self):
        """
        Run ``num_studies`` studies in sequence.

        Between studies the best fifth of the trials (lowest 'Value') is
        summarised and used to widen the search space for the next study.
        """
        params = self.base_params
        for study_number in range(self.num_studies):
            df_trials = self.run_study(study_number, params)
            if study_number < self.num_studies - 1:
                df_best = df_trials.sort_values(by='Value').iloc[:int(len(df_trials)/5)]
                df_describe = df_best.describe()
                params = self.adjust_param_distribution(df_describe, params)

        print("Research completed and results saved.")


In [None]:
# Define the initial parameter ranges
# Each pair is the (low, high) bound that Optuna samples from.
l1_start, l1_end = 1e-8, 10.0
l2_start, l2_end = 1e-8, 10.0
num_leaves_start, num_leaves_end = 2, 256
feature_fraction_start, feature_fraction_end = 0.4, 1.0
bagging_fraction_start, bagging_fraction_end = 0.4, 1.0
bagging_freq_start, bagging_freq_end = 1, 7
min_child_samples_start, min_child_samples_end = 5, 100

# Search space keyed by parameter name, in the form HyperparameterResearch expects.
base_params = {
    "lambda_l1": (l1_start, l1_end),
    "lambda_l2": (l2_start, l2_end),
    "num_leaves": (num_leaves_start, num_leaves_end),
    "feature_fraction": (feature_fraction_start, feature_fraction_end),
    "bagging_fraction": (bagging_fraction_start, bagging_fraction_end),
    "bagging_freq": (bagging_freq_start, bagging_freq_end),
    "min_child_samples": (min_child_samples_start, min_child_samples_end),
}

# Initialize the research
# research_name is also the directory that receives the results.
research_name = "research_about_cpi_3"
num_studies = 10
num_trials = 100

research = HyperparameterResearch(research_name, base_params, num_studies, num_trials)

# Runs num_studies studies of num_trials trials each.
research.run_research()
# 2.655848
# (presumably an objective value noted from an earlier run — TODO confirm)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Research completed and results saved.


In [None]:
import os

class StudyAnalyzer:
    """
    Re-trains and scores the best trial of every saved study in a research folder.

    For each ``<research_folder>/<sub_dir>/study.pkl`` the best trial's
    parameters are used to fit an LGBMClassifier on a fresh train/test split
    of ``X`` / ``y``; the resulting metrics are collected into one table.
    """

    def __init__(self, research_folder, X, y, test_size=0.25, random_state=42):
        """
        research_folder: directory containing one sub-directory per study.
        X, y: features and labels that are split for the evaluation.
        test_size, random_state: forwarded to train_test_split. The defaults
            keep the previous behaviour; pass the values used during tuning
            to evaluate on the same split.
        """
        self.research_folder = research_folder
        self.X = X
        self.y = y
        self.test_size = test_size
        self.random_state = random_state
        self.results = []

    def load_study(self, study_path):
        """Load a pickled Optuna study from ``study_path``."""
        # SECURITY: joblib.load unpickles the file and can therefore execute
        # arbitrary code; only load studies from a trusted source.
        study = joblib.load(study_path)
        return study

    def analyze_study(self, study):
        """
        Fit a model with the study's best parameters and return its metrics.

        Returns a flat dict with the study name, the best trial value, log
        loss, accuracy, macro recall/precision/F1 and the flattened
        classification report.
        """
        best_trial = study.best_trial
        params = best_trial.params

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state
        )
        # scikit-learn expects 1-D label arrays; y may arrive as a one-column frame
        y_train = np.ravel(y_train)
        y_test = np.ravel(y_test)

        # Train the model
        model = lgb.LGBMClassifier(**params, verbose=-1)
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)

        # zero_division=0 keeps the score of a never-predicted class at 0.0
        # without emitting an undefined-metric warning.
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        accuracy = accuracy_score(y_test, y_pred)
        # The columns of y_prob follow model.classes_; passing them as labels
        # keeps log_loss valid when y_test lacks one of the classes.
        logloss = log_loss(y_test, y_prob, labels=model.classes_)
        class_report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

        flat_report = self.flatten_classification_report(class_report)

        return {
            "study_name": study.study_name,
            "best_trial_value": best_trial.value,
            "log_loss": logloss,
            "accuracy": accuracy,
            "macro_recall": recall,
            "macro_precision": precision,
            "macro_f1": f1,
            **flat_report
        }

    def flatten_classification_report(self, report):
        """
        Flatten a nested classification-report dict into one level.

        Nested entries become '<label>_<metric>' keys, scalar entries keep
        their label; spaces in keys are replaced with underscores.
        """
        flat_report = {}
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    column_name = f"{label}_{metric_name}".replace(' ', '_')
                    flat_report[column_name] = value
            else:
                column_name = label.replace(' ', '_')
                flat_report[column_name] = metrics
        return flat_report

    def run_analysis(self):
        """
        Analyze every study found in the research folder.

        Results are rebuilt from scratch on each call, in sorted directory
        order, so repeated calls neither duplicate nor reorder rows.
        """
        self.results = []
        for study_dir in sorted(os.listdir(self.research_folder)):
            study_path = os.path.join(self.research_folder, study_dir, "study.pkl")
            # Entries without a study.pkl (plain files, other folders) are skipped
            if os.path.exists(study_path):
                study = self.load_study(study_path)
                result = self.analyze_study(study)
                self.results.append(result)

    def get_results_df(self):
        """Return the collected results as a DataFrame, one row per study."""
        return pd.DataFrame(self.results)

    def save_results(self):
        """Write the results to '<research_folder>/<folder_name>_best_trials.csv'."""
        results_df = self.get_results_df()
        research_folder_name = os.path.basename(self.research_folder.rstrip('/'))
        csv_file_name = f"{research_folder_name}_best_trials.csv"
        output_path = os.path.join(self.research_folder, csv_file_name)

        results_df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")


In [None]:
%%capture
# %%capture hides everything this cell prints, including warnings.
# NOTE(review): no visible cell creates 'research_about_cpi_2' (the research
# runs above write research_about_cpi_1 and research_about_cpi_3) — confirm
# where this folder comes from.
research_folder = '/content/research_about_cpi_2'

# Evaluate the best trial of every saved study and write the summary CSV.
analyzer = StudyAnalyzer(research_folder, X, y)
analyzer.run_analysis()
analyzer.save_results()

In [None]:
# Load the summary CSV written by StudyAnalyzer.save_results (this reuses the name df).
df = pd.read_csv('/content/research_about_cpi_2/research_about_cpi_2_best_trials.csv')

In [None]:
# One row per study: best trial value plus the re-evaluated metrics.
df

Unnamed: 0,study_name,best_trial_value,log_loss,accuracy,macro_recall,macro_precision,macro_f1,0_precision,0_recall,0_f1-score,0_support,1_precision,1_recall,1_f1-score,1_support,2_precision,2_recall,2_f1-score,2_support,macro_avg_precision,macro_avg_recall,macro_avg_f1-score,macro_avg_support,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1-score,weighted_avg_support
0,research_about_cpi_2_study_5,2.655848,0.455006,0.881356,0.384416,0.434218,0.39526,0.4,0.181818,0.25,11,0.0,0.0,0.0,2,0.902655,0.971429,0.93578,105,0.434218,0.384416,0.39526,118,0.840498,0.881356,0.855991,118
1,research_about_cpi_2_study_4,2.655848,0.482302,0.898305,0.417893,0.503835,0.439985,0.6,0.272727,0.375,11,0.0,0.0,0.0,2,0.911504,0.980952,0.944954,105,0.503835,0.417893,0.439985,118,0.867017,0.898305,0.875807,118
2,research_about_cpi_2_study_3,2.655848,0.344169,0.90678,0.58456,0.839881,0.663658,0.6,0.272727,0.375,11,1.0,0.5,0.666667,2,0.919643,0.980952,0.949309,105,0.839881,0.58456,0.663658,118,0.891208,0.90678,0.890981,118
3,research_about_cpi_2_study_8,2.655848,0.328888,0.90678,0.393939,0.635057,0.419306,1.0,0.181818,0.307692,11,0.0,0.0,0.0,2,0.905172,1.0,0.950226,105,0.635057,0.393939,0.419306,118,0.89867,0.90678,0.874223,118
4,research_about_cpi_2_study_0,1.920163,0.330646,0.898305,0.390765,0.526316,0.411829,0.666667,0.181818,0.285714,11,0.0,0.0,0.0,2,0.912281,0.990476,0.949772,105,0.526316,0.390765,0.411829,118,0.873922,0.898305,0.87177,118
5,research_about_cpi_2_study_6,2.655848,0.319078,0.90678,0.421068,0.554094,0.449924,0.75,0.272727,0.4,11,0.0,0.0,0.0,2,0.912281,0.990476,0.949772,105,0.554094,0.421068,0.449924,118,0.88169,0.90678,0.882424,118
6,research_about_cpi_2_study_7,2.655848,0.402552,0.90678,0.421068,0.554094,0.449924,0.75,0.272727,0.4,11,0.0,0.0,0.0,2,0.912281,0.990476,0.949772,105,0.554094,0.421068,0.449924,118,0.88169,0.90678,0.882424,118
7,research_about_cpi_2_study_9,2.655848,0.56737,0.915254,0.424242,0.637681,0.461039,1.0,0.272727,0.428571,11,0.0,0.0,0.0,2,0.913043,1.0,0.954545,105,0.637681,0.424242,0.461039,118,0.905674,0.915254,0.889335,118
8,research_about_cpi_2_study_2,2.29957,0.328407,0.915254,0.451371,0.573451,0.484709,0.8,0.363636,0.5,11,0.0,0.0,0.0,2,0.920354,0.990476,0.954128,105,0.573451,0.451371,0.484709,118,0.893535,0.915254,0.895623,118
9,research_about_cpi_2_study_1,2.655848,0.597893,0.915254,0.424242,0.637681,0.461039,1.0,0.272727,0.428571,11,0.0,0.0,0.0,2,0.913043,1.0,0.954545,105,0.637681,0.424242,0.461039,118,0.905674,0.915254,0.889335,118
