In [7]:
# !pip uninstall -y scikit-learn
# !pip install scikit-learn==1.3.1


In [23]:
# !pip install catboost
# !pip install xgboost
# !pip install --upgrade xgboost
!pip install optuna



[notice] A new release of pip is available: 23.3.2 -> 24.3.1
[notice] To update, run: C:\Users\Asus\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading SQLAlchemy-2.0.36-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting PyYAML (from optuna)
  Downloading PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.1.1-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Downloading MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
   ---------------------------------------- 0.0/364.4 kB ? eta -:--

In [30]:
import optuna
from optuna.samplers import TPESampler

In [21]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.metrics import r2_score

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

from sklearn import linear_model, ensemble, naive_bayes
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import f1_score



# ====================================
# Import necessary libraries for modeling
# ====================================

# Models
from catboost import CatBoostRegressor

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Evaluation metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


import warnings
warnings.filterwarnings('ignore')

In [10]:
RANDOM_SEED =42

In [11]:

# Define a function to create new features
def create_new_features(df):
    # Avoid division by zero by replacing zeros with a small number
    df['net_sales'].replace(0, 1e-6, inplace=True)
    df['total_assets'].replace(0, 1e-6, inplace=True)
    df['total_assets_minus_liabilities'] = df['total_assets'] - df['total_liabilities']
    df['total_assets_minus_liabilities'].replace(0, 1e-6, inplace=True)
    df['total_receivables'].replace(0, 1e-6, inplace=True)
    
    # Gross Profit Margin
    df['gross_profit_margin'] = df['gross_profit'] / df['net_sales']
    
    # EBITDA Margin
    df['ebitda_margin'] = df['ebitda'] / df['net_sales']
    
    # EBIT Margin
    df['ebit_margin'] = df['ebit'] / df['net_sales']
    
    # Asset Turnover
    df['asset_turnover'] = df['net_sales'] / df['total_assets']
    
    # Debt-to-Equity Ratio
    df['debt_to_equity'] = df['total_liabilities'] / df['total_assets_minus_liabilities']
    
    # Receivables Turnover
    df['receivables_turnover'] = df['net_sales'] / df['total_receivables']
    
    # Operating Expense Ratio
    df['operating_expense_ratio'] = df['total_operating_expenses'] / df['net_sales']
    
    # Depreciation and Amortization Ratio
    df['depreciation_amortization_ratio'] = df['depreciation_and_amortization'] / df['total_assets']
    
    # Drop intermediate columns if not needed
    df.drop(['total_assets_minus_liabilities'], axis=1, inplace=True)
    
    return df

In [12]:
import pandas as pd
import numpy as np

# Load the training data
train = pd.read_csv('company_train.csv', index_col='ID')
df_regress = train.copy()

# Map 'status_label' to numerical values
status_mapping = {'alive': 1, 'failed': 0}
df_regress['status_label'] = df_regress['status_label'].map(status_mapping)



# Create new features in the training data
df_regress = create_new_features(df_regress)

# One-Hot Encode 'company_name' along with other categorical variables
# Assuming 'company_name' is the only categorical variable remaining
X_reg = pd.get_dummies(
    df_regress.drop(columns=['retained_earnings', 'net_profit_margin_category']),
    drop_first=True
)

# Define target variable
y_reg = df_regress['retained_earnings']

print("Features used for training:")
print(X_reg.columns.tolist())

# Load the test dataset
X_test_ori = pd.read_csv("company_test_regress.csv", index_col='ID')

# Map 'status_label' to numerical values
X_test_ori['status_label'] = X_test_ori['status_label'].map(status_mapping)

# Create new features in the test data
X_test_ori = create_new_features(X_test_ori)

# One-Hot Encode 'company_name' and other categorical variables in the test set
X_test_final = pd.get_dummies(
    X_test_ori,
    drop_first=True
)
    
# Align the test data with the training data by adding missing columns with zeros
X_reg, X_test_final = X_reg.align(X_test_final, join='left', axis=1, fill_value=0)

# Ensure no columns are missing or extra
missing_cols = set(X_reg.columns) - set(X_test_final.columns)
for col in missing_cols:
    X_test_final[col] = 0

# Reorder columns in test set to match training set
X_test_final = X_test_final[X_reg.columns]

# Final check
print("\nFinal training features shape:", X_reg.shape)
print("Final test features shape:", X_test_final.shape)

# Optional: Drop 'year' if it's not needed or create time-based features
# For example, creating a 'age' feature assuming data is up to a certain year
# current_year = 2023
# X_reg['age'] = current_year - X_reg['year']
# X_test_final['age'] = current_year - X_test_final['year']
# X_reg.drop('year', axis=1, inplace=True)
# X_test_final.drop('year', axis=1, inplace=True)


Features used for training:
['status_label', 'year', 'cost_of_goods_sold', 'depreciation_and_amortization', 'ebitda', 'inventory', 'total_receivables', 'market_value', 'net_sales', 'total_assets', 'total_long_term_debt', 'ebit', 'gross_profit', 'total_liabilities', 'total_operating_expenses', 'current_ratio', 'gross_profit_margin', 'ebitda_margin', 'ebit_margin', 'asset_turnover', 'debt_to_equity', 'receivables_turnover', 'operating_expense_ratio', 'depreciation_amortization_ratio', 'company_name_C_10', 'company_name_C_100', 'company_name_C_1000', 'company_name_C_1001', 'company_name_C_1002', 'company_name_C_1003', 'company_name_C_1004', 'company_name_C_1005', 'company_name_C_1006', 'company_name_C_1007', 'company_name_C_1008', 'company_name_C_1009', 'company_name_C_101', 'company_name_C_1010', 'company_name_C_1011', 'company_name_C_1012', 'company_name_C_1013', 'company_name_C_1014', 'company_name_C_1015', 'company_name_C_1016', 'company_name_C_1017', 'company_name_C_1018', 'company_n

In [13]:
# feature_names = pd.get_dummies(X_reg).columns

# mi = mutual_info_regression(pd.get_dummies(X_reg), y_reg, random_state=0)

# mi_df = pd.DataFrame({'Feature': feature_names, 'Importance': mi})

# mi_df = mi_df.sort_values(by='Importance', ascending=False)

# plt.figure(figsize=(6, 6))
# plt.barh(mi_df['Feature'], mi_df['Importance'], color='skyblue')
# plt.xlabel('Mutual Information')
# plt.title('Feature Importance based on Mutual Information')
# plt.gca().invert_yaxis()
# plt.show()

In [14]:
models_reg = {
    'xgboost': xgb.XGBRegressor(random_state=RANDOM_SEED),
    'lightgbm': lgbm.LGBMRegressor(random_state=RANDOM_SEED, verbose=0),
    'random_forest': ensemble.RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1),
    'hist_gradient_boosting': ensemble.HistGradientBoostingRegressor(random_state=RANDOM_SEED),
    'extra_trees': ensemble.ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=-1),
    'catboost': cb.CatBoostRegressor(random_state=RANDOM_SEED, verbose=0),
    'ridge': make_pipeline(
        StandardScaler(),
        linear_model.Ridge()),
}

In [15]:
X_reg.head()

Unnamed: 0_level_0,status_label,year,cost_of_goods_sold,depreciation_and_amortization,ebitda,inventory,total_receivables,market_value,net_sales,total_assets,...,company_name_C_990,company_name_C_991,company_name_C_992,company_name_C_993,company_name_C_994,company_name_C_995,company_name_C_996,company_name_C_997,company_name_C_998,company_name_C_999
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CQZW3V9G,1,1999,833.107,18.373,89.031,336.018,128.348,372.7519,1024.333,740.998,...,False,False,False,False,False,False,False,False,False,False
CIRJ6YT8,1,2000,713.811,18.577,64.367,320.59,115.187,377.118,874.255,701.854,...,False,False,False,False,False,False,False,False,False,False
CAHLFH0N,1,2001,526.477,22.496,27.207,286.588,77.528,364.5928,638.721,710.199,...,False,False,False,False,False,False,False,False,False,False
CBU4UE1T,1,2002,496.747,27.172,30.745,259.954,66.322,143.3295,606.337,686.621,...,False,False,False,False,False,False,False,False,False,False
C0DQ4A9M,1,2003,523.302,26.68,47.491,247.245,104.661,308.9071,651.958,709.292,...,False,False,False,False,False,False,False,False,False,False


In [16]:
y_reg.head()

ID
CQZW3V9G    201.026
CIRJ6YT8    204.065
CAHLFH0N    139.603
CBU4UE1T    124.106
C0DQ4A9M    131.884
Name: retained_earnings, dtype: float64

In [17]:
# Define the custom cross-validation function
from sklearn.model_selection import KFold
def cross_val_custom_r2(estimator, X, _y, n_splits=5, random_state=42):
    """
    Perform k-fold cross-validation and compute R² scores.

    Parameters:
    - estimator: scikit-learn compatible regressor.
    - X (pd.DataFrame): Feature matrix.
    - _y (pd.Series or array-like): Target vector.
    - n_splits (int): Number of folds for cross-validation.
    - random_state (int): Seed for reproducibility.

    Returns:
    - np.ndarray: Array of R² scores for each fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = []

    y = _y

    for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the estimator on the training data
        estimator.fit(X_train, y_train)

        # Make predictions on the validation data
        y_pred = estimator.predict(X_test)

        # Compute the R² score for the current fold
        score = r2_score(y_test, y_pred)

        # Append the score to the results list
        results.append(score)

        # (Optional) Print progress
        print(f"Fold {fold}: R² Score = {score:.4f}")

    return np.array(results)




In [18]:
scores = pd.DataFrame(index=models_reg.keys(), columns=['mean r2', 'var r2', 'fit time', 'fold 1', 'fold 2', 'fold 3', 'fold 4', 'fold 5'])
n = 0
# Iterate through each model and perform cross-validation
for name, model in models_reg.items():
    print(f"\nEvaluating {name}...")
    try:
        start_time = time.time()
        # Perform cross-validation and get R² scores for each fold
        fold_scores = cross_val_custom_r2(model, X_reg, y_reg, n_splits=5, random_state=RANDOM_SEED)
        end_time = time.time()

        # Calculate mean and variance of R² scores
        mean_r2 = fold_scores.mean()
        var_r2 = fold_scores.var()

        # Calculate total fit time
        fit_time = end_time - start_time

        # Assign values to the scores DataFrame
        scores.loc[name] = [mean_r2, var_r2, fit_time, *fold_scores]

        print(f"{name} completed successfully.\n")
    except Exception as e:
        print(f"Error evaluating {name}: {e}")
        # Assign NaN to indicate failure
        scores.loc[name] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]

display(scores.sort_values(by='mean r2', ascending=False))


Evaluating xgboost...
Fold 1: R² Score = 0.9549
Fold 2: R² Score = 0.9252
Fold 3: R² Score = 0.9640
Fold 4: R² Score = 0.9519
Fold 5: R² Score = 0.9371
xgboost completed successfully.


Evaluating lightgbm...
Fold 1: R² Score = 0.6956
Fold 2: R² Score = 0.6859
Fold 3: R² Score = 0.7492
Fold 4: R² Score = 0.8182
Fold 5: R² Score = 0.8396
lightgbm completed successfully.


Evaluating random_forest...
Fold 1: R² Score = 0.9480
Fold 2: R² Score = 0.8929
Fold 3: R² Score = 0.9011
Fold 4: R² Score = 0.9147
Fold 5: R² Score = 0.9105
random_forest completed successfully.


Evaluating hist_gradient_boosting...
Fold 1: R² Score = 0.6925
Fold 2: R² Score = 0.6615
Fold 3: R² Score = 0.7217
Fold 4: R² Score = 0.7995
Fold 5: R² Score = 0.6213
hist_gradient_boosting completed successfully.


Evaluating extra_trees...
Fold 1: R² Score = 0.9756
Fold 2: R² Score = 0.9473
Fold 3: R² Score = 0.9489
Fold 4: R² Score = 0.9442
Fold 5: R² Score = 0.9532
extra_trees completed successfully.


Evaluating catboo

Unnamed: 0,mean r2,var r2,fit time,fold 1,fold 2,fold 3,fold 4,fold 5
extra_trees,0.953839,0.000127,13145.915026,0.975628,0.947259,0.948892,0.944198,0.953217
xgboost,0.946611,0.00019,423.653524,0.954943,0.925183,0.963965,0.951876,0.937086
catboost,0.945056,0.000137,433.806422,0.959861,0.928861,0.939724,0.957115,0.93972
random_forest,0.913426,0.000357,7334.193715,0.948048,0.892854,0.901109,0.914661,0.910457
ridge,0.861055,0.001526,731.393552,0.876502,0.832603,0.901614,0.799544,0.895011
lightgbm,0.757707,0.003888,103.621433,0.695635,0.68591,0.749218,0.818175,0.839596
hist_gradient_boosting,0.699294,0.00362,889.618777,0.692487,0.661511,0.721729,0.799474,0.621268


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, RegressorMixin


In [None]:
# Assuming X and y are your features and target variable
n_bins = 10  # Define the number of bins
y_binned = pd.qcut(y_reg, q=n_bins, duplicates='drop')  # Create quantile-based bins


X_train, X_test, y_train, y_test = train_test_split(
    X_reg, y_reg, 
    test_size=0.2, 
    random_state=RANDOM_SEED,
    stratify=y_binned  # Ensures class distribution is preserved
)



# 6. Define the Objective Function for Optuna
def objective(trial):
    # Define the hyperparameter space
    param = {
        'objective': 'reg:squarederror',
        'random_state': RANDOM_SEED,
        'verbosity': 0,
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        # Fixed or default hyperparameters:
        'gamma': 0,           # Fixed at 0
        'min_child_weight': 1,  # Fixed at 1
        'reg_alpha': 0,      # Fixed at 0
        'reg_lambda': 1.0,   # Fixed at 1.0
    }
    
    # Initialize the model with early stopping
    model = xgb.XGBRegressor(
        **param,
        eval_set=[(X_test, y_test)],  # Ideally, use a separate validation set
        tree_method='auto'  # Use 'gpu_hist' if GPU is available
    )
    
    # Perform cross-validation
    scores = cross_val_score(
        model, X_train, y_train,
        scoring='r2',
        cv=3,              # Number of cross-validation folds
        n_jobs=-1,         # Utilize all available cores
        verbose=0
    )
    
    # Return the mean R² score
    return scores.mean()
# 7. Create and Run the Optuna Study
print("\n=== Starting Optuna Hyperparameter Optimization ===")
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=RANDOM_SEED))
study.optimize(objective, n_trials=10, timeout=None)  # You can set a timeout in seconds

# 8. Display the Best Hyperparameters
print("\n=== Optuna Hyperparameter Optimization Completed ===")
print(f"Best Trial: {study.best_trial.number}")
print("Best Parameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")
print(f"Best R² Score: {study.best_trial.value:.4f}")

# 9. Train the Best Model on the Entire Training Set
best_params = study.best_trial.params
best_params.update({
    'objective': 'reg:squarederror',
    'random_state': RANDOM_SEED,
    'verbosity': 0,
    'gamma': 0,
    'min_child_weight': 1,
    'reg_alpha': 0,
    'reg_lambda': 1.0
})

# Initialize the best model with early stopping
best_model = xgb.XGBRegressor(
    **best_params,
    eval_set=[(X_test, y_test)],  # Ideally, use a separate validation set
    tree_method='auto'  # Use 'gpu_hist' if GPU is available
)

# Fit the best model
print("\nTraining the Best Model on the Entire Training Set...")
best_model.fit(X_train, y_train)

# 10. Evaluate the Best Model on the Test Set
print("\n=== Evaluating the Best Model on the Test Set ===")
y_pred_best = best_model.predict(X_test)
test_r2_best = r2_score(y_test, y_pred_best)
test_rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"Test R² Score: {test_r2_best:.4f}")
print(f"Test RMSE: {test_rmse_best:.4f}")



[I 2025-01-01 12:58:39,915] A new study created in memory with name: no-name-b3f13dd2-4641-4f0f-b2c7-a25309dd95f2



=== Starting Optuna Hyperparameter Optimization ===


[I 2025-01-01 13:05:19,359] Trial 0 finished with value: 0.8831614905054429 and parameters: {'n_estimators': 137, 'max_depth': 8, 'learning_rate': 0.14907884894416698, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746}. Best is trial 0 with value: 0.8831614905054429.
[I 2025-01-01 13:10:39,612] Trial 1 finished with value: 0.8535624404349699 and parameters: {'n_estimators': 115, 'max_depth': 3, 'learning_rate': 0.1745734676972377, 'subsample': 0.8404460046972835, 'colsample_bytree': 0.8832290311184181}. Best is trial 0 with value: 0.8831614905054429.
[I 2025-01-01 13:15:47,295] Trial 2 finished with value: 0.8743488524349595 and parameters: {'n_estimators': 102, 'max_depth': 8, 'learning_rate': 0.16816410175208013, 'subsample': 0.6849356442713105, 'colsample_bytree': 0.6727299868828402}. Best is trial 0 with value: 0.8831614905054429.
[I 2025-01-01 13:21:32,220] Trial 3 finished with value: 0.8601262847526625 and parameters: {'n_estimators': 118, 'max_depth': 4, '


=== Optuna Hyperparameter Optimization Completed ===
Best Trial: 0
Best Parameters:
  n_estimators: 137
  max_depth: 8
  learning_rate: 0.14907884894416698
  subsample: 0.8394633936788146
  colsample_bytree: 0.6624074561769746
Best R² Score: 0.8832

Training the Best Model on the Entire Training Set...

=== Evaluating the Best Model on the Test Set ===
Test R² Score: 0.9190
Test RMSE: 1470.5498

=== Applying Custom Cross-Validation Function on Best Model ===


AttributeError: 'XGBRegressor' object has no attribute 'model'

In [36]:

# 11. Apply the Custom Cross-Validation Function to the Best Model
print("\n=== Applying Custom Cross-Validation Function on Best Model ===")
custom_cv_scores = cross_val_custom_r2(best_model, X_reg, y_reg, n_splits=5, random_state=RANDOM_SEED)

print("\n=== Fold-wise R² Scores ===")
for fold_number, score in enumerate(custom_cv_scores, start=1):
    print(f"Fold {fold_number}: R² Score = {score:.4f}")


=== Applying Custom Cross-Validation Function on Best Model ===
Fold 1: R² Score = 0.9584
Fold 2: R² Score = 0.9053
Fold 3: R² Score = 0.9400
Fold 4: R² Score = 0.9290
Fold 5: R² Score = 0.9206

=== Fold-wise R² Scores ===
Fold 1: R² Score = 0.9584
Fold 2: R² Score = 0.9053
Fold 3: R² Score = 0.9400
Fold 4: R² Score = 0.9290
Fold 5: R² Score = 0.9206


0.9307

In [None]:
# best_model.fit(X_reg, y_reg)

In [None]:
# y_pred_final = best_model.predict(X_test_final)
# X_test_ori = pd.read_csv("company_test_regress.csv")
# # Create submission DataFrame
# submission = pd.DataFrame()
# submission['ID'] = X_test_ori['ID']
# submission['retained_earnings'] = y_pred_final
# # Save submission to CSV
# submission_filename = f'submission_xgb_tuned.csv'
# submission.to_csv(submission_filename, index=False)
# print(f"Submission file saved as: {submission_filename}\n")


Submission file saved as: submission_xgb_tuned.csv



In [62]:
# submission.head()

In [63]:
# scores.head()