In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error
import warnings
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")


SEED = 42
n_splits = 8
number = 5000
early_stopping_rounds = 100

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
eq_df = pd.read_csv("eq.csv")
sample_submission = pd.read_csv("sample_submission.csv")



# test_df.describe()
# test_df.describe()
# test_df.info()
train_df.info()
train_df = train_df.drop(columns=["Unnamed: 0"])

print(f"Training data shape: {train_df.shape}")
print(f"Earthquake data shape: {eq_df.shape}")
print(f"Test data shape: {test_df.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1250 non-null   int64  
 1   year        1250 non-null   int64  
 2   month       1250 non-null   int64  
 3   day         1250 non-null   int64  
 4   hour        1250 non-null   int64  
 5   min         1250 non-null   int64  
 6   sec         1250 non-null   int64  
 7   lat         1250 non-null   float64
 8   lon         1250 non-null   float64
 9   depth       1250 non-null   int64  
 10  class       1250 non-null   float64
 11  year_as     1250 non-null   int64  
 12  month_as    1250 non-null   int64  
 13  day_as      1250 non-null   int64  
 14  hour_as     1250 non-null   int64  
 15  min_as      1250 non-null   int64  
 16  sec_as      1250 non-null   int64  
 17  lat_as      1250 non-null   float64
 18  lon_as      1250 non-null   float64
 19  depth_as    1250 non-null  

In [3]:
target_cols = ["year_as", "month_as", "day_as", "hour_as", "min_as", "sec_as",
                  "lat_as", "lon_as", "depth_as", "class_as"]

X = train_df.drop(columns=target_cols)
y = train_df[target_cols]

X_test = test_df[X.columns].copy()

In [4]:
nFolds = 7
cv = KFold(n_splits=nFolds, random_state=SEED, shuffle=True)

predictions = {col: np.zeros(len(X_test)) for col in target_cols}

for col in target_cols:
    fold_preds = np.zeros(len(X_test))
    print(f"Training for target: {col}")
    for train_idx, valid_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx][col]
        X_valid, y_valid = X.iloc[valid_idx].copy(), y.iloc[valid_idx][col]
        
        model = lgb.LGBMRegressor(
            n_estimators=number,
            max_depth=-1,
            num_leaves=1024,
            colsample_bytree=0.7,
            learning_rate=0.03,
            objective='regression',
            metric='mae',
            verbosity=-1,  
            device='gpu',  
            random_state=SEED
        )
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='mae',
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True)]
        )

        
        fold_preds += model.predict(X_test) / nFolds
    predictions[col] = fold_preds

Training for target: year_as


  File "c:\Users\saila\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\saila\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\saila\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\saila\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2876]	valid_0's l1: 0.410366
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[972]	valid_0's l1: 0.444361
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1890]	valid_0's l1: 0.472169
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[884]	valid_0's l1: 0.434098
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1580]	valid_0's l1: 0.499354
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1739]	valid_0's l1: 0.435797
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1190]	valid_0's l1: 0.439762
Training for target: month_as
Training until validation scores don't improve for 100 rounds
Early stopping, best 

In [5]:
lgb_preds = predictions

In [14]:
nFolds = 7
cv = KFold(n_splits=nFolds, random_state=SEED, shuffle=True)

predictions = {col: np.zeros(len(X_test)) for col in target_cols}

for col in target_cols:
    fold_preds = np.zeros(len(X_test))
    print(f"Training for target: {col}")
    for train_idx, valid_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx][col]
        X_valid, y_valid = X.iloc[valid_idx].copy(), y.iloc[valid_idx][col]

        
        model = model = xgb.XGBRegressor(
            objective='reg:squarederror',
            eval_metric='mae',
            n_estimators=1500,        
            learning_rate=0.08,
            max_depth=15,
            subsample=1.0,
            colsample_bytree=0.7,
            reg_alpha=0.8,
            reg_lambda=4,
            seed=SEED,                 
            tree_method='gpu_hist',        
            device='gpu' ,
        )


        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose = True, 
        )

        fold_preds += np.maximum(model.predict(X_test), 0) / nFolds
    predictions[col] = fold_preds

Training for target: year_as
[0]	validation_0-mae:8.96016
[1]	validation_0-mae:8.26339
[2]	validation_0-mae:7.86054
[3]	validation_0-mae:7.25071
[4]	validation_0-mae:6.69382
[5]	validation_0-mae:6.46692
[6]	validation_0-mae:5.97850
[7]	validation_0-mae:5.53304
[8]	validation_0-mae:5.12259
[9]	validation_0-mae:4.73981
[10]	validation_0-mae:4.38888
[11]	validation_0-mae:4.05905
[12]	validation_0-mae:3.76425
[13]	validation_0-mae:3.60078
[14]	validation_0-mae:3.34460
[15]	validation_0-mae:3.20527
[16]	validation_0-mae:3.06092
[17]	validation_0-mae:2.98791
[18]	validation_0-mae:2.79092
[19]	validation_0-mae:2.61137
[20]	validation_0-mae:2.58427
[21]	validation_0-mae:2.42335
[22]	validation_0-mae:2.34332
[23]	validation_0-mae:2.20439
[24]	validation_0-mae:2.07640
[25]	validation_0-mae:1.95729
[26]	validation_0-mae:1.84609
[27]	validation_0-mae:1.74686
[28]	validation_0-mae:1.65552
[29]	validation_0-mae:1.57089
[30]	validation_0-mae:1.53653
[31]	validation_0-mae:1.51805
[32]	validation_0-mae

In [15]:
xgb_preds = predictions

In [16]:
nFolds = 7
cv = KFold(n_splits=nFolds, random_state=SEED, shuffle=True)

predictions = {col: np.zeros(len(X_test)) for col in target_cols}


for col in target_cols:
    fold_preds = np.zeros(len(X_test))
    print(f"Training for target: {col}")
    
    
    for train_idx, valid_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx][col]
        X_valid, y_valid = X.iloc[valid_idx].copy(), y.iloc[valid_idx][col]
        
        
        model = RandomForestRegressor(
            n_estimators = number,  
            max_depth = 15,  
            min_samples_split = 2, 
            min_samples_leaf = 1,  
            max_features = 1.0,  
            bootstrap = True,
            random_state = SEED,
            n_jobs = -1,  
        )
        model.fit(X_train, y_train)
        
        
        val_pred = model.predict(X_valid)
        score = mean_absolute_error(y_valid, val_pred)
        print(f"Fold MAE: {score}")
        
        fold_preds += np.maximum(model.predict(X_test), 0) / nFolds
    
    predictions[col] = fold_preds


Training for target: year_as
Fold MAE: 0.008043575418995
Fold MAE: 0.00878435754189318
Fold MAE: 0.01675977653630904
Fold MAE: 0.01618324022345598
Fold MAE: 0.018878651685390855
Fold MAE: 0.020260674157301498
Fold MAE: 0.006189887640447666
Training for target: month_as
Fold MAE: 0.235872625698324
Fold MAE: 0.15017094972067044
Fold MAE: 0.1797262569832402
Fold MAE: 0.17869944134078214
Fold MAE: 0.26556853932584273
Fold MAE: 0.26593146067415735
Fold MAE: 0.24668651685393256
Training for target: day_as
Fold MAE: 3.7991314119145208
Fold MAE: 4.219843613605938
Fold MAE: 4.154640829640513
Fold MAE: 4.509557799125589
Fold MAE: 4.424999627539099
Fold MAE: 4.186296352148971
Fold MAE: 4.216555554317063
Training for target: hour_as
Fold MAE: 5.37342461656131
Fold MAE: 5.243950189009149
Fold MAE: 5.132301259123658
Fold MAE: 5.348846741450107
Fold MAE: 4.6372378598641
Fold MAE: 5.2918347873077405
Fold MAE: 5.227968133694918
Training for target: min_as
Fold MAE: 14.394040993191215
Fold MAE: 14.91845

In [17]:
rgb_preds = predictions

In [None]:
final_preds = {}
for key in lgb_preds.keys():
    pred_lgb = np.array(lgb_preds[key])
    pred_xgb = np.array(xgb_preds[key])
    pred_rgb = np.array(rgb_preds[key])
    final_preds[key] = 0.6 * pred_lgb + 0.3 * pred_xgb + pred_rgb * 0.1


submission = pd.DataFrame({
    'id_eq': test_df['id_eq'],
    'year_as': final_preds['year_as'],
    'month_as': final_preds['month_as'],
    'day_as': final_preds['day_as'],
    'hour_as': final_preds['hour_as'],
    'min_as': final_preds['min_as'],
    'sec_as': final_preds['sec_as'],
    'lat_as': final_preds['lat_as'],
    'lon_as': final_preds['lon_as'],
    'depth_as': final_preds['depth_as'],
    'class_as': final_preds['class_as']
})
submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


Submission file saved as submission.csv
