In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
import warnings
warnings.filterwarnings("ignore")


SEED = 42
n_splits = 8
n_estimators = 5000
early_stopping_rounds = 100

In [19]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
eq_df = pd.read_csv("eq.csv")
sample_submission = pd.read_csv("sample_submission.csv")



# test_df.describe()
# test_df.describe()
# test_df.info()
train_df.info()
train_df = train_df.drop(columns=["Unnamed: 0"])

print(f"Training data shape: {train_df.shape}")
print(f"Earthquake data shape: {eq_df.shape}")
print(f"Test data shape: {test_df.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1250 non-null   int64  
 1   year        1250 non-null   int64  
 2   month       1250 non-null   int64  
 3   day         1250 non-null   int64  
 4   hour        1250 non-null   int64  
 5   min         1250 non-null   int64  
 6   sec         1250 non-null   int64  
 7   lat         1250 non-null   float64
 8   lon         1250 non-null   float64
 9   depth       1250 non-null   int64  
 10  class       1250 non-null   float64
 11  year_as     1250 non-null   int64  
 12  month_as    1250 non-null   int64  
 13  day_as      1250 non-null   int64  
 14  hour_as     1250 non-null   int64  
 15  min_as      1250 non-null   int64  
 16  sec_as      1250 non-null   int64  
 17  lat_as      1250 non-null   float64
 18  lon_as      1250 non-null   float64
 19  depth_as    1250 non-null  

In [20]:
target_cols = ["year_as", "month_as", "day_as", "hour_as", "min_as", "sec_as",
                  "lat_as", "lon_as", "depth_as", "class_as"]

X = train_df.drop(columns=target_cols)
y = train_df[target_cols]

X_test = test_df[X.columns].copy()

In [25]:
nFolds = 7
cv = KFold(n_splits=nFolds, random_state=SEED, shuffle=True)

predictions = {col: np.zeros(len(X_test)) for col in target_cols}

for col in target_cols:
    fold_preds = np.zeros(len(X_test))
    print(f"Training for target: {col}")
    for train_idx, valid_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx][col]
        X_valid, y_valid = X.iloc[valid_idx].copy(), y.iloc[valid_idx][col]
        
        model = lgb.LGBMRegressor(
            n_estimators=1000,
            max_depth=-1,
            num_leaves=1024,
            colsample_bytree=0.7,
            learning_rate=0.03,
            objective='regression',
            metric='mae',
            verbosity=-1,  
            device='gpu',  
            random_state=SEED
        )
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='mae',
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True)]
        )

        
        fold_preds += model.predict(X_test) / nFolds
    predictions[col] = fold_preds

Training for target: year_as
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[950]	valid_0's l1: 0.413181
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[972]	valid_0's l1: 0.444716
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 0.476083
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[884]	valid_0's l1: 0.434438
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 0.50166
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[990]	valid_0's l1: 0.437355
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 0.439709
Training 

In [26]:
y_preds_lgb = fold_preds

In [29]:
nFolds = 7
cv = KFold(n_splits=nFolds, random_state=SEED, shuffle=True)

predictions = {col: np.zeros(len(X_test)) for col in target_cols}

for col in target_cols:
    fold_preds = np.zeros(len(X_test))
    print(f"Training for target: {col}")
    for train_idx, valid_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx][col]
        X_valid, y_valid = X.iloc[valid_idx].copy(), y.iloc[valid_idx][col]

        
        model = model = xgb.XGBRegressor(
            objective='reg:squarederror',
            eval_metric='mae',
            n_estimators=1000,        
            learning_rate=0.08,
            max_depth=15,
            subsample=1.0,
            colsample_bytree=0.7,
            reg_alpha=0.8,
            reg_lambda=4,
            seed=SEED,                 
            tree_method='hist',        
            device='gpu'  
        )


        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
        )

        
        fold_preds += np.maximum(model.predict(X_test), 0) / nFolds
    predictions[col] = fold_preds

Training for target: year_as
[0]	validation_0-mae:8.96016
[1]	validation_0-mae:8.26339
[2]	validation_0-mae:7.86054
[3]	validation_0-mae:7.25071
[4]	validation_0-mae:6.69382
[5]	validation_0-mae:6.46692
[6]	validation_0-mae:5.97850
[7]	validation_0-mae:5.53304
[8]	validation_0-mae:5.12259
[9]	validation_0-mae:4.73981
[10]	validation_0-mae:4.38888
[11]	validation_0-mae:4.05905
[12]	validation_0-mae:3.76425
[13]	validation_0-mae:3.60078
[14]	validation_0-mae:3.34460
[15]	validation_0-mae:3.20527
[16]	validation_0-mae:3.06092
[17]	validation_0-mae:2.98791
[18]	validation_0-mae:2.79092
[19]	validation_0-mae:2.61137
[20]	validation_0-mae:2.58427
[21]	validation_0-mae:2.42335
[22]	validation_0-mae:2.34332
[23]	validation_0-mae:2.20439
[24]	validation_0-mae:2.07640
[25]	validation_0-mae:1.95729
[26]	validation_0-mae:1.84609
[27]	validation_0-mae:1.74686
[28]	validation_0-mae:1.65552
[29]	validation_0-mae:1.57089
[30]	validation_0-mae:1.53653
[31]	validation_0-mae:1.51805
[32]	validation_0-mae

In [30]:
submission = pd.DataFrame({
    'id_eq': test_df['id_eq'],
    'year_as': predictions['year_as'],
    'month_as': predictions['month_as'],
    'day_as': predictions['day_as'],
    'hour_as': predictions['hour_as'],
    'min_as': predictions['min_as'],
    'sec_as': predictions['sec_as'],
    'lat_as': predictions['lat_as'],
    'lon_as': predictions['lon_as'],
    'depth_as': predictions['depth_as'],
    'class_as': predictions['class_as']
})

submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


Submission file saved as submission.csv
