In [None]:
# Import the libraries
import gc
import os

import numpy as np
import pandas as pd
import scipy.stats as st
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
def optimize_dtypes(df):
    """Downcast numeric columns of ``df`` to reduce memory usage.

    The frame is modified in place (columns are reassigned) and also
    returned, so both ``optimize_dtypes(df)`` and
    ``df = optimize_dtypes(df)`` work.

    Rules applied:
      * ``float64`` columns become ``float32``.
      * ``int64`` columns become ``uint16`` when all values lie in
        [0, 65535], ``int16`` when they lie in [-32768, 32767], and
        ``int32`` when they lie in [-2**31, 2**31 - 1].
      * ``int64`` columns whose values do not fit in ``int32`` are left
        as ``int64``; narrowing them would silently wrap the values.
      * Empty ``int64`` columns are left untouched (nothing to save).

    Prints the percentage of memory saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with downcast columns.
    """
    initial_memory = df.memory_usage(deep=True).sum() / 1024**3

    # Optimize float64 to float32
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')

    # Optimize int64 to the smallest integer type that can hold the data
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].empty:
            continue

        # Compute the bounds once; each is a full pass over the column.
        col_min = df[col].min()
        col_max = df[col].max()

        if col_min >= 0 and col_max < 65536:
            df[col] = df[col].astype('uint16')
        elif col_min >= -32768 and col_max < 32768:
            df[col] = df[col].astype('int16')
        elif col_min >= -2**31 and col_max < 2**31:
            df[col] = df[col].astype('int32')
        # Otherwise keep int64: the values do not fit in 32 bits and a
        # narrowing cast would corrupt them without raising.

    optimized_memory = df.memory_usage(deep=True).sum() / 1024**3
    memory_saved = initial_memory - optimized_memory

    # Avoid a 0/0 "nan%" report when the frame occupies no memory at all.
    reduction_pct = memory_saved / initial_memory * 100 if initial_memory > 0 else 0.0

    print(f"{reduction_pct:.1f}% reduction in memory use from datatype optimisation")

    return df

In [None]:
# Train data
train_df = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet')
train_df = optimize_dtypes(train_df)

# Every column except the timestamp and the target is a model input.
features = [c for c in train_df.columns if c not in ['timestamp', 'label']]

# Missing feature values are replaced with 0 before training.
X_train_full = train_df[features].fillna(0)

# Keep the target as a float. The model is a regressor scored with RMSE and
# Pearson correlation, so casting the label to int would truncate the
# fractional part and destroy most of the signal.
# .copy() detaches the target from train_df so the frame can be freed below.
y_train_full = train_df['label'].copy()

del train_df  # Free memory immediately
gc.collect()

# NOTE(review): this is a shuffled random split. If rows are time-ordered
# (a 'timestamp' column is excluded above), a chronological hold-out
# (shuffle=False) may give a more honest validation score - confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2,  # 20% for validation
    random_state=42,
)

del X_train_full, y_train_full  # again, free memory immediately
gc.collect()

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")

In [None]:
# Hyperparameters for the LightGBM regressor, kept in one place so they are
# easy to inspect and tweak.
lgbm_params = {
    'objective': 'regression',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,           # -1 = no explicit depth limit
    'feature_fraction': 0.8,   # column subsampling
    'bagging_fraction': 0.8,   # row subsampling ...
    'bagging_freq': 5,         # ... re-drawn every 5 iterations
    'n_estimators': 1000,      # upper bound on boosting rounds
    'random_state': 42,
    'verbosity': -1,
}

model = LGBMRegressor(**lgbm_params)

In [None]:
# Train with early stopping on the held-out validation split

# Stop when validation RMSE has not improved for 50 consecutive rounds.
stop_callback = early_stopping(stopping_rounds=50)
# Print the validation RMSE every 50 rounds.
log_callback = log_evaluation(period=50)

model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[stop_callback, log_callback],
)

print("Best boosting rounds:", model.best_iteration_)

In [None]:
# Check the Pearson correlation between predictions and targets on the
# validation set. `st` (scipy.stats) is imported in the notebook's import
# cell rather than here, so all dependencies are declared in one place.
val_preds = model.predict(X_val)

# pearsonr returns (correlation, p-value); only the correlation is reported.
pearson_corr, _p_value = st.pearsonr(y_val, val_preds)
print(f"Validation Pearson correlation: {pearson_corr:.4f}")

In [None]:
# Load and process test data
test_df = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet')
test_df = optimize_dtypes(test_df)

# Submission IDs are the 1-based row positions of the test file. They are
# built directly instead of being added to test_df, so they never pass
# through the dtype downcasting step.
row_ids = np.arange(1, len(test_df) + 1)


# 1) Prepare test features
# Keep a DataFrame (no .values): the model was fitted on named columns, so
# predicting on the same named columns avoids a feature-name mismatch
# warning and an extra dense copy of the data.
X_test = test_df[features].fillna(0)


# 2) Generate continuous predictions
preds = model.predict(X_test)


# 3) Build submission DataFrame
submission = pd.DataFrame({
    'ID': row_ids,
    'prediction': preds
})

del test_df, X_test
gc.collect()

# 4) Write to the working directory
submission_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Wrote {submission_path}")

# 5) Confirm it's there (os is imported in the notebook's import cell)
print("Files in /kaggle/working:", os.listdir('/kaggle/working'))

# 6) Preview
submission.head()