In [None]:
#imports

%pip install fastparquet

import pandas as pd
import os
import numpy as np
import xgboost as xgb
import fastparquet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Load the data
# subprocess/sys are needed by BOTH fallback branches below, so they are
# imported up front rather than inside a single except handler.
import subprocess
import sys

print("Loading data...")
try:
    df = pd.read_parquet('train.parquet')
    print("Successfully loaded parquet file with existing engine")
except ImportError:
    # No usable parquet engine could be imported: install pyarrow and retry once.
    print("ParquetEngine not found. Installing pyarrow...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow"])
    print("pyarrow installed. Attempting to load parquet file again...")
    df = pd.read_parquet('train.parquet')
except FileNotFoundError:
    # A missing file cannot be fixed by switching engines; report and re-raise.
    print("train.parquet not found. Please ensure the file is in the current directory.")
    print("Alternative: If you have a CSV file, change the filename below:")
    # df = pd.read_csv('train.csv')  # Uncomment and modify if using CSV
    raise
except Exception as e:
    # Any other failure: fall back to the fastparquet engine as a last resort.
    print(f"Error loading parquet file: {e}")
    print("Trying with fastparquet engine...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "fastparquet"])
        df = pd.read_parquet('train.parquet', engine='fastparquet')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not
        # reported as an installation problem; the original error still propagates.
        print("Failed to load with fastparquet. Please install manually:")
        print("Run: pip install pyarrow")
        print("Or: pip install fastparquet")
        raise

In [None]:
# Quick look at the loaded frame: dimensions, column names and a few sample rows
print(f"Data shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:", df.head(), sep="\n")

In [None]:
def preprocess_df(df):
    """Remove a fixed list of feature columns from ``df`` in place.

    Names on the list that are not present in ``df`` are skipped silently.
    The frame passed in is modified directly; nothing is returned.
    """
    # First group of columns to discard
    first_group = (
        'X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703',
        'X704', 'X705', 'X706', 'X707', 'X708', 'X709', 'X710',
        'X711', 'X712', 'X713', 'X714', 'X715', 'X716', 'X717',
        'X864', 'X867', 'X869', 'X870', 'X871', 'X872',
    )
    # Second group of columns to discard
    second_group = (
        'X104', 'X146', 'X110', 'X152', 'X116', 'X158', 'X122',
        'X164', 'X128', 'X170', 'X134', 'X176', 'X140', 'X182',
        'X351', 'X393', 'X357', 'X399', 'X363', 'X405', 'X369',
        'X411', 'X375', 'X417', 'X381', 'X423', 'X387', 'X429',
    )

    # errors='ignore' skips labels that are absent from the frame
    df.drop(columns=list(first_group + second_group), errors='ignore', inplace=True)

# Strip the listed columns from the training frame. preprocess_df edits `df`
# in place (drop(..., inplace=True)) and returns None, so no reassignment is needed.
preprocess_df(df)

In [None]:
# Basic data exploration: banner, then shape, dtypes, per-column null counts and describe()
print("\n" + "="*50, "DATA EXPLORATION", "="*50, sep="\n")
print(f"Dataset shape: {df.shape}")
print(f"\nData types:", df.dtypes, sep="\n")
print(f"\nMissing values:", df.isnull().sum(), sep="\n")
print(f"\nBasic statistics:", df.describe(), sep="\n")

In [None]:
# Clean the training frame: missing values, then infinities, then extreme magnitudes
def _fill_with_median_or_zero(frame, column):
    """Fill NaNs in ``frame[column]`` with the column median, or 0 if it has no non-NaN value."""
    series = frame[column]
    if series.count() > 0:
        frame[column] = series.fillna(series.median())
    else:
        frame[column] = series.fillna(0)


# --- Missing values ---
missing_count = df.isnull().sum().sum()
print(f"Missing values found: {missing_count}")

if missing_count == 0:
    print("No missing values found")
else:
    print("Handling missing values...")

    # Columns with no data at all are removed outright
    all_nan_cols = df.columns[df.isnull().all()].tolist()
    if all_nan_cols:
        print(f"Found {len(all_nan_cols)} columns with all NaN values - dropping them")
        df = df.drop(columns=all_nan_cols)

    # Remaining gaps: most frequent value for object/category columns, median otherwise
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        if df[col].dtype in ['object', 'category']:
            mode_val = df[col].mode()
            replacement = mode_val[0] if len(mode_val) > 0 else 'unknown'
            df[col] = df[col].fillna(replacement)
        else:
            _fill_with_median_or_zero(df, col)
    print("Missing values filled")

# --- Infinite values ---
numeric_cols = df.select_dtypes(include=[np.number]).columns
inf_count = np.isinf(df[numeric_cols]).sum().sum()
print(f"Infinite values found: {inf_count}")

if inf_count == 0:
    print("No infinite values found")
else:
    print("Handling infinite values...")
    # Turn +/-inf into NaN first so the same median fill can be reused
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    for col in numeric_cols:
        if df[col].isnull().any():
            _fill_with_median_or_zero(df, col)
    print("Infinite values replaced with median values")

# --- Very large magnitudes ---
large_value_threshold = 1e10
large_values = (np.abs(df[numeric_cols]) > large_value_threshold).sum().sum()
print(f"Very large values (>{large_value_threshold}): {large_values}")

if large_values == 0:
    print("No extremely large values found")
else:
    print("Clipping very large values...")
    # Every numeric column is clipped, not only the ones that exceeded the threshold
    for col in numeric_cols:
        df[col] = np.clip(df[col], -large_value_threshold, large_value_threshold)
    print("Large values clipped to reasonable range")

print("Data preprocessing complete - ready for XGBoost training")
print(f"Final dataset shape: {df.shape}")

In [None]:
# Separate the target column from the feature columns
target_col = 'label'  # Target column name
if target_col in df.columns:
    y = df[target_col]
    X = df.drop(columns=[target_col])
else:
    # Fail loudly, listing what is available to help spot a naming mismatch
    print(f"Target column '{target_col}' not found. Available columns: {list(df.columns)}")
    raise ValueError(f"Column '{target_col}' not found in dataset")

In [None]:
# Integer-encode the target only when it is stored as object/category
target_is_categorical = y.dtype == 'object' or y.dtype.name == 'category'
if target_is_categorical:
    target_encoder = LabelEncoder().fit(y)
    y = target_encoder.transform(y)
    print(f"Target classes: {target_encoder.classes_}")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{pd.Series(y).value_counts()}")

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
import numpy as np
from scipy.stats import pearsonr

def pearson_eval(y_true, y_pred):
    """Negated Pearson correlation between targets and predictions.

    The value is negated so that *lower is better*: -1.0 means perfectly
    correlated, +1.0 means perfectly anti-correlated.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth values and predictions; both are flattened to 1-D.

    Returns
    -------
    tuple
        ``('pearson', score)`` where ``score`` is ``-correlation``. Degenerate
        inputs (empty, zero-variance, NaN correlation, or any exception while
        computing) yield the worst score, ``+1.0``, so that an uninformative
        prediction can never look like a perfect one.

    NOTE(review): the scikit-learn wrapper's constructor-level ``eval_metric``
    is documented as taking a callable that returns a bare float; this function
    returns a (name, value) pair — confirm how the pair is consumed.
    """
    worst_score = ('pearson', 1.0)
    try:
        # Ensure arrays are flat numpy arrays
        y_true = np.asarray(y_true).flatten()
        y_pred = np.asarray(y_pred).flatten()

        # Nothing to correlate
        if y_true.size == 0 or y_pred.size == 0:
            return worst_score

        # Correlation is undefined when either side has zero variance
        if np.var(y_true) == 0 or np.var(y_pred) == 0:
            return worst_score

        correlation, _ = pearsonr(y_true, y_pred)

        if np.isnan(correlation):
            return worst_score

        # Negate so that a minimised metric rewards high correlation
        return 'pearson', -float(correlation)

    except Exception as e:
        # Best-effort metric: report the problem and score it as worst-case
        print(f"Error in pearson_eval: {e}")
        return worst_score

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Train XGBoost model
print("\n" + "="*50)
print("XGBOOST TRAINING")
print("="*50)


def neg_pearson(y_true, y_pred):
    """Adapter exposing ``pearson_eval`` as a float-returning metric.

    The scikit-learn interface of XGBoost expects a constructor-level
    ``eval_metric`` callable with the signature ``(y_true, y_pred) -> float``.
    ``pearson_eval`` returns a ``(name, value)`` pair, so only the value is
    passed on. Lower is better (the value is the negated correlation).
    """
    return float(pearson_eval(y_true, y_pred)[1])


xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric=neg_pearson  # Custom metric: negated Pearson correlation as a float
)

In [None]:
# Fit on the training split, tracking the metric on both splits
print("Training the model...")
evaluation_sets = [(X_train, y_train), (X_test, y_test)]
xgb_model.fit(
    X_train,
    y_train,
    eval_set=evaluation_sets,
    verbose=10,  # log the evaluation metric every 10 boosting rounds
)

# Predictions for the held-out split
y_pred = xgb_model.predict(X_test)

# Hold-out evaluation report
print("\n" + "="*50, "MODEL EVALUATION", "="*50, sep="\n")

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
pearson_corr, p_value = pearsonr(y_test, y_pred)

print(
    f"Mean Squared Error: {mse:.4f}",
    f"Root Mean Squared Error: {rmse:.4f}",
    f"Mean Absolute Error: {mae:.4f}",
    f"R² Score: {r2:.4f}",
    f"Pearson Correlation: {pearson_corr:.4f}",
    f"Pearson p-value: {p_value:.6f}",
    sep="\n",
)

# The training output should now show the pearson metric (as negative values)
# Remember: XGBoost shows -correlation, so -0.8 means correlation of 0.8

In [None]:
# Rank features by the fitted model's importance scores and chart the leaders
print("\n" + "="*50, "FEATURE IMPORTANCE", "="*50, sep="\n")

feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': xgb_model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(feature_importance.head(10))

# Horizontal bar chart of the 15 highest-ranked features, largest at the top
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# Persist the fitted estimator to disk with joblib
for banner_line in ("\n" + "="*50, "SAVING MODEL", "="*50):
    print(banner_line)

import joblib

joblib.dump(xgb_model, 'xgboost_model.pkl')
print("Model saved as 'xgboost_model.pkl'")
print("(No label encoders needed - all numeric data)")

for banner_line in ("\n" + "="*50, "TRAINING COMPLETE!", "="*50):
    print(banner_line)

In [None]:
# Load the competition test set and drop the same fixed column list that was
# removed from the training frame (preprocess_df modifies test_df in place).
test_df = pd.read_parquet('test.parquet')
preprocess_df(test_df)

In [None]:
# Prepare test data for prediction
# Remove the 'label' column since it's just placeholder zeros
if 'label' in test_df.columns:
    X_test_submission = test_df.drop('label', axis=1)
else:
    X_test_submission = test_df.copy()

# Make sure we have an ID column for the submission
# If your test_df has an 'ID' column, use that
# If not, you might need to create one or check what the ID column is called
if 'ID' in test_df.columns:
    test_ids = test_df['ID']
    # Remove ID from features if it exists
    if 'ID' in X_test_submission.columns:
        X_test_submission = X_test_submission.drop('ID', axis=1)
else:
    # If no ID column, create one (check your competition requirements)
    test_ids = range(1, len(test_df) + 1)
    print("Warning: No ID column found, using row indices")

# Align the test features with the columns the model was actually fitted on.
# The training frame may have had extra columns removed during cleaning
# (all-NaN columns are dropped there), which is never mirrored on test_df;
# predicting with a different column set would fail or silently misalign.
trained_features = xgb_model.get_booster().feature_names
if trained_features is not None:
    missing_features = [
        name for name in trained_features if name not in X_test_submission.columns
    ]
    if missing_features:
        raise ValueError(
            f"Test data is missing {len(missing_features)} feature(s) the model "
            f"was trained on, e.g. {missing_features[:5]}"
        )
    # Same columns, same order as at fit time; surplus test columns are discarded
    X_test_submission = X_test_submission[trained_features]

# Training data had +/-inf removed before fitting; do the equivalent here by
# mapping them to NaN, which XGBoost handles as missing.
# NOTE(review): XGBoost is expected to reject raw inf inputs — confirm on your version.
X_test_submission = X_test_submission.replace([np.inf, -np.inf], np.nan)

# Make predictions on test data
print("Making predictions on test data...")
test_predictions = xgb_model.predict(X_test_submission)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'ID': test_ids,
    'prediction': test_predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)
print(f"Submission file saved! Shape: {submission_df.shape}")
print("First few rows:")
print(submission_df.head())

# Optional: Quick sanity checks
print(f"\nPrediction statistics:")
print(f"Min: {test_predictions.min():.4f}")
print(f"Max: {test_predictions.max():.4f}")
print(f"Mean: {test_predictions.mean():.4f}")
print(f"Std: {test_predictions.std():.4f}")

In [None]:
# Check if running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "drw-crypto-market-prediction"
    submission_message = "initial xgboost"
    
    # Command to submit 
    submission_command = f"kaggle competitions submit -c {competition_name} -f submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}