# DRW Market Challenge - Price Forecasting

This notebook contains the solution for the DRW Market Challenge focused on price forecasting.

## Setup and Requirements

The following cells will automatically install required packages and set up the environment for both local execution and Google Colab.

In [6]:
# Environment setup and package installation
import sys
import subprocess
import importlib

def install_package(package):
    """Install a package using pip"""
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✅ Successfully installed {package}")
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install {package}: {e}")

def check_and_install_packages():
    """Check if required packages are installed, install if missing"""
    required_packages = [
        "numpy",
        "pandas", 
        "fastai",
        "matplotlib",
        "seaborn",
        "scikit-learn"
    ]
    
    missing_packages = []
    
    for package in required_packages:
        try:
            importlib.import_module(package)
            print(f"✅ {package} is already installed")
        except ImportError:
            print(f"⚠️  {package} is not installed")
            missing_packages.append(package)
    
    if missing_packages:
        print(f"\n📦 Installing missing packages: {', '.join(missing_packages)}")
        for package in missing_packages:
            install_package(package)
    else:
        print("\n🎉 All required packages are already installed!")

# Run the package check and installation
check_and_install_packages()


✅ numpy is already installed
✅ pandas is already installed
✅ fastai is already installed
✅ matplotlib is already installed
✅ seaborn is already installed
⚠️  scikit-learn is not installed

📦 Installing missing packages: scikit-learn
✅ Successfully installed scikit-learn


## Data Download

Since you need to set up Kaggle API credentials, here are your options:

### Option 1: Manual Download (Recommended for now)
Run this command in your terminal:
```bash
kaggle competitions download -c drw-crypto-market-prediction
```

### Option 2: Set up Kaggle API
1. Go to https://www.kaggle.com/account
2. Scroll to "API" section and click "Create New API Token"
3. Download the kaggle.json file
4. Move it to ~/.kaggle/kaggle.json
5. Accept the competition rules on Kaggle


In [None]:
# Data loading - run this after downloading the data
import os
import zipfile
from pathlib import Path

def load_drw_data():
    """Load DRW competition data"""
    data_path = Path("./data")
    
    # Check if data directory exists
    if not data_path.exists():
        print("📁 Data directory not found.")
        print("💡 Please download the data first using:")
        print("   kaggle competitions download -c drw-crypto-market-prediction")
        return None, None
    
    # Look for zip files first
    zip_files = list(data_path.glob("*.zip"))
    if zip_files:
        print(f"📦 Found {len(zip_files)} zip files. Extracting...")
        for zip_file in zip_files:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(data_path)
        print("✅ Files extracted successfully!")
    
    # Look for CSV files
    csv_files = list(data_path.glob("*.csv"))
    
    if not csv_files:
        print("📄 No CSV files found in data directory")
        return None, None
    
    print(f"📊 Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"   - {file.name}")
    
    # Try to identify train and test files
    train_file = None
    test_file = None
    
    for file in csv_files:
        if 'train' in file.name.lower():
            train_file = file
        elif 'test' in file.name.lower():
            test_file = file
    
    # Load the data
    train_data = None
    test_data = None
    
    if train_file:
        print(f"📈 Loading training data: {train_file.name}")
        train_data = pd.read_csv(train_file)
        print(f"   Shape: {train_data.shape}")
        print(f"   Columns: {list(train_data.columns)}")
    
    if test_file:
        print(f"📉 Loading test data: {test_file.name}")
        test_data = pd.read_csv(test_file)
        print(f"   Shape: {test_data.shape}")
        print(f"   Columns: {list(test_data.columns)}")
    
    return train_data, test_data

# Load the data
train_df, test_df = load_drw_data()


### Need not run this cell on Kaggle

In [7]:
def kaggle_check_and_install_packages():
    """Check if required packages are installed, install if missing"""
    required_packages = [
        "fastai"
    ]
    
    missing_packages = []
    
    for package in required_packages:
        try:
            importlib.import_module(package)
            print(f"✅ {package} is already installed")
        except ImportError:
            print(f"⚠️  {package} is not installed")
            missing_packages.append(package)
    
    if missing_packages:
        print(f"\n📦 Installing missing packages: {', '.join(missing_packages)}")
        for package in missing_packages:
            install_package(package)
    else:
        print("\n🎉 All required packages are already installed!")

# Run the package check and installation
kaggle_check_and_install_packages()

✅ fastai is already installed

🎉 All required packages are already installed!


### Import Packages

In [3]:
#| export
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# FastAI imports
import fastai
from fastai.tabular.all import *
from fastai.vision.all import *
from fastai.text.all import *

# Additional useful imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
import plotly.express as px
import plotly.graph_objects as go

print("📚 All libraries imported successfully!")
print(f"🔢 NumPy version: {np.__version__}")
print(f"🐼 Pandas version: {pd.__version__}")
print(f"🚀 FastAI version: {fastai.__version__}")

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


📚 All libraries imported successfully!
🔢 NumPy version: 2.3.2
🐼 Pandas version: 2.3.2
🚀 FastAI version: 2.8.4


## Data Download / Load

The DRW Market Challenge data can be downloaded from Kaggle. You'll need to:

1. Set up Kaggle API credentials
2. Download the competition data
3. Extract and explore the data

### Option 1: When you need to get data from Kaggle

In [8]:
# Data download setup
import os
import zipfile
from pathlib import Path

def setup_kaggle():
    """Setup Kaggle API for data download"""
    try:
        import kaggle
        print("✅ Kaggle API is available")
        return True
    except ImportError:
        print("⚠️  Kaggle API not found. Installing...")
        try:
            import subprocess
            subprocess.check_call([sys.executable, "-m", "pip", "install", "kaggle"])
            print("✅ Kaggle API installed successfully")
            return True
        except Exception as e:
            print(f"❌ Failed to install Kaggle API: {e}")
            return False

def download_competition_data(competition_name="drw-crypto-market-prediction"):
    """Download competition data from Kaggle"""
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
        api = KaggleApi()
        api.authenticate()
        
        print(f"📥 Downloading data for competition: {competition_name}")
        api.competition_download_files(competition_name, path="./data", unzip=True)
        print("✅ Data downloaded and extracted successfully!")
        return True
    except Exception as e:
        print(f"❌ Failed to download data: {e}")
        print("💡 Make sure you have:")
        print("   1. Kaggle API credentials in ~/.kaggle/kaggle.json")
        print("   2. Accepted the competition rules")
        return False

In [9]:
# Create data directory
os.makedirs("data", exist_ok=True)

# Setup Kaggle and download data
if setup_kaggle():
    download_competition_data()
else:
    print("🔧 Please install Kaggle API manually: pip install kaggle")
    print("📋 Then run: kaggle competitions download -c drw-crypto-market-prediction")

⚠️  Kaggle API not found. Installing...
Collecting kaggle
  Downloading kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Collecting bleach (from kaggle)
  Using cached bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting protobuf (from kaggle)
  Using cached protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode (from kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Collecting webencodings (from kaggle)
  Using cached webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)
Using cached bleach-6.2.0-py3-none-any.whl (163 kB)
Using cached protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl (426 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Using cached webencodin

### Option 2: When you have the data, just need to load it

In [None]:
# Alternative: Manual data loading if you already have the data
def load_data_manually():
    """Load data if you already have it downloaded"""
    data_path = Path("./data")
    
    if not data_path.exists():
        print("📁 Data directory not found. Please download the data first.")
        return None, None
    
    # Look for common data files
    csv_files = list(data_path.glob("*.csv"))
    
    if not csv_files:
        print("📄 No CSV files found in data directory")
        return None, None
    
    print(f"📊 Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"   - {file.name}")
    
    # Try to identify train and test files
    train_file = None
    test_file = None
    
    for file in csv_files:
        if 'train' in file.name.lower():
            train_file = file
        elif 'test' in file.name.lower():
            test_file = file
    
    # Load the data
    train_data = None
    test_data = None
    
    if train_file:
        print(f"📈 Loading training data: {train_file.name}")
        train_data = pd.read_csv(train_file)
        print(f"   Shape: {train_data.shape}")
    
    if test_file:
        print(f"📉 Loading test data: {test_file.name}")
        test_data = pd.read_csv(test_file)
        print(f"   Shape: {test_data.shape}")
    
    return train_data, test_data

# Try to load data if it exists
train_df, test_df = load_data_manually()