<a href="https://colab.research.google.com/github/aaln/aaln/blob/main/NVIDIA_DATA_HACKATHON_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
# Installing GPU driver for LightGBM:-
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!sudo apt install nvidia-driver-460 nvidia-cuda-toolkit clinfo
!apt-get update --fix-missing
!pip install -q  lightgbm==4.1.0 \
  --config-settings=cmake.define.USE_GPU=ON \
  --config-settings=cmake.define.OpenCL_INCLUDE_DIR="/usr/local/cuda/include/" \
  --config-settings=cmake.define.OpenCL_LIBRARY="/usr/local/cuda/lib64/libOpenCL.so"

In [None]:
# Colab warns and provides remediation steps if the GPUs is not compatible with RAPIDS.

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [1]:
%load_ext cudf.pandas

import cudf  # this should work without any errors
import cupy as cp
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy import stats
import gc


In [2]:
from google.colab import files
files.upload()  # Select kaggle.json from your local files (the api key)

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"aalndy","key":"3d3aac3e0773862d88152e8c280a92c2"}'}

In [3]:
!mkdir -p ~/.kaggle  # Use -p to avoid errors if the directory exists
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!pip install -q kaggle

In [8]:

!kaggle competitions download -c odsc-2024-nvidia-hackathon

Downloading odsc-2024-nvidia-hackathon.zip to /content
100% 3.70G/3.72G [00:09<00:00, 439MB/s]
100% 3.72G/3.72G [00:09<00:00, 431MB/s]


In [9]:
import zipfile
with zipfile.ZipFile("odsc-2024-nvidia-hackathon.zip", "r") as zip_ref:
    zip_ref.extractall("odsc-2024-nvidia-hackathon")

In [4]:
def round_float_columns(df, decimals=2):
    """Round all float columns to specified decimal places."""
    float_cols = df.select_dtypes(include=['float64']).columns
    df[float_cols] = df[float_cols].round(decimals)
    return df

def get_important_numeric_features(df, target, n_features=20):
    """Select most important numeric features for binning using correlation."""
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    numeric_cols = numeric_cols.drop(['id', 'y']) if 'y' in numeric_cols else numeric_cols.drop('id')

    # Calculate correlations
    correlations = np.abs(df[numeric_cols].corrwith(target))

    # Get top n_features
    top_features = correlations.nlargest(n_features).index.tolist()
    return top_features

def create_bins_optimized(df, numeric_cols):
    """Create optimized binned versions of numeric features."""
    binned_features = pd.DataFrame()

    for col in numeric_cols:
        if df[col].nunique() > 10:  # Only bin if enough unique values
            # Simple quartile binning (faster than KBinsDiscretizer)
            binned_features[f'{col}_quartile'] = pd.qcut(
                df[col], q=4, labels=False, duplicates='drop')

            # Decile binning for more granularity where needed
            binned_features[f'{col}_decile'] = pd.qcut(
                df[col], q=10, labels=False, duplicates='drop')

            # Simple binary flag for values above median
            median_val = df[col].median()
            binned_features[f'{col}_above_median'] = (df[col] > median_val).astype(int)

            # Add extreme value flags
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            binned_features[f'{col}_is_outlier'] = (
                (df[col] < (q1 - 1.5 * iqr)) |
                (df[col] > (q3 + 1.5 * iqr))
            ).astype(int)

    return binned_features

def create_statistical_features_optimized(df, numeric_cols):
    """Create statistical features using optimized calculations."""
    # Convert to numpy array for faster operations
    data = df[numeric_cols].values

    # Preallocate the DataFrame with NaN values
    stats_df = pd.DataFrame(index=df.index)

    # Calculate basic statistics using numpy (much faster than pandas)
    stats_df['row_mean'] = np.nanmean(data, axis=1).round(2)
    stats_df['row_std'] = np.nanstd(data, axis=1).round(2)

    # Use numpy percentile for quartiles (faster than pandas quantile)
    stats_df['row_q25'] = np.nanpercentile(data, 25, axis=1).round(2)
    stats_df['row_q75'] = np.nanpercentile(data, 75, axis=1).round(2)

    # Calculate IQR
    stats_df['row_iqr'] = (stats_df['row_q75'] - stats_df['row_q25']).round(2)

    # Add count of extreme values (outside 1.5 IQR)
    lower_bound = stats_df['row_q25'] - 1.5 * stats_df['row_iqr']
    upper_bound = stats_df['row_q75'] + 1.5 * stats_df['row_iqr']

    # Count extremes using numpy operations
    extremes = ((data < lower_bound.values.reshape(-1, 1)) |
               (data > upper_bound.values.reshape(-1, 1)))
    stats_df['extreme_count'] = np.sum(extremes, axis=1)

    return stats_df

def process_in_chunks(df, target, chunk_size=50000):
    """Process the data in chunks to reduce memory usage."""
    n_samples = len(df)

    # Create train-test indices first
    all_indices = np.arange(n_samples)
    train_idx, valid_idx = train_test_split(all_indices, test_size=0.2, random_state=42)

    # Initialize empty lists to store chunks
    train_data = []
    train_labels = []
    valid_data = []
    valid_labels = []

    # Process in chunks
    for start_idx in range(0, n_samples, chunk_size):
        end_idx = min(start_idx + chunk_size, n_samples)
        chunk_indices = np.arange(start_idx, end_idx)

        # Get chunk data
        chunk_data = df.iloc[chunk_indices].values
        chunk_target = target.iloc[chunk_indices].values

        # Split chunk into train and validation
        chunk_train_mask = np.isin(chunk_indices, train_idx)
        chunk_valid_mask = np.isin(chunk_indices, valid_idx)

        if chunk_train_mask.any():
            train_data.append(chunk_data[chunk_train_mask])
            train_labels.append(chunk_target[chunk_train_mask])

        if chunk_valid_mask.any():
            valid_data.append(chunk_data[chunk_valid_mask])
            valid_labels.append(chunk_target[chunk_valid_mask])

        # Clear memory
        del chunk_data, chunk_target
        gc.collect()

    # Combine chunks
    X_train = np.vstack(train_data)
    y_train = np.concatenate(train_labels)
    X_valid = np.vstack(valid_data)
    y_valid = np.concatenate(valid_labels)

    # Clear memory
    del train_data, train_labels, valid_data, valid_labels
    gc.collect()

    # Create LightGBM datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Clear memory
    del X_train, y_train, X_valid, y_valid
    gc.collect()

    return lgb_train, lgb_valid


In [5]:
# prompt: # the csvs should be in the odsc-2024-nvidia-hackathon directory
# train = cudf.read_csv('train.csv', nrows=1000000)  # Load a sample of 1 million rows
# test = cudf.read_csv('test.csv')

# Assuming 'train.csv' and 'test.csv' are in the 'odsc-2024-nvidia-hackathon' directory
# train = cudf.read_csv('odsc-2024-nvidia-hackathon/train.csv', nrows=1000000)  # Load a sample of 1 million rows
# test = cudf.read_csv('odsc-2024-nvidia-hackathon/test.csv')

# Load a sample of the train.csv using Pandas
# Note: Assuming train.csv is uploaded to Colab environment.
train = pd.read_csv('odsc-2024-nvidia-hackathon/train.csv')
test = pd.read_csv('odsc-2024-nvidia-hackathon/test.csv')


In [6]:
# Round float columns
train = round_float_columns(train)
test = round_float_columns(test)

In [7]:
# Handle missing values
for col in train.columns:
    if train[col].dtype in ['int64', 'float64']:
        # Replace -1 and -999 with NaN
        train[col] = train[col].replace([-1, -999], np.nan)
        test[col] = test[col].replace([-1, -999], np.nan)

        # Fill numeric missing values with median
        median_value = train[col].median()
        train[col] = train[col].fillna(median_value)
        test[col] = test[col].fillna(median_value)
    else:
        # Fill categorical missing values with 'Unknown'
        train[col] = train[col].fillna('Unknown')
        test[col] = test[col].fillna('Unknown')


  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
  col_to_n

In [8]:
target = train['y']

In [9]:
# Prepare numeric features
numeric_cols = train.select_dtypes(include=['int64', 'float64']).columns
numeric_cols = numeric_cols.drop(['id', 'y']) if 'y' in numeric_cols else numeric_cols.drop('id')


In [21]:
# Usage in main code:
# First, get the most important features for binning
important_numeric_features = get_important_numeric_features(train, target)



KeyboardInterrupt: 

In [22]:
# Then create bins only for important features
train_bins = create_bins_optimized(train, important_numeric_features)
test_bins = create_bins_optimized(test, important_numeric_features)

In [11]:
# Add simple aggregated bin features
train_bins['total_outliers'] = train_bins[[col for col in train_bins.columns if 'is_outlier' in col]].sum(axis=1)
test_bins['total_outliers'] = test_bins[[col for col in test_bins.columns if 'is_outlier' in col]].sum(axis=1)

train_bins['total_above_median'] = train_bins[[col for col in train_bins.columns if 'above_median' in col]].sum(axis=1)
test_bins['total_above_median'] = test_bins[[col for col in test_bins.columns if 'above_median' in col]].sum(axis=1)

In [24]:
# Create train-test split indices
train_indices, validation_indices = train_test_split(
    np.arange(len(target)),
    test_size=0.2,
    random_state=42
)

# Create LightGBM datasets
lgb_train = lgb.Dataset(train_bins.iloc[train_indices].values, target.iloc[train_indices])
lgb_valid = lgb.Dataset(train_bins.iloc[validation_indices].values, target.iloc[validation_indices], reference=lgb_train)


# Process data in chunks
print("Processing data in chunks...")
lgb_train, lgb_valid = process_in_chunks(train_bins, target)


gc.collect()

Processing data in chunks...


KeyboardInterrupt: 

In [13]:
# Train model
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 6,
    'min_child_samples': 30,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
    'device': 'gpu',
}

callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100)
]

In [14]:

print("Training model...")
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_valid],
    callbacks=callbacks
)


Training model...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 648.719
[200]	valid_0's rmse: 648.71
Early stopping, best iteration is:
[180]	valid_0's rmse: 648.709


In [28]:

# Make predictions on validation data
# valid_preds = model.predict(train_bins.iloc[validation_indices].values, num_iteration=model.best_iteration)

# # Calculate RMSE
# validation_rmse = np.sqrt(mean_squared_error(target.iloc[validation_indices], valid_preds))
# print(f'\nValidation RMSE: {validation_rmse:.4f}')

# # Make predictions on test data
# test_preds = model.predict(test_bins.values, num_iteration=model.best_iteration)
# test_preds = np.round(test_preds, 2)

# Create and save submission file
submission = pd.DataFrame({
    'id': test['id'],
    'y': test_preds
})
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created successfully!")


Submission file created successfully!


In [None]:
print("Creating statistical features...")
train_stats = create_statistical_features_optimized(train, numeric_cols)
test_stats = create_statistical_features_optimized(test, numeric_cols)



Creating statistical features...


KeyboardInterrupt: 

In [None]:
# Basic feature extraction
train_word_features = extract_word_parts(train)
test_word_features = extract_word_parts(test)

NameError: name 'extract_word_parts' is not defined

In [None]:
train_flags = create_binary_flags(train)
test_flags = create_binary_flags(test)


In [None]:
# Create interaction features (with rounding)
train_interactions = create_interaction_features(train, numeric_cols[:10])
test_interactions = create_interaction_features(test, numeric_cols[:10])


In [None]:
# Round interaction features
train_interactions = round_float_columns(train_interactions)
test_interactions = round_float_columns(test_interactions)


In [None]:

# Combine all features
features = train.drop(['id', 'y'], axis=1)
test_features = test.drop(['id'], axis=1)

In [None]:
# Label Encoding for categorical columns
categorical_cols = features.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    features[col] = le.fit_transform(features[col].astype(str))
    if col in test_features.columns:
        test_features[col] = le.transform(test_features[col].astype(str))


In [None]:
# Combine all feature sets
features = pd.concat([
    features,
    train_word_features,
    train_flags,
    train_stats,
    train_interactions,
    train_bins
], axis=1)

test_features = pd.concat([
    test_features,
    test_word_features,
    test_flags,
    test_stats,
    test_interactions,
    test_bins
], axis=1)

In [None]:
scaler = StandardScaler()
numeric_features = features.select_dtypes(include=['int64', 'float64']).columns
features[numeric_features] = scaler.fit_transform(features[numeric_features])
test_features[numeric_features] = scaler.transform(test_features[numeric_features])


In [None]:

# Round scaled features
features = round_float_columns(features)
test_features = round_float_columns(test_features)


In [None]:
# Feature selection
selected_features = select_features(features, target, k=min(100, features.shape[1]))
features = features.loc[:, selected_features]
test_features = test_features.loc[:, selected_features]

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.2, random_state=42)

# Convert to NumPy arrays
X_train = X_train.values
y_train = y_train.values
X_valid = X_valid.values
y_valid = y_valid.values


In [None]:
# LightGBM Dataset Construction
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train, free_raw_data=False)

# Set LightGBM Parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'gpu_platform_id': 0,  # Specify GPU platform (optional, depending on the setup)
    'gpu_device_id': 0,    # Specify GPU device ID (optional, depending on the setup)
    'learning_rate': 0.1,
    'max_depth': -1,
    'num_leaves': 31,
    'verbose': -1,
    'device': 'gpu',  # Enable GPU with CUDA for faster training  # Utilize GPU for faster training if available
}

# Train LightGBM Model with Callbacks
callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100)
]

model = lgb.train(params, lgb_train, num_boost_round=1000, valid_sets=[lgb_valid], callbacks=callbacks)


Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 643.501
[200]	valid_0's rmse: 643.037
[300]	valid_0's rmse: 642.911
[400]	valid_0's rmse: 642.854
[500]	valid_0's rmse: 642.821
[600]	valid_0's rmse: 642.796
[700]	valid_0's rmse: 642.79
[800]	valid_0's rmse: 642.78
[900]	valid_0's rmse: 642.772
Early stopping, best iteration is:
[921]	valid_0's rmse: 642.77


In [None]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': model.feature_importance()
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# Evaluate and predict
y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
print(f'\nValidation RMSE: {rmse_valid}')

# Predict on Test Set
test_features = test_features.values
test_preds = model.predict(test_features, num_iteration=model.best_iteration)

# Round predictions to 2 decimal places
# test_preds = np.round(test_preds, 2)

Validation RMSE: 642.7704077659927


In [None]:
# Create Submission File
submission = pd.DataFrame({'id': test['id'], 'y': test_preds})
submission.to_csv('submission.csv', index=False)

print("Submission saved as 'submission.csv'")


Submission saved as 'submission.csv'


In [None]:
# prompt: read submission.csv and count the number of rows

submission = pd.read_csv('submission.csv')
num_rows = len(submission)
print(f"Number of rows in submission.csv: {num_rows}")

Number of rows in submission.csv: 1000000


In [29]:
# prompt: kaggle competitions ubmit -c odsc-2024-nvidia-hackathon -f submission.csv -m "First submission using 1M rows of training"

!kaggle competitions submit -c odsc-2024-nvidia-hackathon -f submission.csv -m "Third Submission"


100% 17.1M/17.1M [00:00<00:00, 33.1MB/s]
Successfully submitted to 🎃 Spooktacular NVIDIA Data Science Competition