In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/solana-skill-sprint-memcoin-graduation/sample_submission.csv
/kaggle/input/solana-skill-sprint-memcoin-graduation/test_unlabeled.csv
/kaggle/input/solana-skill-sprint-memcoin-graduation/train.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_40.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_23.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_18.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_21.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_38.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_37.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_36.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_14.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_6.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_35.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_34.csv
/kaggle/input/pump-fun-graduation-february-2025/chunk_32.csv
/kaggle/input/pump-fun-graduation-february-2025/token_info_

In [5]:
from glob import glob

train = pd.read_csv("/kaggle/input/solana-skill-sprint-memcoin-graduation/train.csv")
test = pd.read_csv("/kaggle/input/solana-skill-sprint-memcoin-graduation/test_unlabeled.csv")

# Limit to first 10 chunk files to reduce memory usage
chunk_files = sorted(glob("/kaggle/input/pump-fun-graduation-february-2025/chunk*.csv"))[:10]
chunks = pd.concat([pd.read_csv(f, engine='python', on_bad_lines='skip') for f in chunk_files], ignore_index=True)

dune_info = pd.read_csv("/kaggle/input/pump-fun-graduation-february-2025/dune_token_info.csv", engine="python")


In [6]:
def compute_features(df):
    grouped = df.groupby("base_coin")
    features = grouped.agg(
        n_tx=('tx_idx', 'count'),
        n_buy=('direction', lambda x: (x == 'buy').sum()),
        n_sell=('direction', lambda x: (x == 'sell').sum()),
        total_token_volume=('base_coin_amount', 'sum'),
        total_sol_volume=('quote_coin_amount', 'sum'),
        avg_token_price=('quote_coin_amount', lambda x: np.mean(x / (df.loc[x.index, 'base_coin_amount'].replace(0, np.nan)))),
        unique_wallets=('signing_wallet', pd.Series.nunique),
        first_block=('slot', 'min'),
        last_block=('slot', 'max')
    ).reset_index().rename(columns={'base_coin': 'mint'})

    features['buy_ratio'] = features['n_buy'] / features['n_tx']
    features['sell_ratio'] = features['n_sell'] / features['n_tx']
    features['block_duration'] = features['last_block'] - features['first_block']
    
    return features

token_features = compute_features(chunks)


In [7]:
train_merged = train.merge(token_features, on='mint', how='left')
test_merged = test.merge(token_features, on='mint', how='left')


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

X = train_merged.drop(columns=['mint', 'slot_min', 'slot_graduated', 'has_graduated'])
y = train_merged['has_graduated'].astype(int)

X = X.fillna(0)
test_X = test_merged.drop(columns=['mint', 'slot_min']).fillna(0)

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

def clean_column_names(df):
    return df.rename(columns=lambda x: x.replace('"', '')
                                  .replace('{', '')
                                  .replace('}', '')
                                  .replace('[', '')
                                  .replace(']', '')
                                  .replace(',', '')
                                  .replace(':', '')
                                  .replace(' ', '_'))

X_train = clean_column_names(X_train)
X_val = clean_column_names(X_val)
X = clean_column_names(X)
test_X = clean_column_names(test_X)


In [9]:
model = LGBMClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred)
print(f"Validation ROC-AUC: {roc_auc:.4f}")


[LightGBM] [Info] Number of positive: 5923, number of negative: 505722
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3031
[LightGBM] [Info] Number of data points in the train set: 511645, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Validation ROC-AUC: 0.7261


In [10]:
test_probs = model.predict_proba(test_X)[:, 1]
submission = pd.DataFrame({'mint': test_merged['mint'], 'has_graduated_prob': test_probs})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv written")


✅ submission.csv written


In [11]:
print("Submission DataFrame head:")
print(submission.head())

Submission DataFrame head:
                                           mint  has_graduated_prob
0  9Wt3N7etKMX9cioTdEJ5S4b8A9nK3M66n9RFVgBGpump             0.37111
1  9q5y2X2P8ZEKTjyXBVcS5q2EZM7HbNV8DURY2qnvqi2f             0.37111
2  HL2di8dcQ7eYDmkcFoZ4zJyHX5SbRZXAJxTegL3JPfx2             0.37111
3  7iAFj9Pc5QH9jbGmHwYe8T6yzNVbjhL13PNJXVTspump             0.37111
4  F7U1Rdgz2KFpneKpAnYytWF2jggnsrLScfi2A668pump             0.37111
