# In [None]:
import requests
import numpy as np
import os
import pandas as pd
from datetime import datetime, timezone
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# --- Run configuration -------------------------------------------------------
# API key comes from the environment; the lookup yields None when it is unset.
COVALENT_KEY = os.environ.get("COVALENT_API_KEY")
CHAIN_ID = 1   # chain segment of the Covalent request URL
THREADS = 4    # handed to XGBClassifier as n_jobs
GAMMA = 2      # exponent applied to (1 - probability) when building scores

# Read the wallet list and expose the id column under the name "wallet".
wallets = pd.read_csv("wallets.csv")
wallets = wallets.rename(columns={"wallet_id": "wallet"})
print(f"Loaded {len(wallets)} wallets.")

# data fetching
def fetch_wallet_transactions(wallet):
    """Download the transaction list for one wallet from the Covalent API.

    Pages of 100 items (with log events requested) are fetched one after
    another until a page comes back empty or short.  Any exception raised
    while requesting or decoding a page is printed and ends the loop, so the
    result may be a partial list.

    Args:
        wallet: Address string inserted into the request URL.

    Returns:
        list: The ``items`` entries of every page fetched before stopping.
    """
    # NOTE(review): confirm against the Covalent docs that transactions_v3
    # honours "page-size"/"page-number"/"with-logs" query parameters.
    page_size = 100
    url = f"https://api.covalenthq.com/v1/{CHAIN_ID}/address/{wallet}/transactions_v3/"
    collected = []
    page = 0
    done = False
    while not done:
        query = {
            "page-size": page_size,
            "page-number": page,
            "with-logs": True,  # needed for event-level extraction
            "key": COVALENT_KEY,
        }
        try:
            response = requests.get(url, params=query, timeout=15)
            response.raise_for_status()
            payload = response.json()
            batch = payload.get("data", {}).get("items", [])
            if batch:
                collected.extend(batch)
            # An empty or short page means there is nothing left to fetch.
            done = (not batch) or len(batch) < page_size
            if not done:
                page += 1
        except Exception as e:
            print(f"API error for wallet {wallet} page {page}: {e}")
            done = True
    return collected

print("Fetching transactions for all wallets...")
# Map every wallet address to its downloaded transaction list, one at a time.
wallet_txs = {}
for wallet in tqdm(wallets.wallet, desc="Wallets"):
    wallet_txs[wallet] = fetch_wallet_transactions(wallet)

# features

def _event_amount(log, decoded):
    """Return the numeric amount attached to one decoded log event.

    A top-level ``value`` on the log wins when present (unparseable -> 0).
    Otherwise the first decoded parameter that converts to a number is used;
    42-character ``0x`` strings are skipped as addresses.  Amounts above
    1e15 are treated as wei-scaled and divided by 1e18.
    """
    amount = 0
    if "value" in log:
        try:
            amount = float(log["value"])
        except (TypeError, ValueError):
            amount = 0
    else:
        for param in decoded.get("params") or []:
            raw = param.get("value", "")
            is_hex = isinstance(raw, str) and raw.startswith("0x")
            if is_hex and len(raw) == 42:
                continue  # "0x" + 40 hex digits: an address, not an amount
            try:
                amount = float(int(raw, 16)) if is_hex else float(raw)
            except (TypeError, ValueError, OverflowError):
                continue
            break
    if amount > 10**15:
        amount /= 1e18  # convert wei to ether
    return amount


def extract_borrowing_features(txlist):
    """Summarise borrow and supply activity found in transaction log events.

    Args:
        txlist: Iterable of transaction dicts.  Each may carry ``log_events``
            (a list of dicts, or ``None``) and must carry ``block_signed_at``
            when it contains a borrow event.

    Returns:
        dict with keys:
            ``total_borrowed``: sum of amounts over all ``Borrow`` events.
            ``borrowing_tx_count``: number of ``Borrow`` events (counted per
                event, not per transaction).
            ``max_borrow_to_collateral_ratio``: total borrowed divided by the
                total of ``Mint``/``Supply``/``Deposit`` amounts, or 0 when
                nothing was supplied.
            ``recent_borrow_growth``: amount borrowed in the last 30 days
                divided by the amount borrowed before that (plus 1e-9), or 0
                when there are no borrow events.
    """
    borrow_events = {"Borrow"}
    supply_events = {"Mint", "Supply", "Deposit"}

    total_borrowed = 0
    borrowing_tx_count = 0
    total_supplied = 0
    borrow_history = []

    for tx in txlist:
        # `or []` also covers a present-but-null "log_events" field.
        for log in tx.get("log_events") or []:
            decoded = log.get("decoded")
            if not decoded:
                continue
            event_name = decoded.get("name", "")
            if event_name in borrow_events:
                amount = _event_amount(log, decoded)
                borrowing_tx_count += 1
                total_borrowed += amount
                # utc=True keeps every timestamp tz-aware so it can be
                # compared with the tz-aware cutoff below.
                signed_at = pd.to_datetime(tx["block_signed_at"], utc=True)
                borrow_history.append((signed_at, amount))
            elif event_name in supply_events:
                total_supplied += _event_amount(log, decoded)

    if total_supplied > 0:
        max_borrow_to_collateral_ratio = total_borrowed / total_supplied
    else:
        max_borrow_to_collateral_ratio = 0

    if borrow_history:
        borrow_history.sort(key=lambda entry: entry[0])
        cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=30)
        recent_val = sum(val for ts, val in borrow_history if ts >= cutoff)
        prior_val = sum(val for ts, val in borrow_history if ts < cutoff)
        # The epsilon avoids division by zero when all borrowing is recent.
        recent_borrow_growth = recent_val / (prior_val + 1e-9)
    else:
        recent_borrow_growth = 0

    return {
        "total_borrowed": total_borrowed,
        "borrowing_tx_count": borrowing_tx_count,
        "max_borrow_to_collateral_ratio": max_borrow_to_collateral_ratio,
        "recent_borrow_growth": recent_borrow_growth,
    }


def _numeric_column(df, name):
    """Return column *name* as floats, or a column of 0.0 when it is absent."""
    if name in df.columns:
        return pd.to_numeric(df[name], errors="coerce")
    return pd.Series(0.0, index=df.index)


def compute_basic_features(txlist, wallet=None):
    """Compute per-wallet activity statistics from a list of transactions.

    Args:
        txlist: List of transaction dicts.  ``value``, ``block_signed_at``,
            ``from_address`` and ``to_address`` are required; ``gas_spent``,
            ``fees_paid`` and ``successful`` are optional.
        wallet: Address of the wallet the transactions belong to.  When
            omitted, the sender of the first listed transaction is used as
            the wallet address, which misattributes direction if that
            transaction was inbound.

    Returns:
        dict with keys ``avg_time_between_txn`` (seconds), ``in_out_ratio``,
        ``net_inflow``, ``success_rate``, ``avg_per_day``,
        ``wallet_age_days``, ``mean_fee``, ``mean_gas``, ``mean_tx_value``,
        ``tx_count`` and ``failed_count``.  Every entry is 0 for an empty
        ``txlist``.
    """
    if not txlist:
        return {
            "avg_time_between_txn": 0,
            "in_out_ratio": 0,
            "net_inflow": 0,
            "success_rate": 0,
            "avg_per_day": 0,
            "wallet_age_days": 0,
            "mean_fee": 0,
            "mean_gas": 0,
            "mean_tx_value": 0,
            "tx_count": 0,
            "failed_count": 0,
        }

    df = pd.json_normalize(txlist)
    value = df["value"].astype(float) / 1e18
    gas_spent = _numeric_column(df, "gas_spent")
    fees_paid = _numeric_column(df, "fees_paid")
    # utc=True makes every timestamp tz-aware so it can be subtracted from
    # the tz-aware "now" below regardless of how the input was formatted.
    signed_at = pd.to_datetime(df["block_signed_at"], utc=True)

    # Anything other than an explicit False (including missing) counts as
    # successful; `ne` always yields a real boolean column, so `~` is safe.
    if "successful" in df.columns:
        successful = df["successful"].ne(False)
    else:
        successful = pd.Series(True, index=df.index)

    tx_count = len(df)
    failed_count = int((~successful).sum())

    if wallet is None:
        wallet = df["from_address"].iloc[0]
    address = str(wallet).lower()
    in_value = value[df["to_address"].str.lower() == address].sum()
    out_value = value[df["from_address"].str.lower() == address].sum()
    in_out_ratio = in_value / (out_value + 1e-9)
    net_inflow = in_value - out_value

    success_rate = successful.mean()

    ordered = signed_at.sort_values()
    wallet_age_days = (pd.Timestamp.now(tz="UTC") - ordered.iloc[0]).days
    # Clamp to one day so a same-day wallet does not divide by zero.
    avg_per_day = tx_count / max(wallet_age_days, 1)

    gaps = ordered.diff().dt.total_seconds().dropna()
    avg_time_between_txn = gaps.mean() if len(gaps) > 0 else 0

    return {
        "avg_time_between_txn": avg_time_between_txn,
        "in_out_ratio": in_out_ratio,
        "net_inflow": net_inflow,
        "success_rate": success_rate,
        "avg_per_day": avg_per_day,
        "wallet_age_days": wallet_age_days,
        "mean_fee": fees_paid.mean(),
        "mean_gas": gas_spent.mean(),
        "mean_tx_value": value.mean(),
        "tx_count": tx_count,
        "failed_count": failed_count,
    }


# Build one feature row per wallet: basic statistics first, then the
# borrowing statistics, then the wallet address itself.
features = []
for wallet, txlist in tqdm(wallet_txs.items(), desc="Computing features"):
    row = dict(compute_basic_features(txlist))
    row.update(extract_borrowing_features(txlist))
    row["wallet"] = wallet
    features.append(row)

features_df = pd.DataFrame(features)


# A wallet is labelled 1 when at least one of three rules fires:
#   * it borrowed more than the 10th-percentile amount,
#   * it has at least one failed transaction,
#   * it has fewer transactions than the median wallet.
borrow_threshold = features_df["total_borrowed"].quantile(0.10)  # 10th percentile threshold
median_tx_count = features_df["tx_count"].median()

above_borrow_threshold = features_df["total_borrowed"].gt(borrow_threshold)
has_failed_tx = features_df["failed_count"].gt(0)
below_median_activity = features_df["tx_count"].lt(median_tx_count)

y = (above_borrow_threshold | has_failed_tx | below_median_activity).astype(int)

print("Label distribution:", Counter(y))

# training
# Every column except the wallet address is used as a model input.
feature_cols = [name for name in features_df.columns if name != "wallet"]
X = features_df[feature_cols].to_numpy()

# NOTE(review): the scaler is fitted on all rows before the train/test split.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train label distribution: {Counter(y_train)}")
print(f"Test label distribution: {Counter(y_test)}")

# Weight the positive class by the negative/positive ratio of the training set.
negatives = (y_train == 0).sum()
positives = (y_train == 1).sum()

xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 150,
    "learning_rate": 0.1,
    "max_depth": 4,
    "subsample": 0.85,
    "colsample_bytree": 0.8,
    "n_jobs": THREADS,
    "eval_metric": "logloss",
    "random_state": 42,
    "scale_pos_weight": negatives / positives,
}
model = XGBClassifier(**xgb_params)

model.fit(X_train, y_train)

# Score every wallet (train and test rows alike): the predicted probability
# of label 1 is mapped to an integer in 0..1000, where a higher probability
# gives a lower score.
probs = model.predict_proba(X_scaled)[:, 1]
scores = ((1 - probs) ** GAMMA * 1000).round().astype(int)
features_df["score"] = scores

# Export results
output_path = "wallet_risk_scores_borrowing.csv"
output_df = features_df[["wallet", "score"]].copy()
output_df.rename(columns={"wallet": "wallet_id"}, inplace=True)
output_df.to_csv(output_path, index=False)
# Report the path that was actually written (the old message named a
# different file).
print(f"Saved wallet risk scores to '{output_path}'")
