In [1]:
import os
import warnings
from typing import Optional, Tuple

import lightgbm as lgb
import numpy as np
import pandas as pd
import wandb
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [2]:
# Load variables from a .env file into the process environment; the driver cell
# later reads WANDB_ENTITY and WANDB_API through os.getenv.
# NOTE(review): the recorded output of this cell is False, which presumably means
# no .env file was found or nothing was loaded — confirm the file exists where
# the notebook is run, otherwise the W&B settings will be None.
load_dotenv()

False

In [7]:
class LightGBMPipeline:
    """Train and evaluate a LightGBM classifier, optionally logging to Weights & Biases.

    The pipeline one-hot encodes categorical feature columns, performs a seeded
    train/test split, fits an ``LGBMClassifier`` and reports test accuracy.
    W&B logging is active only when a run was actually started in ``__init__``.
    """

    def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1,
                 random_state: int = 42, n_jobs: int = -1, wandb_project: Optional[str] = None,
                 wandb_entity: Optional[str] = None, wandb_api_key: Optional[str] = None) -> None:
        """Store hyperparameters and, if credentials are complete, start a W&B run.

        Args:
            n_estimators: Number of boosting rounds passed to ``LGBMClassifier``.
            learning_rate: Learning rate passed to ``LGBMClassifier``.
            random_state: Seed used for both the model and the train/test split.
            n_jobs: Parallelism setting passed to ``LGBMClassifier``.
            wandb_project: W&B project name; logging is requested when this is set.
            wandb_entity: W&B entity passed to ``wandb.init``.
            wandb_api_key: W&B API key passed to ``wandb.login``.
        """
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.model: Optional[lgb.LGBMClassifier] = None
        self.wandb_project = wandb_project
        self.wandb_entity = wandb_entity
        # True only once wandb.init has been called; train/evaluate check this
        # flag so they never touch wandb.config / wandb.log without an active run.
        self._wandb_enabled = False

        if self.wandb_project and wandb_api_key:
            wandb.login(key=wandb_api_key)
            wandb.init(project=self.wandb_project, entity=self.wandb_entity, reinit=True)
            self._wandb_enabled = True
        elif self.wandb_project:
            # A project without a key used to fail later inside train(); make the
            # situation visible here and fall back to running without logging.
            warnings.warn(
                "wandb_project was given without wandb_api_key; W&B logging is disabled.",
                RuntimeWarning,
                stacklevel=2,
            )

    def load_data(self, data: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, pd.Series]:
        """Split ``data`` into one-hot encoded features and the untouched target.

        The caller's DataFrame is not modified. The target is separated before
        encoding, so a categorical target column is returned as-is instead of
        being expanded into dummy columns.

        Args:
            data: Input table containing the features and the target column.
            target_column: Name of the column to predict.

        Returns:
            ``(X, y)`` where ``X`` holds the features with object/category
            columns one-hot encoded and ``y`` is ``data[target_column]``.

        Raises:
            KeyError: If ``target_column`` is not a column of ``data``.
        """
        if target_column not in data.columns:
            raise KeyError(f"Target column {target_column!r} not found in data.")

        y = data[target_column]
        # drop() returns a new frame, so the conversions below never reach `data`.
        features = data.drop(columns=[target_column])

        categorical_columns = features.select_dtypes(include=['object', 'category']).columns
        # Cast to str first so mixed-type and missing values become plain labels.
        features = features.astype({column: str for column in categorical_columns})
        # Pass the column list explicitly: an empty list means "encode nothing".
        X = pd.get_dummies(features, columns=list(categorical_columns))
        return X, y

    def split_data(self, X: pd.DataFrame, y: pd.Series, test_size: float = 0.2) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """Split features and target into train and test parts.

        Args:
            X: Feature matrix.
            y: Target values aligned with ``X``.
            test_size: Fraction of rows placed in the test split.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``,
            seeded with ``self.random_state``.
        """
        return train_test_split(X, y, test_size=test_size, random_state=self.random_state)

    def create_model(self) -> None:
        """Instantiate the ``LGBMClassifier`` from the stored hyperparameters.

        Raises:
            AttributeError: If the imported ``lightgbm`` module does not expose
                ``LGBMClassifier``; the message names the module file that was
                imported to help diagnose the environment.
        """
        try:
            classifier_cls = lgb.LGBMClassifier
        except AttributeError as exc:
            # Same exception type as before, with the imported module's location
            # added so a shadowing file or broken install is easy to spot.
            raise AttributeError(
                "module 'lightgbm' has no attribute 'LGBMClassifier'; the imported module is "
                f"{getattr(lgb, '__file__', None)!r}. Check that no local file or folder named "
                "'lightgbm' shadows the installed package and that the installation is intact."
            ) from exc

        self.model = classifier_cls(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            random_state=self.random_state,
            n_jobs=self.n_jobs
        )

    def train(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Fit the model on the training data, creating it first if needed.

        When a W&B run is active, the hyperparameters are recorded in its config.

        Args:
            X_train: Training features.
            y_train: Training target values.
        """
        if self.model is None:
            self.create_model()

        if self._wandb_enabled:
            wandb.config.update({
                'n_estimators': self.n_estimators,
                'learning_rate': self.learning_rate,
                'random_state': self.random_state,
                'n_jobs': self.n_jobs
            })

        self.model.fit(X_train, y_train)

    def evaluate(self, X_test: pd.DataFrame, y_test: pd.Series) -> float:
        """Compute accuracy on the test data and log it when a W&B run is active.

        Args:
            X_test: Test features.
            y_test: True target values for ``X_test``.

        Returns:
            Accuracy of the model's predictions on ``X_test``.

        Raises:
            RuntimeError: If no model has been created yet.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained.")

        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        if self._wandb_enabled:
            wandb.log({'accuracy': accuracy})

        return accuracy

    def run(self, data: pd.DataFrame, target_column: str) -> float:
        """Run the full pipeline: encode, split, train, evaluate.

        Args:
            data: Input table containing the features and the target column.
            target_column: Name of the column to predict.

        Returns:
            Test accuracy, which is also printed.
        """
        X, y = self.load_data(data, target_column)
        X_train, X_test, y_train, y_test = self.split_data(X, y)
        self.train(X_train, y_train)
        accuracy = self.evaluate(X_test, y_test)
        print(f"Test Accuracy: {accuracy:.4f}")
        return accuracy

In [8]:
if __name__ == "__main__":
    # Read the spreadsheet before the pipeline (and any W&B setup) is constructed.
    data = pd.read_excel('pressure ulcer.xlsx')

    # W&B entity and API key are taken from the environment; either may be None.
    lgb_pipeline = LightGBMPipeline(
        wandb_api_key=os.environ.get('WANDB_API'),
        wandb_entity=os.environ.get('WANDB_ENTITY'),
        wandb_project='pressure_lightgbm',
        random_state=42,
        learning_rate=0.1,
        n_estimators=100,
    )

    # run() encodes, splits, trains and evaluates, printing and returning the test accuracy.
    accuracy = lgb_pipeline.run(target_column='caretaker score', data=data)

AttributeError: module 'lightgbm' has no attribute 'LGBMClassifier'