## FINAL MODEL 

Połączenie LightGBM z CatBoost za pomocą VotingClassifier (voting='soft')

In [6]:
from ColumnTransformers import * 
from AdvModels import *

In [7]:

class CatBoostWithCatFeatures(CatBoostClassifier):
    """CatBoostClassifier that works out its categorical columns from the data.

    The column indices reported by ``GetCategorical`` are cast to ``str`` and
    handed to CatBoost as ``cat_features``, so callers never pass the indices
    themselves.

    NOTE(review): the ``X[:, idx]`` indexing assumes ``X`` is a 2-D
    NumPy-style array (presumably object dtype so the string cast sticks) --
    confirm against the preceding pipeline step.
    """

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``CatBoostClassifier``."""
        super().__init__(**kwargs)
        # Indices of the categorical columns; None until they are detected.
        self.cat_features_indices = None

    def fit(self, X, y=None, **fit_params):
        """Fit the model, declaring the detected columns as ``cat_features``.

        Args:
            X: Feature matrix.
            y: Target values.
            **fit_params: Extra arguments forwarded to ``CatBoostClassifier.fit``.

        Returns:
            This estimator.
        """
        # Re-detect on every fit so that refitting the same instance on data
        # with a different column layout does not reuse stale indices.
        self.cat_features_indices = GetCategorical(X)
        # _preprocess already casts the categorical columns to str; the
        # original performed that cast a second time here for no effect.
        X_mod = self._preprocess(X)
        super().fit(X_mod, y, cat_features=self.cat_features_indices, **fit_params)
        return self

    def _preprocess(self, X):
        """Return a copy of ``X`` whose categorical columns are cast to ``str``."""
        if self.cat_features_indices is None:
            # Fallback for calls made before fit(): detect from this input.
            self.cat_features_indices = GetCategorical(X)
        X_mod = X.copy()  # never mutate the caller's array
        X_mod[:, self.cat_features_indices] = X_mod[:, self.cat_features_indices].astype(str)
        return X_mod

    def predict(self, X, **kwargs):
        """Predict after applying the same categorical cast used in ``fit``."""
        X_mod = self._preprocess(X)
        return super().predict(X_mod, **kwargs)

    def predict_proba(self, X, **kwargs):
        """Predict probabilities after applying the same categorical cast."""
        X_mod = self._preprocess(X)
        return super().predict_proba(X_mod, **kwargs)
  
def CatBoostPipeline(X):
    """Build the unfitted CatBoost pipeline: preprocessor followed by model.

    Args:
        X: Not used by this function; kept so existing callers keep working.

    Returns:
        A ``Pipeline`` with the steps ``"preprocessor"`` and ``"model"``.
    """
    numeric_columns = ['Transaction.Amount', 'Customer.Age', 'Account.Age.Days', 'Quantity']
    model_settings = {
        "grow_policy": "SymmetricTree",
        "rsm": 0.8,
        "depth": 9,
        "auto_class_weights": "Balanced",
        "learning_rate": 0.01,
        "l2_leaf_reg": 2,
        "iterations": 1500,
        "border_count": 256,
        "verbose": 0,
    }
    preprocessing_step = CatBoostTransformer(Numerical=numeric_columns)
    classifier_step = CatBoostWithCatFeatures(**model_settings)
    return Pipeline([
        ("preprocessor", preprocessing_step),
        ("model", classifier_step),
    ])

def LGBMClassifierPipeline():
    """Build the unfitted LightGBM pipeline used by the ensemble.

    Returns:
        Whatever ``PipelineModel`` returns for the configured classifier with
        ``n=18``.  NOTE(review): ``n`` is presumably the number of features
        kept by the selection step -- confirm in ``PipelineModel``.
    """
    lgbm_settings = {
        "colsample_bytree": 0.8,
        "is_unbalance": True,
        "learning_rate": 0.01,
        "max_depth": 2,
        "min_split_gain": 0.1,
        "n_estimators": 600,
        "reg_lambda": 1,
        "subsample": 1,
        "verbosity": -1,
    }
    classifier = LGBMClassifier(**lgbm_settings)
    return PipelineModel(classifier, n=18)

def create_voting_classifier(X):
    """Combine the CatBoost and LightGBM pipelines into a soft-voting ensemble.

    Args:
        X: Passed through to ``CatBoostPipeline``.

    Returns:
        An unfitted ``VotingClassifier`` with members ``'catboost'`` and
        ``'lgbm'`` and ``voting='soft'``.
    """
    cat_member = CatBoostPipeline(X)
    lgbm_member = LGBMClassifierPipeline()
    # Diagnostic output: shows which pipeline class PipelineModel produced.
    print(type(lgbm_member))
    members = [
        ('catboost', cat_member),
        ('lgbm', lgbm_member),
    ]
    return VotingClassifier(estimators=members, voting='soft')

In [8]:
def lgbmImportanceGetter(pipeline):
    """Tabulate feature importances of a fitted LightGBM pipeline.

    Args:
        pipeline: Fitted pipeline exposing the named steps ``"preprocessor"``,
            ``"featureselection"`` and ``"model"``.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``, sorted by
        importance in descending order.
    """
    steps = pipeline.named_steps
    all_names = steps["preprocessor"].get_feature_names_out()
    # Only the features that survived selection reach the model, so the
    # names must be masked to line up with the importance vector.
    kept_names = all_names[steps["featureselection"].support_]
    table = pd.DataFrame({
        "Feature": kept_names,
        "Importance": steps["model"].feature_importances_,
    })
    return table.sort_values(by="Importance", ascending=False)

def catImportanceGetter(pipeline):
    """Tabulate feature importances of a fitted CatBoost pipeline.

    Args:
        pipeline: Fitted pipeline exposing the named steps ``"preprocessor"``
            and ``"model"``.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``, sorted by
        importance in descending order.
    """
    steps = pipeline.named_steps
    scores = steps["model"].feature_importances_
    names = steps["preprocessor"].get_feature_names_out()
    table = pd.DataFrame({"Feature": names, "Importance": scores})
    return table.sort_values(by="Importance", ascending=False)
    
def FinalImportance(X_train, y_train):
    """Fit both base pipelines separately and print their feature importances.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        Tuple ``(lgbm_importances, catboost_importances)`` of DataFrames.
    """
    lgbm_pipe = LGBMClassifierPipeline()
    cat_pipe = CatBoostPipeline(X_train)
    # Both pipelines are fitted before any table is printed.
    for pipe in (lgbm_pipe, cat_pipe):
        pipe.fit(X_train, y_train)

    lgbm_table = lgbmImportanceGetter(lgbm_pipe)
    print(lgbm_table)
    cat_table = catImportanceGetter(cat_pipe)
    print(cat_table)
    return lgbm_table, cat_table

In [9]:
def FinalModel(X_train, X_test, y_train, y_test, threshold=0.5):
    """Fit the voting ensemble and report ROC/AUC and thresholded quality.

    Args:
        X_train: Features used for fitting.
        X_test: Features used for evaluation.
        y_train: Targets used for fitting.
        y_test: Targets used for evaluation.
        threshold: Scores strictly greater than this become label 1.

    Returns:
        The fitted voting ensemble.
    """
    ensemble = create_voting_classifier(X_train)
    set_config(display='diagram')
    display(ensemble)
    ensemble.fit(X_train, y_train)

    # Second probability column (presumably the positive class).
    scores = ensemble.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    area = auc(fpr, tpr)
    aucPlot(fpr, tpr, area)

    labels = (np.asarray(scores) > threshold).astype(int)
    PredictionQualityInfo(labels, y_test)
    print(f"AUC: {area}")
    return ensemble

In [10]:
# NOTE(review): the training split is loaded with getTestData() and the test
# split with getTrainingData(); the names look swapped -- confirm which
# loader is meant to feed which split before trusting the reported metrics.
X_train, y_train = getTestData()
X_test, y_test = getTrainingData()
Model = FinalModel(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
FinalImportance(X_train, y_train)

FileNotFoundError: [Errno 2] No such file or directory: '../data/TestData.csv'

## OFFICIAL TEST

In [5]:
# Merge both labelled splits into a single training set with a fresh
# 0..n-1 index.
X_train, y_train = getTestData()
X_test, y_test = getTrainingData()
X_TRAIN = pd.concat([X_train, X_test], ignore_index=True)
Y_TRAIN = pd.concat([y_train, y_test], ignore_index=True)

NameError: name 'ImportanceGetter' is not defined

In [None]:
# Evaluate the ensemble trained on the merged data against the validation set.
X_TEST, Y_TEST = getValidationData()
FinalModel(X_train=X_TRAIN, X_test=X_TEST, y_train=Y_TRAIN, y_test=Y_TEST, threshold=0.5)