In [114]:
import nbimporter
from ColumnTransformers import *
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
import joblib
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_auc_score
import json 
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
import nbimporter
import pandas as pd 
import numpy as np 
from catboost import Pool 
from sklearn.metrics import f1_score

In [115]:
def XgboostSearch(scale_pos_weight,X,y,model__early_stopping=15): 
    param_grid_xgb = {
        'n_estimators': [100],  
        'learning_rate': [0.001],  
        'max_depth': [3], 
        'subsample': [1.0],  
        'colsample_bytree': [1.0],  
        'gamma': [0.1], 
        'reg_lambda': [4], 
        'scale_pos_weight':[scale_pos_weight], # weight = 1 if negstive else wegiht=scale_pos_weight
        }  

    score, params = GridSearchGradient("XGBClassifier",param_grid_xgb,X,y)
    return  score, params

def LightgbmSearch(X,y,model__early_stopping=15):
    param_grid_lgb = {
        'n_estimators': [100],
        'learning_rate': [0.001],
        'max_depth': [3],
        'subsample': [0.7],
        'colsample_bytree': [0.6],
        'min_split_gain': [0],   # similar to gamma from XGB
        'reg_lambda': [0], 
        'is_unbalance':[True] }

    score, params= GridSearchGradient("LGBMClassfier",param_grid_lgb,X,y)
    return  score, params




def AdaSearch(metric='roc_auc') : 
    param_grid_ada = {
    'n_estimators': [100], 
    'learning_rate': [0.6],  # before amount of say
    'estimator': [DecisionTreeClassifier(max_depth=2)#, 
                         # DecisionTreeClassifier(max_depth=3),
                          #DecisionTreeClassifier(max_depth=5)
                 ]
    }
    grid_search_ada = GridSearchCV(AdaBoostClassifier(), param_grid_ada, cv=5, scoring=metric, n_jobs=3)
    return grid_search_ada

def AdaBestModel(df_X,y):
    Lr=PipeLineGradient() 
    Lr.fit(df_X,y) 
    X=Lr.transform(df_X) 
    model= AdaSearch() 
    model.fit(X,y)    
    best_params=model.best_params_
    serializable_params = {k: str(v) for k, v in best_params.items()}
    with open("savedModels/AdaBoost.json", "w") as f:
        json.dump(serializable_params, f, indent=4)
        
    return best_params
    

In [116]:
def ReadCreateModel(modelName,metric="AUC"): 
    with open(f"savedModels/{modelName}.json", "r") as f:
        best_params=json.load(f) 
    if modelName=="XGBClassifier":
        createdModel=XGBClassifier(eval_metric=metric,random_state=42, verbosity=0, n_jobs=-1)
    elif modelName=="AdaBoost":
        createdModel = AdaBoostClassifier()
    elif modelName=="LGBMClassifier":
        createdModel=LGBMClassifier(random_state=42 ,n_jobs=-1,verbosity=0)
    else: 
        CatBoostClassifier(eval_metric=metric,early_stopping=15,random_state=42)
    createdModel.set_params(**best_params) 
    return createdModel
    
def TestingModel(modelName,X_train,X_test,y_train,y_test): 
    
    Model=ReadCreate(modelName) 
    if modelName!="CatBoostClassifier":
        Model=PipelineModel(Model)
        Model.fit(X_train,y_train) 
        Model.predict_proba(X_test)[:, 1]
    else:
        X_train_transformed,cat_train=MergeNumCat(X_train)
        X_test_transformed,cat_test=MergeNumCat(X_test)
        pool_train=Pool(data=X_train,label=y_train,cat_features=cat_train)
        pool_test = Pool(data=X_test, cat_features=cat_test)
        Model.fit(pool_train)
        Model.predict_proba(pool_test)[:,1]
        
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    aucPlot(fpr, tpr,roc_auc) 
    return roc_auc,Model

def F1Score(y_pred,y_test): 
    return f1_score(y_test, y_predict)

def aucPlot(fpr,tpr,roc_auc): 
    plt.figure()
    plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray") 
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.grid(True)
    plt.show()

In [117]:
def kfold_split_np(X, y, n_splits=5, shuffle=True, random_state=42):
    X_copy = X.copy()
    y_copy = y.copy()
    n_samples = X_copy.shape[0]

    indices = np.arange(n_samples)
    if shuffle:
        rng = np.random.default_rng(seed=random_state)
        rng.shuffle(indices)

    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[:n_samples % n_splits] += 1
    
    splits = []
    current = 0
    for fold_size in fold_sizes:
        start = current
        stop = current + fold_size
        test_indices = indices[start:stop]
        train_indices = np.concatenate((indices[:start], indices[stop:]))
     
        X_train, X_test = X_copy[train_indices], X_copy[test_indices]
        y_train, y_test = y_copy[train_indices], y_copy[test_indices]
        
        splits.append((X_train, X_test, y_train, y_test))
        current = stop    
    return splits


In [118]:
def GridSearchGradient(ModelName,params,df_X,y,metric="auc", n_splits=5,model__early_stopping=15):
    BestModel=None 
    BestAucScore=float("-inf")
    BestIteration=None
    Lr=PipeLineGradient() 
    Lr.fit(df_X,y) 
    X=Lr.transform(df_X) 
    
    for param in  list(ParameterGrid(params)): 
        if ModelName=="XGBClassifier":
            Model=XGBClassifier(eval_metric=metric,random_state=42, verbosity=0,
                early_stopping_rounds=model__early_stopping, n_jobs=-1,**param)
        else: 
            Model=LGBMClassifier(random_state=42 ,early_stopping_rounds=model__early_stopping, 
                                                             n_jobs=-1,verbosity=0,**param)
        
        SplitMatrix=kfold_split_np(X,y, n_splits)
        NumberIterations=np.zeros(n_splits)
        aucVector=np.zeros(n_splits)
        counter=0
        for split in SplitMatrix:
            X_train, X_test, y_train, y_test=split[0],split[1],split[2],split[3]

            if type(Model).__name__=="LGBMClassifier":
                Model.fit(X_train,y_train,eval_set=[(X_test, y_test)],eval_metric=metric)
            else: 
                Model.fit(X_train, y_train,eval_set=[(X_test, y_test)],verbose=False)
            
            y_proba = Model.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
            aucVector[counter]=auc 
            if type(Model).__name__=="LGBMClassifier":
                NumberIterations[counter]=Model.best_iteration_ 
            else: 
                NumberIterations[counter]=Model.best_iteration 
            counter+=1
        meanIteration=np.mean(NumberIterations)
        meanAuc=np.mean(aucVector)
        if BestAucScore<meanAuc:
            BestAucScore=meanAuc
            BestModel=param
            BestIteration=meanIteration
    BestModel["n_estimators"]=int(round(BestIteration))
    with open(f"savedModels/{type(Model).__name__}.json", "w") as f:
        json.dump(BestModel, f, indent=4)
    return BestAucScore,BestModel

In [141]:
def CatBoostTransformer(Numerical=['Transaction.Amount', 'Customer.Age','Account.Age.Days','Quantity']): 
    column_transformer = ColumnTransformer([
        ('time_features', TimeTransformer(),["Transaction.Date","Transaction.Hour"]), 
        ("high_amount",HighAmountTransformer(),["Transaction.Amount"]),
        ("numerical",StandardScaler(),Numerical), 
        ("age",AgeTransfomer(),["Customer.Age"]), 
        ("dropColumns",'drop',["Transaction.Date","Transaction.Hour"])],remainder="passthrough") 
    return column_transformer

def MergeNumCat(X):
    CatTransformer=CatBoostTransformer() 
    CatTransformer.fit(X) 
    X_transformed=CatTransformer.transform(X)
    categorical=[col for col in range(X_transformed.shape[1]) if len(np.unique(X_transformed[:, col]))<6] 
    return X_transformed,categorical

    
def CatBoostSearch(scale_pos_weight,x,y): 
    param_grid_cat = {
    'iterations': [100],           # Number of boosting iterations (trees)
    'learning_rate': [0.001],     # Step size shrinkage
    'depth': [4],                      # Depth of each tree
    'l2_leaf_reg': [1],                # L2 regularization
    'border_count': [32],           # Number of splits for numerical features                  
    'scale_pos_weight': [scale_pos_weight],           # Class weight scaling (for imbalanced data)
    'grow_policy': ['SymmetricTree'],  
    'colsample_bylevel': [1.0],         # Subsample ratio of columns for each split level
    'min_data_in_leaf': [1],         # Minimum samples in leaf
    'max_leaves': [31]              
    }
    
    
    score, params = GridCatBoostSearch(param_grid_cat,x,y)
    return  score, params
def GridCatBoostSearch(params,df_X,y,metric="AUC",n_splits=5):
    BestModel=None 
    BestAucScore=float("-inf")
    BestIteration=None
    X,cat_features=MergeNumCat(df_X)
  
    cat_features=list(cat_features)
    X[:, cat_features]=X[:, cat_features].astype(str)
    print(X[:, cat_features])
    for param in  list(ParameterGrid(params)): 
        
        Model=CatBoostClassifier(eval_metric=metric,early_stopping_rounds =15,random_state=42,**param)
        SplitMatrix=kfold_split_np(X,y, n_splits)
        aucVector=np.zeros(n_splits)
        NumberIterations=np.zeros(n_splits) 
        counter=0
        for split in SplitMatrix:
            X_train, X_test, y_train, y_test=split[0],split[1],split[2],split[3]
            train_pool=Pool(data=X_train,label=y_train,cat_features=cat_features)
            validation_pool = Pool(data=X_test,label=y_test,cat_features=cat_features)            
            Model.fit(train_pool,eval_set=validation_pool)
            y_proba = Model.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
            aucVector[counter]=auc 
            NumberIterations[counter]=Model.get_best_iteration()
            counter+=1
        meanAuc=np.mean(aucVector)
        meanBestIteration=np.mean(NumberIterations)
        if BestAucScore<meanAuc:
            BestAucScore=meanAuc
            BestModel=param
            BestIteration=meanBestIteration
    BestModelo["iterations"]=int(round(BestIterationt))
    with open(f"savedModels/CatBoostClassifier.json", "w") as f:
        json.dump(BestModel, f, indent=4)
    return BestAucScore,BestModel

In [None]:
## NOT SURE IF IT WORKS

def XgbImportance(model): 
    if type(Model).__name__!="XGBClassifier":
        raise ValueError("What are you doing man?") 
    return model.get_booster().get_score(importance_type='gain')
    
def LGBMIMportance(model): 
     if type(Model).__name__!="LGBMClassifier": 
         raise ValueError("What are you doing man?") 
    importance = model.booster_.feature_importance(importance_type='gain')
    feature_names = model.booster_.feature_name()
    importance_dict = dict(zip(feature_names, importance))
    return importance_dict

def CatImportance(model,X_train): 
    if type(Model).__name__!="CatBoostClassifier": 
         raise ValueError("What are you doing man?") 
    importances = model.get_feature_importance()
    feature_names = X_train.columns
    importance_dict = dict(zip(feature_names, importances))    
    return importance_dict 

In [142]:
X,y=KCrossData() 
#BestAucScore,BestModel=XgboostSearch(GetScalePosWeight(y),X,y)
#BestAucScore,BestModel=LightgbmSearch(X,y)
#best_params=AdaBestModel(X,y)
#BestAucScore,BestModel=CatBoostSearch(GetScalePosWeight(y),X,y)

[['1.0' '0.0' '0.0' ... 'electronics' 'tablet' '1']
 ['0.0' '0.0' '0.0' ... 'home & garden' 'desktop' '1']
 ['1.0' '0.0' '0.0' ... 'toys & games' 'desktop' '1']
 ...
 ['0.0' '0.0' '0.0' ... 'health & beauty' 'mobile' '1']
 ['0.0' '0.0' '0.0' ... 'toys & games' 'mobile' '1']
 ['1.0' '0.0' '0.0' ... 'electronics' 'desktop' '1']]
0:	test: 0.6711545	best: 0.6711545 (0)	total: 57.4ms	remaining: 5.69s
1:	test: 0.6757643	best: 0.6757643 (1)	total: 120ms	remaining: 5.87s
2:	test: 0.6835863	best: 0.6835863 (2)	total: 173ms	remaining: 5.59s
3:	test: 0.7206248	best: 0.7206248 (3)	total: 232ms	remaining: 5.56s
4:	test: 0.7316697	best: 0.7316697 (4)	total: 286ms	remaining: 5.43s
5:	test: 0.7420456	best: 0.7420456 (5)	total: 340ms	remaining: 5.32s
6:	test: 0.7406872	best: 0.7420456 (5)	total: 385ms	remaining: 5.12s
7:	test: 0.7385874	best: 0.7420456 (5)	total: 428ms	remaining: 4.92s
8:	test: 0.7389617	best: 0.7420456 (5)	total: 471ms	remaining: 4.76s
9:	test: 0.7393565	best: 0.7420456 (5)	total: 518

KeyboardInterrupt: 