In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import joblib

from xgboost import XGBClassifier

df = pd.read_csv("data/dataset.csv")
df.head()

df.info()
df.describe()

X = df.drop(columns=["target_variable"])
y = df["target_variable"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Add this to your code after the train_test_split
print(y_train.value_counts(normalize=True))

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



model = XGBClassifier(
    n_estimators=100,  # nombre d’arbres
    max_depth=12,  # profunditat màxima dels arbres
    learning_rate=0.1,  # taxa d’aprenentatge
    subsample=0.9,  # fracció de dades per cada arbre
    colsample_bytree=0.8,  # fracció de features per cada arbre
    scale_pos_weight=1,  # ajust per classes desbalancejades (pots modificar-ho)
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35899 entries, 0 to 35898
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          35899 non-null  int64  
 1   product_A_sold_in_the_past  35899 non-null  float64
 2   product_B_sold_in_the_past  35899 non-null  float64
 3   product_A_recommended       35899 non-null  float64
 4   product_A                   35899 non-null  float64
 5   product_C                   35899 non-null  float64
 6   product_D                   35899 non-null  float64
 7   cust_hitrate                35899 non-null  float64
 8   cust_interactions           35899 non-null  float64
 9   cust_contracts              35899 non-null  float64
 10  opp_month                   35899 non-null  float64
 11  opp_old                     35899 non-null  float64
 12  competitor_Z                35899 non-null  int64  
 13  competitor_X                358

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8105849582172702
F1: 0.7996464348850914
Precision: 0.7958944281524927
Recall: 0.8034339846062759


In [18]:
param_grid = {
    'n_estimators': [100, 200, 500, 700],
    'max_depth': [3, 5, 7, 10],  # <-- Note the sane values
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],      # Row sampling
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0] # Column sampling
}

xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss",
    scale_pos_weight=1 # Explicitly set to 1 (or just remove it, 1 is default)
)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=25,
    cv=3,
    scoring='f1',  # Still optimizing for F1!
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)

# 4. Get the best model
print(f"Best parameters found: {random_search.best_params_}")
best_model = random_search.best_estimator_

# 5. Evaluate the best model
y_pred = best_model.predict(X_test_scaled)

print("--- Best Model Performance ---")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=700, subsample=0.8; total time=   2.3s
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=700, subsample=0.8; total time=   2.3s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=500, subsample=1.0; total time=   2.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=3, n_estimators=700, subsample=0.8; total time=   2.7s
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=3, n_estimators=500, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=3, n_estimators=500, subsample=1.0; total time=   0.5s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=7, n_estimators=500, subsample=0.9; total time=   2.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=700, subsample=1.0; total time=   3.0s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=500, subsample=1.0; total time=   3.1s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8; total time=   3.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=3, n_estimators=500, subsample=1.0; total time=   3.1s
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=3, n_estimators=500, subsample=1.0; total time=   3.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8; total time=   3.2s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=7, n_estimat

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, n_estimators=100, subsample=0.9; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, n_estimators=700, subsample=0.7; total time=   3.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, n_estimators=700, subsample=0.7; total time=   3.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, n_estimators=700, subsample=0.7; total time=   3.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=10, n_estimators=100, subsample=0.9; total time=   1.0s
[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=700, subsample=1.0; total time=   4.1s
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=5, n_estimators=700, subsample=0.9; total time=   4.0s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=700, subsample=1.0; total time=   4.1s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=7, n_estimato

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=5, n_estimators=700, subsample=0.9; total time=   4.3s[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8; total time=   1.1s

[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=7, n_estimators=500, subsample=0.9; total time=   4.4s
[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=3, n_estimators=700, subsample=0.8; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=3, n_estimators=700, subsample=0.8; total time=   1.0s
[CV] END colsample_bytree=0.9, learning_rate=0.2, max_depth=5, n_estimators=700, subsample=0.9; total time=   4.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7; total time=   0.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=7, n_estimators=100, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.9, learning_rate=0.1, max_depth=3, n_estimators=700, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=7, n_estimators=500, subsample=0.9; total time=   4.8s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=7, n_estimators=100, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=7, n_estimators=100, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.7; total time=   1.2s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.7; total time=   1.3s
[CV] END colsample_bytree=1.0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=500, subsample=0.7; total time=   1.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.9, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.9; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, n_estimators=500, subsample=0.8; total time=   1.5s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=7, n_estimators=700, subsample=0.8; total time=   2.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, n_estimators=500, subsample=0.8; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=10, n_estimato

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters found: {'subsample': 1.0, 'n_estimators': 700, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
--- Best Model Performance ---
Accuracy: 0.8133704735376045
F1: 0.8027671474830733
Precision: 0.7983021077283372
Recall: 0.8072824156305506
