## Preprocessing (pipeline)

In [46]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output = "pandas")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from category_encoders import WOEEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

#### Load the data

In [27]:
data_train = pd.read_csv('../datasets/train.csv')
data_test = pd.read_csv('../datasets/test.csv')

pd.set_option('display.max_columns', None)

In [28]:
total_samples = data_train.shape[0] + data_test.shape[0]
train_distribution_percentage = (data_train.shape[0] / total_samples) * 100
test_distribution_percentage = (data_test.shape[0] / total_samples) * 100

print(f"Training Set Distribution: {train_distribution_percentage:.2f}% ({data_train.shape[0]} rows)")
print(f"Testing Set Distribution: {test_distribution_percentage:.2f}% ({data_test.shape[0]} rows)")

Training Set Distribution: 74.99% (5044 rows)
Testing Set Distribution: 25.01% (1682 rows)


### Pre-processing ideas
- missing values: drop or impute? Maybe just do median imputation because there’s so little
- imbalance in target (15% vs 85%): use stratified CV! Evaluate with proper metrics! Use ensemble of models! Data augmentation (e.g. undersampling or SMOTE) or using class weights? 
- gender: one-hot encoding (binary indicator 1/0)
- tariff: weights of evidence or one-hot encoding (ordinality or not?)
- handset: WOE
- Usage_Band: ordinal so take this into account but also WOE maybe
- tariff_OK, high dropped calls and No Usage might be very uninformative because extremely imbalanced – if we use: one-hot encoding for all (change tariff_OK values to OK vs High, so regrouping the High CAT 100, High CAT 50 and High Play 100)
- for numerical ones i'm not sure, maybe some form of outlier detection and potentially some WOE

Random Forests, being an ensemble of decision trees, are generally not sensitive to the scale of numeric features. The reason is that decision trees make splits based on feature values but do not rely on the absolute scale of those values. Therefore, in many cases, scaling is not a strict requirement when using Random Forests. --> no standardization for now so we keep interpretability

#### Missing values

In [29]:
# For training data
missing_count = data_train.isnull().sum() 
missing_data = pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
missing_data = missing_data.sort_values(by='Missing Count', ascending=False)
missing_data = missing_data[missing_data['Missing Count'] > 0]
missing_data

Unnamed: 0,Column Name,Missing Count
22,Dropped_calls_ratio,4
23,Usage_Band,4
25,call_cost_per_min,4


In [30]:
rows_with_missing_values_train = data_train[data_train.isnull().any(axis=1)]
print("Rows with Missing Values in training data:")
rows_with_missing_values_train

Rows with Missing Values in training data:


Unnamed: 0,Gender,Age,Connect_Date,L_O_S,Dropped_Calls,tariff,Handset,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Dropped_calls_ratio,Usage_Band,Mins_charge,call_cost_per_min,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio,high Dropped calls,No Usage,target,id
1736,F,48.0,26/07/98,26.966667,2.0,Play 100,BS110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,0,K244380
3237,F,34.0,22/03/97,43.333333,2.0,Play 100,BS110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,0,K244320
3836,M,21.0,03/01/96,58.133333,2.0,Play 100,CAS30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,1,K213590
4301,F,22.0,08/08/98,26.533333,5.0,Play 100,CAS30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,1,K212820


In [31]:
# For test data
missing_count = data_test.isnull().sum() 
missing_data = pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
missing_data = missing_data.sort_values(by='Missing Count', ascending=False)
missing_data = missing_data[missing_data['Missing Count'] > 0]
missing_data

Unnamed: 0,Column Name,Missing Count
22,Dropped_calls_ratio,1
23,Usage_Band,1
25,call_cost_per_min,1


In [32]:
rows_with_missing_values_test = data_test[data_test.isnull().any(axis=1)]
print("\nRows with Missing Values in test data:")
rows_with_missing_values_test


Rows with Missing Values in test data:


Unnamed: 0,Gender,Age,Connect_Date,L_O_S,Dropped_Calls,tariff,Handset,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Dropped_calls_ratio,Usage_Band,Mins_charge,call_cost_per_min,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio,high Dropped calls,No Usage,id,target
647,F,33.0,08/09/98,25.5,2.0,Play 100,BS110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,K243820,0


**we will impute this since it's so little rows**

In [33]:
data_train = data_train.dropna()
data_test = data_test.dropna()

In [34]:
target_column = 'target'

# Separate features and target variable
X_train = data_train.drop(target_column, axis=1)
y_train = data_train[target_column]

X_test = data_test.drop(target_column, axis=1)
y_test = data_test[target_column]

In [35]:
print(X_train.shape[0], y_train.shape[0])
print(X_test.shape[0], y_test.shape[0])

5040 5040
1681 1681


don't know if this should be done after splitting or not

In [36]:
def process_date_column(data, date_column):
    # Convert the date column to datetime format
    data[date_column] = pd.to_datetime(data[date_column], format='%d/%m/%y')

    # Find the earliest date
    earliest_date = data[date_column].min()

    # Convert the date column to days since the earliest date
    data[date_column] = (data[date_column] - earliest_date).dt.days

    return data

X_train = process_date_column(X_train, 'Connect_Date')
X_test = process_date_column(X_test, 'Connect_Date')

#### Check the correlation

#### Split data into train and validation set

In [37]:
stratified_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, valid_index in stratified_splitter.split(X_train, y_train):
    X_train_split, X_valid_split = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
    # Now you can use X_train_split, y_train_split for training and X_valid_split, y_valid_split for validation

#### Pipeline

In [38]:
X_train_split['Tariff_OK'] = np.where(X_train_split['Tariff_OK'] == 'OK', 1, 0)
X_valid_split['Tariff_OK'] = np.where(X_valid_split['Tariff_OK'] == 'OK', 1, 0)
X_test['Tariff_OK'] = np.where(X_test['Tariff_OK'] == 'OK', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_split['Tariff_OK'] = np.where(X_train_split['Tariff_OK'] == 'OK', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid_split['Tariff_OK'] = np.where(X_valid_split['Tariff_OK'] == 'OK', 1, 0)


In [39]:
X_train_split.head()

Unnamed: 0,Gender,Age,Connect_Date,L_O_S,Dropped_Calls,tariff,Handset,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Dropped_calls_ratio,Usage_Band,Mins_charge,call_cost_per_min,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio,high Dropped calls,No Usage,id
0,F,50.0,870,29.2,2.0,Play 100,BS210,62.0,153.0,185.0,438.600001,4.0,29.0,126.002615,2.045727,2.467742,2.370811,7.25,251.0,620.600001,2.47251,746.602616,0.003984,Med,20.600001,9.930712,2.045727,52.446773,112.386773,1,0.150531,0.246536,0.706735,0.046729,0.203034,F,F,K262360
1,M,25.0,350,46.533333,1.0,CAT 100,ASAD90,146.0,718.8,98.0,164.7,4.0,37.2,251.580636,41.072379,4.923288,1.680612,9.3,248.0,920.7,3.7125,1172.280636,0.002016,Med,320.7,12.807103,41.07238,116.546571,221.546571,1,0.188988,0.78071,0.178886,0.040404,0.273249,F,F,K170160
2,F,46.0,604,38.066667,1.0,CAT 50,WC95,160.0,322.8,7.0,123.9,0.0,0.0,91.584877,21.575073,2.0175,17.7,0.0,167.0,446.7,2.67485,538.284877,0.002994,MedLow,146.7,12.226326,17.936021,45.411484,128.811484,1,0.2393,0.722633,0.277367,0.0,0.205025,F,F,K331610
3,F,59.0,924,27.4,1.0,CAT 50,BS110,84.0,317.400001,57.0,161.699999,0.0,0.0,23.998036,20.950771,3.778571,2.836842,0.0,141.0,479.1,3.397872,503.098036,0.003546,MedLow,179.1,11.624922,20.820235,28.019646,111.419646,1,0.221467,0.662492,0.337508,0.0,0.05009,F,F,K332460
4,F,25.0,1103,21.433333,1.0,Play 300,WC95,14.0,309.6,326.0,637.8,6.0,14.4,87.051515,0.0,22.114286,1.956442,2.4,346.0,961.8,2.779769,1048.851515,0.001445,Med,-838.2,9.403618,0.0,34.820606,112.760606,1,0.107509,0.321896,0.663132,0.014972,0.090509,F,F,K394220


In [40]:
# Custom transformer to remove prefix from column names
class RemovePrefixTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, prefixes):
        self.prefixes = prefixes

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for prefix in self.prefixes:
            X.columns = [col.split(f'{prefix}__')[1] if f'{prefix}__' in col else col for col in X.columns]
        return X

In [49]:
# Define columns to drop
columns_to_drop = ['id']  # Add more columns if needed

# Define columns for different encoding methods
one_hot_encode_columns = ['Gender', 'high Dropped calls', 'No Usage']
woe_encode_columns = ['tariff', 'Handset']
ordinal_encode_columns = ['Usage_Band']
impute_num = ['Dropped_calls_ratio', 'call_cost_per_min']
impute_cat = ['Usage_Band']
#numeric_columns = X_train_split.select_dtypes(include=['int64', 'float64']).columns
#categorical_columns = X_train_split.select_dtypes(include=['object']).columns
#categorical_columns = [col for col in categorical_columns if col != 'id']

# Define the preprocessing steps for each column
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', columns_to_drop),
        ('impute_median', SimpleImputer(strategy='median'), impute_num),
        #('impute_mode', SimpleImputer(strategy='most_frequent'), impute_cat),
        ('one_hot_encode', OneHotEncoder(drop='first', sparse_output=False), one_hot_encode_columns),
        ('WOE_encode', WOEEncoder(), woe_encode_columns),
        ('ordinal_encode', OrdinalEncoder(categories=[['Low', 'MedLow', 'Med', 'MedHigh', 'High']]), ['Usage_Band']) #ordinal_encode_columns
    ],
    remainder='passthrough'  # Keep the remaining columns as they are
)

# Build the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('remove_prefix', RemovePrefixTransformer(prefixes=['impute_median', 'one_hot_encode', 'WOE_encode', 'ordinal_encode', 'remainder']))  # Add this step to remove the prefix
])

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42) #class_weight='balanced' - but gives same AUC on validation set

lgb_classifier = lgb.LGBMClassifier(is_unbalance=True)
#lgb_classifier = lgb.LGBMClassifier(scale_pos_weight=(1 - y_train_split.sum() / len(y_train_split)))

xgb_classifier= xgb.XGBClassifier(scale_pos_weight=(1 - y_train_split.sum() / len(y_train_split)))

# Build the full pipeline with preprocessing and model
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', rf_classifier)
])

lgb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', lgb_classifier)
])

xgb_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', xgb_classifier)
])

repeat last 2 code lines for the other 2 models

In [42]:
# Define evaluation metrics
def profit_at_top_20(y_true, y_probabilities, top_k=20):
    # Extract probabilities for positive class
    churn_probabilities = y_probabilities[:, 1]

    # Sort customers by predicted probabilities in descending order
    sorted_indices = sorted(range(len(churn_probabilities)), key=lambda k: churn_probabilities[k], reverse=True)

    # Identify the top-20 customers
    top_20_indices = sorted_indices[:top_k]

    # Calculate profit at top-20
    profit = sum(y_true[i] * churn_probabilities[i] for i in top_20_indices)

    return profit

# Define custom scorer for use in GridSearchCV or RandomizedSearchCV
profit_at_top_20_scorer = make_scorer(profit_at_top_20, greater_is_better=True)

In [43]:
# Assuming X_train_split is your training data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train_split, y_train_split)
X_train_preprocessed

Unnamed: 0,Dropped_calls_ratio,call_cost_per_min,Gender_M,high Dropped calls_T,tariff,Handset,Usage_Band,Age,Connect_Date,L_O_S,Dropped_Calls,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Mins_charge,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio
0,0.003984,9.930712,0.0,0.0,0.386262,-1.000185,2.0,50.0,870,29.200000,2.0,62.0,153.000000,185.0,438.600001,4.0,29.000000,126.002615,2.045727,2.467742,2.370811,7.250000,251.0,620.600001,2.472510,746.602616,20.600001,2.045727,52.446773,112.386773,1,0.150531,0.246536,0.706735,0.046729,0.203034
1,0.002016,12.807103,1.0,0.0,-0.057123,3.060143,2.0,25.0,350,46.533333,1.0,146.0,718.800000,98.0,164.700000,4.0,37.200000,251.580636,41.072379,4.923288,1.680612,9.300000,248.0,920.700000,3.712500,1172.280636,320.700000,41.072380,116.546571,221.546571,1,0.188988,0.780710,0.178886,0.040404,0.273249
2,0.002994,12.226326,0.0,0.0,0.106161,-2.481120,1.0,46.0,604,38.066667,1.0,160.0,322.800000,7.0,123.900000,0.0,0.000000,91.584877,21.575073,2.017500,17.700000,0.000000,167.0,446.700000,2.674850,538.284877,146.700000,17.936021,45.411484,128.811484,1,0.239300,0.722633,0.277367,0.000000,0.205025
3,0.003546,11.624922,0.0,0.0,0.106161,-0.038773,1.0,59.0,924,27.400000,1.0,84.0,317.400001,57.0,161.699999,0.0,0.000000,23.998036,20.950771,3.778571,2.836842,0.000000,141.0,479.100000,3.397872,503.098036,179.100000,20.820235,28.019646,111.419646,1,0.221467,0.662492,0.337508,0.000000,0.050090
4,0.001445,9.403618,0.0,0.0,-0.558449,-2.481120,2.0,25.0,1103,21.433333,1.0,14.0,309.600000,326.0,637.800000,6.0,14.400000,87.051515,0.000000,22.114286,1.956442,2.400000,346.0,961.800000,2.779769,1048.851515,-838.200000,0.000000,34.820606,112.760606,1,0.107509,0.321896,0.663132,0.014972,0.090509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,0.007246,13.427203,0.0,0.0,-0.057123,-0.038773,2.0,22.0,1270,15.866667,1.0,26.0,721.200000,14.0,75.600001,29.0,59.000000,104.796650,34.346783,27.738462,5.400000,2.034483,69.0,855.800001,12.402899,960.596651,255.800001,34.346784,65.785779,170.785779,1,0.177791,0.842720,0.088338,0.068941,0.122455
5039,0.002591,8.079475,0.0,0.0,-0.009528,3.060143,3.0,16.0,623,37.433333,2.0,151.0,1169.400001,201.0,657.900000,34.0,71.400001,242.983418,56.451295,7.744371,3.273134,2.100000,386.0,1898.700002,4.918912,2141.683420,698.700002,56.451295,129.346320,279.346320,1,0.130433,0.615895,0.346500,0.037605,0.127974
5040,0.001852,10.557687,0.0,0.0,-0.057123,2.996780,2.0,29.0,271,49.166667,1.0,135.0,405.600000,124.0,301.200001,11.0,23.000000,174.140881,13.703878,3.004444,2.429032,2.090909,270.0,729.800001,2.702963,903.940882,129.800001,13.703878,65.946142,170.946142,1,0.189112,0.555769,0.412716,0.031515,0.238615
5042,0.009524,11.464996,1.0,0.0,0.386262,-0.149000,1.0,46.0,790,31.866667,2.0,72.0,112.200000,31.0,230.700000,2.0,4.200000,59.510484,0.000000,1.558333,7.441935,2.100000,105.0,347.100000,3.305714,406.610484,-252.900000,0.000000,23.804194,83.744194,1,0.205957,0.323250,0.664650,0.012100,0.171451


used a separate custom classifier to keep the names as original variables

#### Random Forest

In [19]:
''' 
# Create GridSearchCV instance
grid_search = GridSearchCV(full_pipeline, param_grid, scoring={'profit_at_top_20': profit_at_top_20_scorer, 'auc': 'roc_auc'},
                           refit='profit_at_top_20', cv=5, verbose=2, n_jobs=-1)

# Fit the GridSearchCV on training data
grid_search.fit(X_train_split, y_train_split)
'''

" \n# Create GridSearchCV instance\ngrid_search = GridSearchCV(full_pipeline, param_grid, scoring={'profit_at_top_20': profit_at_top_20_scorer, 'auc': 'roc_auc'},\n                           refit='profit_at_top_20', cv=5, verbose=2, n_jobs=-1)\n\n# Fit the GridSearchCV on training data\ngrid_search.fit(X_train_split, y_train_split)\n"

In [20]:
# Build parameter grid for hyperparameter tuning
rf_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5], # maximum number of levels allowed in each decision tree
    'model__min_samples_split': [2, 4, 5, 6, 8, 10],
    'model__min_samples_leaf': [1, 2, 3, 4]
}

# Create GridSearchCV instance
grid_search = GridSearchCV(rf_pipeline, rf_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', cv=5, verbose=2, n_jobs=-1)

# Fit the GridSearchCV on training data
grid_search.fit(X_train_split, y_train_split)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


In [21]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Evaluate on the validation set
y_valid_probabilities = best_model.predict_proba(X_valid_split)
#profit_at_top_20_score = profit_at_top_20(y_valid_split, y_valid_probabilities)
auc_score = roc_auc_score(y_valid_split, y_valid_probabilities[:, 1])

#print(f'Profit at Top-20: {profit_at_top_20_score}')
print(f'AUC on Validation Set: {auc_score}')

# Access the best hyperparameters
best_hyperparameters_RF = grid_search.best_params_
print(f'Best Hyperparameters: {best_hyperparameters_RF}')

# Evaluate on the test set
y_test_probabilities = best_model.predict_proba(X_test)
auc_score_test = roc_auc_score(y_test, y_test_probabilities[:, 1])

print(f'AUC on Test Set: {auc_score_test}')

AUC: 0.9278230500582073


#### LGB

In [53]:
# Create parameter grid for LightGBM hyperparameter tuning
lgb_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
}

# Create GridSearchCV instance for LightGBM
lgb_grid_search = GridSearchCV(lgb_pipeline, lgb_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', cv=5, verbose=2, n_jobs=1)

# Fit the GridSearchCV on training data for LightGBM
lgb_grid_search.fit(X_train_split, y_train_split)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[LightGBM] [Info] Number of positive: 475, number of negative: 2750
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7070
[LightGBM] [Info] Number of data points in the train set: 3225, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.147287 -> initscore=-1.756041
[LightGBM] [Info] Start training from score -1.756041
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=150; total time=   0.8s
[LightGBM] [Info] Number of positive: 475, number of negative: 2750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7072
[LightGBM] [Info] N

In [54]:
# Get the best LightGBM model from the grid search
best_lgb_model = lgb_grid_search.best_estimator_

# Evaluate on the validation set for LightGBM
y_valid_probabilities_lgb = best_lgb_model.predict_proba(X_valid_split)
auc_score_lgb = roc_auc_score(y_valid_split, y_valid_probabilities_lgb[:, 1])

print(f'AUC for LightGBM on Validation Set: {auc_score_lgb}')

# Access the best hyperparameters for LightGBM
best_hyperparameters_LGB = lgb_grid_search.best_params_
print(f'Best Hyperparameters for LightGBM: {best_hyperparameters_LGB}')

# Evaluate on the test set for LightGBM
y_test_probabilities_lgb = best_lgb_model.predict_proba(X_test)
auc_score_test_lgb = roc_auc_score(y_test, y_test_probabilities_lgb[:, 1])

print(f'AUC for LightGBM on Test Set: {auc_score_test_lgb}')

AUC for LightGBM on Validation Set: 0.9374799790610279
Best Hyperparameters for LightGBM: {'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__n_estimators': 160}
AUC for LightGBM on Test Set: 0.9353459919411115


#### XGB

In [55]:
# Create parameter grid for XGBoost hyperparameter tuning
xgb_param_grid = {
    'model__n_estimators': [150, 160, 170, 180, 190, 200],
    'model__max_depth': [1, 2, 3, 4, 5],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
}

# Create GridSearchCV instance for XGBoost
xgb_grid_search = GridSearchCV(xgb_pipeline, xgb_param_grid, scoring={'auc': 'roc_auc'}, refit='auc', cv=5, verbose=2, n_jobs=1)

# Fit the GridSearchCV on training data for XGBoost
xgb_grid_search.fit(X_train_split, y_train_split)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=150; total time=   0.6s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=150; total time=   0.8s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=150; total time=   0.6s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=150; total time=   0.5s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=150; total time=   0.9s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=160; total time=   1.3s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=160; total time=   0.8s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=160; total time=   0.5s
[CV] END model__learning_rate=0.01, model__max_depth=1, model__n_estimators=160; total time=   0.5s
[CV] END model__learning_rate=0.01, m

In [56]:
# Get the best XGBoost model from the grid search
best_xgb_model = xgb_grid_search.best_estimator_

# Evaluate on the validation set for XGBoost
y_valid_probabilities_xgb = best_xgb_model.predict_proba(X_valid_split)
auc_score_xgb = roc_auc_score(y_valid_split, y_valid_probabilities_xgb[:, 1])

print(f'AUC for XGBoost on Validation Set: {auc_score_xgb}')

# Access the best hyperparameters for XGBoost
best_hyperparameters_XGB = xgb_grid_search.best_params_
print(f'Best Hyperparameters for XGBoost: {best_hyperparameters_XGB}')

# Evaluate on the test set for XGBoost
y_test_probabilities_xgb = best_xgb_model.predict_proba(X_test)
auc_score_test_xgb = roc_auc_score(y_test, y_test_probabilities_xgb[:, 1])

print(f'AUC for XGBoost on Test Set: {auc_score_test_xgb}')

AUC for XGBoost on Validation Set: 0.9428319178692252
Best Hyperparameters for XGBoost: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 190}
AUC for XGBoost on Test Set: 0.9405094207955339


## Models

use these:
- RF
- lightgbm
- xgboost

imbalanced data so:
- use stratified CV to ensure that each fold maintains the class distribution
- evaluate with proper metrics (as given by the prof)
- use an ensemble of models (hence the above techniques - but also they are best techniques to work with tabular data)
- possibly do data augmentation with techniques like SMOTE to make the distribution more balanced

also this exists to deal with imbalance:

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(class_weight='balanced')

In [None]:
xgb_model = xgb.XGBClassifier(scale_pos_weight=(1 - y.sum() / len(y)))

In [None]:
# or lgb_model = lgb.LGBMClassifier(is_unbalance=True)
lgb_model = lgb.LGBMClassifier(scale_pos_weight=(1 - y.sum() / len(y)))