## Preprocessing (pipeline)

In [53]:
import pandas as pd
import numpy as np
#from sklearn import set_config
#set_config(transform_output="pandas")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from category_encoders import WOEEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

#### Load the data

In [54]:
data_train = pd.read_csv('../datasets/train.csv')
data_test = pd.read_csv('../datasets/test.csv')

pd.set_option('display.max_columns', None)

In [55]:
total_samples = data_train.shape[0] + data_test.shape[0]
train_distribution_percentage = (data_train.shape[0] / total_samples) * 100
test_distribution_percentage = (data_test.shape[0] / total_samples) * 100

print(f"Training Set Distribution: {train_distribution_percentage:.2f}% ({data_train.shape[0]} rows)")
print(f"Testing Set Distribution: {test_distribution_percentage:.2f}% ({data_test.shape[0]} rows)")

Training Set Distribution: 74.99% (5044 rows)
Testing Set Distribution: 25.01% (1682 rows)


### Pre-processing ideas
- missing values: drop or impute? Maybe just do median imputation because there’s so little
- imbalance in target (15% vs 85%): use stratified CV! Evaluate with proper metrics! Use ensemble of models! Data augmentation (e.g. undersampling or SMOTE) or using class weights? 
- gender: one-hot encoding (binary indicator 1/0)
- tariff: weights of evidence or one-hot encoding (ordinality or not?)
- handset: WOE
- Usage_Band: ordinal so take this into account but also WOE maybe
- tariff_OK, high dropped calls and No Usage might be very uninformative because extremely imbalanced – if we use: one-hot encoding for all (change tariff_OK values to OK vs High, so regrouping the High CAT 100, High CAT 50 and High Play 100)
- for numerical ones i'm not sure, maybe some form of outlier detection and potentially some WOE

Random Forests, being an ensemble of decision trees, are generally not sensitive to the scale of numeric features. The reason is that decision trees make splits based on feature values but do not rely on the absolute scale of those values. Therefore, in many cases, scaling is not a strict requirement when using Random Forests. --> no standardization for now so we keep interpretability

#### Missing values

In [56]:
# For training data
missing_count = data_train.isnull().sum() 
missing_data = pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
missing_data = missing_data.sort_values(by='Missing Count', ascending=False)
missing_data = missing_data[missing_data['Missing Count'] > 0]
missing_data

Unnamed: 0,Column Name,Missing Count
22,Dropped_calls_ratio,4
23,Usage_Band,4
25,call_cost_per_min,4


In [57]:
rows_with_missing_values_train = data_train[data_train.isnull().any(axis=1)]
print("Rows with Missing Values in training data:")
rows_with_missing_values_train

Rows with Missing Values in training data:


Unnamed: 0,Gender,Age,Connect_Date,L_O_S,Dropped_Calls,tariff,Handset,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Dropped_calls_ratio,Usage_Band,Mins_charge,call_cost_per_min,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio,high Dropped calls,No Usage,target,id
1736,F,48.0,26/07/98,26.966667,2.0,Play 100,BS110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,0,K244380
3237,F,34.0,22/03/97,43.333333,2.0,Play 100,BS110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,0,K244320
3836,M,21.0,03/01/96,58.133333,2.0,Play 100,CAS30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,1,K213590
4301,F,22.0,08/08/98,26.533333,5.0,Play 100,CAS30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,1,K212820


In [58]:
# For test data
missing_count = data_test.isnull().sum() 
missing_data = pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
missing_data = missing_data.sort_values(by='Missing Count', ascending=False)
missing_data = missing_data[missing_data['Missing Count'] > 0]
missing_data

Unnamed: 0,Column Name,Missing Count
22,Dropped_calls_ratio,1
23,Usage_Band,1
25,call_cost_per_min,1


In [59]:
rows_with_missing_values_test = data_test[data_test.isnull().any(axis=1)]
print("\nRows with Missing Values in test data:")
rows_with_missing_values_test


Rows with Missing Values in test data:


Unnamed: 0,Gender,Age,Connect_Date,L_O_S,Dropped_Calls,tariff,Handset,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Dropped_calls_ratio,Usage_Band,Mins_charge,call_cost_per_min,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio,high Dropped calls,No Usage,id,target
647,F,33.0,08/09/98,25.5,2.0,Play 100,BS110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,-600.0,,0.0,0.0,59.94,OK,0.5,0.0,0.0,0.0,0.0,F,T,K243820,0


**we will impute this since it's so little rows**

In [60]:
data_train = data_train.dropna()
data_test = data_test.dropna()

In [61]:
target_column = 'target'

# Separate features and target variable
X_train = data_train.drop(target_column, axis=1)
y_train = data_train[target_column]

X_test = data_test.drop(target_column, axis=1)
y_test = data_test[target_column]

In [62]:
print(X_train.shape[0], y_train.shape[0])
print(X_test.shape[0], y_test.shape[0])

5040 5040
1681 1681


don't know if this should be done after splitting or not

In [63]:
def process_date_column(data, date_column):
    # Convert the date column to datetime format
    data[date_column] = pd.to_datetime(data[date_column], format='%d/%m/%y')

    # Find the earliest date
    earliest_date = data[date_column].min()

    # Convert the date column to days since the earliest date
    data[date_column] = (data[date_column] - earliest_date).dt.days

    return data

X_train = process_date_column(X_train, 'Connect_Date')
X_test = process_date_column(X_test, 'Connect_Date')

#### Check the correlation

#### Split data into train and validation set

In [64]:
stratified_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, valid_index in stratified_splitter.split(X_train, y_train):
    X_train_split, X_valid_split = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]
    # Now you can use X_train_split, y_train_split for training and X_valid_split, y_valid_split for validation

#### Pipeline

In [65]:
X_train_split['Tariff_OK'] = np.where(X_train_split['Tariff_OK'] == 'OK', 1, 0)
X_valid_split['Tariff_OK'] = np.where(X_valid_split['Tariff_OK'] == 'OK', 1, 0)
X_test['Tariff_OK'] = np.where(X_test['Tariff_OK'] == 'OK', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_split['Tariff_OK'] = np.where(X_train_split['Tariff_OK'] == 'OK', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid_split['Tariff_OK'] = np.where(X_valid_split['Tariff_OK'] == 'OK', 1, 0)


In [66]:
X_train_split.head()

Unnamed: 0,Gender,Age,Connect_Date,L_O_S,Dropped_Calls,tariff,Handset,Peak_calls_Sum,Peak_mins_Sum,OffPeak_calls_Sum,OffPeak_mins_Sum,Weekend_calls_Sum,Weekend_mins_Sum,International_mins_Sum,Nat_call_cost_Sum,AvePeak,AveOffPeak,AveWeekend,National_calls,National mins,AveNational,All_calls_mins,Dropped_calls_ratio,Usage_Band,Mins_charge,call_cost_per_min,actual call cost,Total_call_cost,Total_Cost,Tariff_OK,average cost min,Peak ratio,OffPeak ratio,Weekend ratio,Nat-InterNat Ratio,high Dropped calls,No Usage,id
0,F,50.0,870,29.2,2.0,Play 100,BS210,62.0,153.0,185.0,438.600001,4.0,29.0,126.002615,2.045727,2.467742,2.370811,7.25,251.0,620.600001,2.47251,746.602616,0.003984,Med,20.600001,9.930712,2.045727,52.446773,112.386773,1,0.150531,0.246536,0.706735,0.046729,0.203034,F,F,K262360
1,M,25.0,350,46.533333,1.0,CAT 100,ASAD90,146.0,718.8,98.0,164.7,4.0,37.2,251.580636,41.072379,4.923288,1.680612,9.3,248.0,920.7,3.7125,1172.280636,0.002016,Med,320.7,12.807103,41.07238,116.546571,221.546571,1,0.188988,0.78071,0.178886,0.040404,0.273249,F,F,K170160
2,F,46.0,604,38.066667,1.0,CAT 50,WC95,160.0,322.8,7.0,123.9,0.0,0.0,91.584877,21.575073,2.0175,17.7,0.0,167.0,446.7,2.67485,538.284877,0.002994,MedLow,146.7,12.226326,17.936021,45.411484,128.811484,1,0.2393,0.722633,0.277367,0.0,0.205025,F,F,K331610
3,F,59.0,924,27.4,1.0,CAT 50,BS110,84.0,317.400001,57.0,161.699999,0.0,0.0,23.998036,20.950771,3.778571,2.836842,0.0,141.0,479.1,3.397872,503.098036,0.003546,MedLow,179.1,11.624922,20.820235,28.019646,111.419646,1,0.221467,0.662492,0.337508,0.0,0.05009,F,F,K332460
4,F,25.0,1103,21.433333,1.0,Play 300,WC95,14.0,309.6,326.0,637.8,6.0,14.4,87.051515,0.0,22.114286,1.956442,2.4,346.0,961.8,2.779769,1048.851515,0.001445,Med,-838.2,9.403618,0.0,34.820606,112.760606,1,0.107509,0.321896,0.663132,0.014972,0.090509,F,F,K394220


In [69]:
# Define columns to drop
columns_to_drop = ['id']  # Add more columns if needed

# Define columns for different encoding methods
one_hot_encode_columns = ['Gender', 'high Dropped calls', 'No Usage']
woe_encode_columns = ['tariff', 'Handset']
ordinal_encode_columns = ['Usage_Band']
numeric_columns = X_train_split.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X_train_split.select_dtypes(include=['object']).columns
categorical_columns = [col for col in categorical_columns if col != 'id']

# Define the preprocessing steps for each column
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', columns_to_drop),
        ('impute_median', SimpleImputer(strategy='median'), numeric_columns),
        ('impute_mode', SimpleImputer(strategy='most_frequent'), categorical_columns),
        ('one_hot_encode', OneHotEncoder(drop='first'), one_hot_encode_columns),
        ('woe_encode', WOEEncoder(), woe_encode_columns),
        ('ordinal_encode', OrdinalEncoder(categories=[['Low', 'MedLow', 'Med', 'MedHigh', 'High']]), ordinal_encode_columns)
    ])

# Build the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('to_dataframe', ArrayToDataFrame())  # Add this step to convert the output to a DataFrame
])

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Build the full pipeline with preprocessing and model
full_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline),
    ('model', rf_classifier)
])

repeat last 2 code lines for the other 2 models

In [70]:
# Define evaluation metrics
def profit_at_top_20(y_true, y_probabilities, top_k=20):
    # Extract probabilities for positive class
    churn_probabilities = y_probabilities[:, 1]

    # Sort customers by predicted probabilities in descending order
    sorted_indices = sorted(range(len(churn_probabilities)), key=lambda k: churn_probabilities[k], reverse=True)

    # Identify the top-20 customers
    top_20_indices = sorted_indices[:top_k]

    # Calculate profit at top-20
    profit = sum(y_true[i] * churn_probabilities[i] for i in top_20_indices)

    return profit

# Define custom scorer for use in GridSearchCV or RandomizedSearchCV
profit_at_top_20_scorer = make_scorer(profit_at_top_20, greater_is_better=True)

# Build parameter grid for hyperparameter tuning
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

In [72]:
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train_split, y_train_split)
X_train_preprocessed

ValueError: Shape of passed values is (4032, 41), indices imply (4032, 38)

In [50]:
# Create GridSearchCV instance
grid_search = GridSearchCV(full_pipeline, param_grid, scoring={'profit_at_top_20': profit_at_top_20_scorer, 'auc': 'roc_auc'},
                           refit='profit_at_top_20', cv=5, verbose=2, n_jobs=-1)

# Fit the GridSearchCV on training data
grid_search.fit(X_train_split, y_train_split)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


ValueError: 
All the 405 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\ensemble\_forest.py", line 363, in fit
    X, y = self._validate_data(
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\utils\validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\utils\_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'F'


In [None]:
# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Evaluate on the validation set
y_valid_probabilities = best_model.predict_proba(X_valid_split)
profit_at_top_20_score = profit_at_top_20(y_valid_split, y_valid_probabilities)
auc_score = roc_auc_score(y_valid_split, y_valid_probabilities[:, 1])

print(f'Profit at Top-20: {profit_at_top_20_score}')
print(f'AUC: {auc_score}')

# Access the best hyperparameters
best_hyperparameters = grid_search.best_params_
print(f'Best Hyperparameters: {best_hyperparameters}')

Fitting 5 folds for each of 81 candidates, totalling 405 fits


ValueError: 
All the 405 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'gender'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\utils\__init__.py", line 505, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'gender'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\pipeline.py", line 543, in fit_transform
    return last_step.fit_transform(
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\utils\_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\compose\_column_transformer.py", line 906, in fit_transform
    self._validate_column_callables(X)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\compose\_column_transformer.py", line 496, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "c:\Users\lenne\anaconda3\envs\AA\lib\site-packages\sklearn\utils\__init__.py", line 513, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [None]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.926829268292683


## Models

use these:
- RF
- lightgbm
- xgboost

imbalanced data so:
- use stratified CV to ensure that each fold maintains the class distribution
- evaluate with proper metrics (as given by the prof)
- use an ensemble of models (hence the above techniques - but also they are best techniques to work with tabular data)
- possibly do data augmentation with techniques like SMOTE to make the distribution more balanced

also this exists to deal with imbalance:

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(class_weight='balanced')

In [None]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(scale_pos_weight=(1 - y.sum() / len(y)))

In [None]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(is_unbalance=True)
# or
lgb_model = lgb.LGBMClassifier(scale_pos_weight=(1 - y.sum() / len(y)))