# Model Selection with Logistic Regression, Random Forest Classifier and Neural Networks (MLP Classifier)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Read in selected features
# The loaded frame carries the feature names in its "Selected Features" column;
# that column is used further down to pick the model input columns out of the
# training data.
selected_features = pd.read_csv("./preprocessed_data/selected_features.csv")

In [4]:
# Display the table of selected feature names
selected_features

Unnamed: 0,Selected Features
0,customer_profile_01
1,customer_profile_02
2,customer_profile_03
3,customer_profile_04
4,customer_spend_01
5,customer_spend_02
6,customer_spend_03
7,customer_spend_04
8,customer_spend_05
9,customer_spend_06


In [21]:
# Load dataset
# NOTE(review): the file name suggests missing values were already imputed
# upstream; the NaN count in the next cell is what actually checks this.
train_data = pd.read_parquet("./preprocessed_data/imputed_train_data.parquet")

In [25]:
# Count NaN values per selected feature
# Computed straight from train_data so this cell works on a fresh kernel
# (selected_train_data is only created in a later cell).
col_nan_counts = (
    train_data[selected_features["Selected Features"].tolist()]
    .isna()
    .sum()
    .to_frame(name="NaN counts")
    .reset_index()
    .rename(columns={"index": "Feature names"})
)
col_nan_counts # NO MORE NAs

Unnamed: 0,Feature names,NaN counts
0,customer_profile_01,0
1,customer_profile_02,0
2,customer_profile_03,0
3,customer_profile_04,0
4,customer_spend_01,0
5,customer_spend_02,0
6,customer_spend_03,0
7,customer_spend_04,0
8,customer_spend_05,0
9,customer_spend_06,0


In [31]:
# Sanity check: .tolist() on the "Selected Features" column yields a plain
# Python list, so it can be concatenated with another list of column names.
type(selected_features['Selected Features'].tolist())

list

In [33]:
# Select features
# Keep the identifier / label columns next to the selected model features.
base_cols = ["customer", "merchant", "ind_recommended", "activation"]
feature_cols = selected_features['Selected Features'].tolist()

# .copy() makes this an explicitly independent frame, so the target columns
# added to it later are ordinary assignments rather than writes to a slice of
# train_data (which is what triggers pandas' SettingWithCopyWarning).
selected_train_data = train_data[base_cols + feature_cols].copy()


In [34]:
# Preview the first rows of the selected training data
selected_train_data.head()

Unnamed: 0,customer,merchant,ind_recommended,activation,customer_profile_01,customer_profile_02,customer_profile_03,customer_profile_04,customer_spend_01,customer_spend_02,customer_spend_03,customer_spend_04,customer_spend_05,customer_spend_06,customer_spend_07,customer_spend_18,customer_spend_19,customer_digital_activity_01,customer_digital_activity_02,customer_digital_activity_11,customer_digital_activity_20,customer_digital_activity_21,customer_digital_activity_22,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_merchant_03,distance_01,distance_02,distance_04,distance_05,merchant_profile_02,merchant_profile_03,merchant_spend_01,merchant_spend_02,merchant_spend_07,merchant_spend_08,merchant_spend_09,merchant_spend_10
0,168972,152285,0,0,5466.06,1700.0,58.434969,86.0,107.215862,14.0,133.0,4477.0,29719.09,782.0,306.0,0.714531,20.85,0.0,32.5,0.0,0.444444,0.017921,0.000468,26.686594,74.0,3682.75,138.0,111.0,0.90551,0.307692,1.219756,4.0,15.856826,0.157534,65923.0,29.781042,43.0,1429.49,48.0,49466.0,29.18
1,212404,39032,0,0,781.56,597.41,5.392089,125.0,35.552,2.0,8.0,17577.0,1051.4,52.0,43.0,0.871597,24.81,0.419355,7.0,0.0,0.641575,0.005845,0.000123,50.928261,3.0,1171.35,23.0,17.0,0.961583,3.808333,6.998555,3.808333,6.998555,0.084416,7801.0,34.643313,97.0,5646.86,163.0,3638.0,28.465
2,225178,7439,0,0,1457.84,1200.0,33.780445,180.0,31.623103,11.0,62.0,49494.0,4695.22,196.0,136.0,0.076536,32.26,0.836364,0.0,0.0,0.0,0.0,0.0,48.837872,19.0,2295.38,47.0,42.0,0.327672,11.352941,0.129853,30.822421,1.753009,0.24,12868.0,1731.0,2.0,3462.0,2.0,3912.0,421.5
3,183948,485069,0,0,351.22,500.0,37.340085,134.0,112.277391,16.0,33.0,147211.0,5190.94,167.0,112.0,0.173581,21.016154,0.952381,28.666667,19.0,0.786111,0.0,0.0,22.22144,4.25,323.92125,14.125,13.6875,0.769936,11.352941,1.948002,2.0,9.000063,0.1875,23553.0,54.8,4.0,274.0,5.0,28919.0,50.0
4,210107,536004,1,0,831.67,99.0,77.794164,114.0,448.427273,5.0,8.0,45.0,11713.96,33.0,28.0,0.275255,90.645,0.754386,15.0,15.0,0.801169,0.0,0.0,368.501048,5.5,11415.2825,23.5,19.75,0.651198,11.352941,1.948002,6.5,1.767939,0.428571,308.0,166.0,1.0,166.0,1.0,1086.0,69.509


In [35]:
# Target definition for case 1001
# The four digits are the binary labels assigned to events A, B, C and D:
# A: recommended + activated
# B: NOT recommended + activated
# C: recommended + NOT activated
# D: NOT recommended + NOT activated
def calculate_target_1001(row):
    """Return the case-1001 label for a single row.

    The label is 1 when ``ind_recommended`` and ``activation`` hold the same
    value (events A and D) and 0 when they differ (events B and C).

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys ``"ind_recommended"`` and
        ``"activation"`` (for example a DataFrame row passed by ``apply``).

    Returns
    -------
    int
        1 or 0.
    """
    return 1 if row["ind_recommended"] == row["activation"] else 0

In [36]:
# Calculate new target columns
# Vectorised comparisons replace the row-wise DataFrame.apply(axis=1), which
# builds a Series per row and is very slow on large frames; the resulting 0/1
# labels are the same.
recommended = selected_train_data["ind_recommended"]
activated = selected_train_data["activation"]

# Case 1000: 1 only when the row is both recommended and activated.
selected_train_data["target_1000"] = ((recommended == 1) & (activated == 1)).astype(int)

# Case 1001: 1 when both flags agree -- same rule as calculate_target_1001.
selected_train_data["target_1001"] = (recommended == activated).astype(int)

In [37]:
# Split the frame into model inputs and the two candidate targets.
# Identifier columns, the raw flags and the derived targets are not inputs.
non_feature_cols = [
    "customer",
    "merchant",
    "ind_recommended",
    "activation",
    "target_1000",
    "target_1001",
]
features = selected_train_data.drop(columns=non_feature_cols)
target_1000, target_1001 = (
    selected_train_data["target_1000"],
    selected_train_data["target_1001"],
)

In [38]:
# Separate data into train and test sets (70% / 30%), using target_1000 as label
# stratify keeps the share of positive labels the same in both splits, so the
# test-set ROC AUC / accuracy are not skewed by an unlucky random split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_1000,
    test_size=0.3,
    random_state=42,
    stratify=target_1000,
)

In [39]:
# Learn the scaling statistics from the training features only ...
scaler = StandardScaler().fit(X_train)

# ... then apply that same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [40]:
# Define models
# - LogisticRegression: max_iter is raised above the library default because
#   the default fit stopped at the iteration limit (ConvergenceWarning in the
#   training output) before the solver converged.
# - RandomForest / MLP: a fixed random_state makes repeated runs comparable.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Neural Network': MLPClassifier(random_state=42)
}

# Define hyperparameters to tune for each model
# Keys must match the keys of `models`; each value is the search grid for that
# estimator (parameter name -> list of candidate values).
params = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'Neural Network': {'hidden_layer_sizes': [(50,), (100,), (100, 50)], 'alpha': [0.0001, 0.001, 0.01]}
}

In [41]:
# Perform hyperparameter tuning and evaluate models
# Every grid in `params` has fewer than 10 combinations, so an exhaustive
# GridSearchCV evaluates the same candidates a RandomizedSearchCV with
# n_iter=10 would end up trying, without the "parameter space is smaller than
# n_iter" warning and without any dependence on random sampling.
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, params[model_name], cv=5, scoring='roc_auc')
    grid_search.fit(X_train_scaled, y_train)

    # Evaluate the best model on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict_proba(X_test_scaled)[:, 1]  # Predict probabilities for positive class
    accuracy = accuracy_score(y_test, y_pred.round())  # rounding applies a 0.5 decision threshold
    roc_auc = roc_auc_score(y_test, y_pred)

    results[model_name] = {
        'best_model': best_model,
        # Only the tuned values; get_params() would list every estimator parameter.
        'best_params': grid_search.best_params_,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
    }

# Print results
print("\nResults:")
for model_name, result in results.items():
    print(f"{model_name}:")
    print(f"  Best hyperparameters: {result['best_params']}")
    print(f"  Accuracy: {result['accuracy']:.4f}")
    print(f"  ROC AUC: {result['roc_auc']:.4f}")
    print()

Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

: 

: 