In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [4]:
clients = pd.read_csv('../data/processed/clients.csv')
orders = pd.read_csv('../data/processed/orders.csv')
orders_raw = pd.read_csv('../data/processed/orders_raw.csv')

In [5]:
clients[clients['median_promotor_visits']==0]

Unnamed: 0,client_id,city,channel,promotor_id,frequency,efficiency,efficiency_scaled,total_orders,total_volume,total_income,...,median_promotor_calls,total_promotor_calls,class,zero_visit_flag,avg_orders_per_month,avg_visits_per_month,visit_order_gap,inefficiency_cost,profit_per_visit,opportunity_cost
1,100008050,Barcelona,AR,368568690,1.0,inf,1.5,14,3982.000,1905.59,...,2.0,20.0,HighTicket_Efficient,True,1.166667,0.0,-1.166667,-17.50,0.0,-0.0
5,100137028,Murcia,AR,733727842,2.0,inf,1.5,23,3930.000,1463.61,...,2.0,24.0,LowTicket_Efficient,True,1.916667,0.0,-1.916667,-28.75,0.0,-0.0
6,100163544,Castellon,HR,988985367,2.0,inf,1.5,14,867.174,1799.91,...,0.0,0.0,HighTicket_Efficient,True,1.166667,0.0,-1.166667,-17.50,0.0,-0.0
7,100167744,Barcelona,AR,368568690,1.0,inf,1.5,7,3054.470,1037.01,...,2.0,12.0,HighTicket_Efficient,True,0.583333,0.0,-0.583333,-8.75,0.0,-0.0
8,100175807,Murcia,HR,205150913,8.0,inf,1.5,80,3694.530,4658.89,...,0.0,0.0,LowTicket_Efficient,True,6.666667,0.0,-6.666667,-100.00,0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41925,999447889,Barcelona,HR,767988162,1.0,inf,1.5,5,120.000,180.39,...,0.0,0.0,LowTicket_Efficient,True,0.416667,0.0,-0.416667,-6.25,0.0,-0.0
41926,999468169,Malaga,HR,424808266,1.0,inf,1.5,6,140.000,483.77,...,0.0,0.0,LowTicket_Efficient,True,0.500000,0.0,-0.500000,-7.50,0.0,-0.0
41931,999715264,Valladolid,HR,634459297,1.0,inf,1.5,10,105.640,967.82,...,2.0,18.0,HighTicket_Efficient,True,0.833333,0.0,-0.833333,-12.50,0.0,-0.0
41940,999880656,Valencia,AR,998162842,1.0,inf,1.5,8,2586.000,820.97,...,2.0,16.0,HighTicket_Efficient,True,0.666667,0.0,-0.666667,-10.00,0.0,-0.0


In [13]:
clients.loc[(clients['median_promotor_visits']==0) & (clients['class']=='HighTicket_Efficient')]['class'].unique()

array(['HighTicket_Efficient'], dtype=object)

In [46]:
# Split numerical and categorical features
numerical_features = ['frequency', 'efficiency', 'total_orders', 'total_volume', 'total_income', 'total_cost', 'total_profit', 'median_ticket', 'median_ticket_min', 'median_ticket_max', 'median_ticket_std', 'median_promotor_visits', 'median_promotor_calls', 'frequency_min', 'frequency_max', 'frequency_std', 'efficiency_min', 'efficiency_max', 'efficiency_std', 'avg_orders_per_month', 'avg_visits_per_month', 'visit_order_gap', 'inefficiency_cost', 'profit_per_visit', 'opportunity_cost']
categorical_features = ['client_id', 'promotor_id','city', 'channel', 'class']

features = ['frequency', 'efficiency', 'total_orders', 'total_volume', 'total_income', 'total_cost', 'total_profit', 'median_ticket', 'median_promotor_visits', 'median_promotor_calls', 'avg_orders_per_month', 'avg_visits_per_month', 'visit_order_gap', 'inefficiency_cost', 'profit_per_visit', 'opportunity_cost']

In [48]:
labels = clients.iloc[:, 0].to_list()
X = clients.iloc[:, 4:]
X.drop(columns=['class'], inplace=True)


In [49]:
X.head()

Unnamed: 0,frequency,efficiency,total_orders,total_volume,total_income,total_cost,total_profit,median_ticket,median_ticket_min,median_ticket_max,...,efficiency_min,efficiency_max,efficiency_std,zero_visit_flag,avg_orders_per_month,avg_visits_per_month,visit_order_gap,inefficiency_cost,profit_per_visit,opportunity_cost
0,2.0,1.0,22,1658.706,1494.53,880,614.53,60.99,33.0,123.89,...,0.5,1.5,0.417424,False,1.833333,0.166667,-1.666667,-25.0,307.265,-512.108333
1,1.0,20.0,14,3982.0,1905.59,140,1765.59,132.37,66.96,187.1,...,20.0,20.0,0.0,True,1.166667,0.0,-1.166667,-17.5,0.0,-0.0
2,1.0,0.25,14,1812.85,2243.3,980,1263.3,128.125,29.62,220.82,...,0.25,0.5,0.116775,False,1.166667,0.333333,-0.833333,-12.5,315.825,-263.1875
3,4.0,2.0,16,4590.18,2273.12,640,1633.12,132.64,83.21,276.33,...,2.0,2.0,0.0,False,1.333333,0.166667,-1.166667,-17.5,816.56,-952.653333
4,3.0,3.0,25,1266.5,2204.24,625,1579.24,85.81,26.15,173.95,...,1.0,4.0,0.971825,False,2.083333,0.083333,-2.0,-30.0,1579.24,-3158.48


In [50]:
scaler = StandardScaler()
df_normalized = scaler.fit_transform(X)

# PCA
pca = PCA(n_components=4)
df_pca = pca.fit_transform(df_normalized)

# K-means
kmeans = KMeans(n_clusters=3, random_state=42)
df_kmeans = kmeans.fit_predict(df_normalized)

# Add kmeans labels to the normalized dataframe
df_normalized = pd.DataFrame(df_pca)

df_normalized['cluster'] = df_kmeans


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Stepwise

In [None]:
# Stepwise function
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.05, 
                       threshold_out = 0.1, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.4}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.4}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

### Classification



In [51]:
clients

Unnamed: 0,client_id,city,channel,promotor_id,frequency,efficiency,total_orders,total_volume,total_income,total_cost,...,efficiency_max,efficiency_std,class,zero_visit_flag,avg_orders_per_month,avg_visits_per_month,visit_order_gap,inefficiency_cost,profit_per_visit,opportunity_cost
0,100006690,Madrid,AR,275609911,2.0,1.00,22,1658.706,1494.53,880,...,1.5,0.417424,LowTicket_Inefficient,False,1.833333,0.166667,-1.666667,-25.00,307.265,-512.108333
1,100008050,Barcelona,AR,368568690,1.0,20.00,14,3982.000,1905.59,140,...,20.0,0.000000,HighTicket_Efficient,True,1.166667,0.000000,-1.166667,-17.50,0.000,-0.000000
2,100042162,Barcelona,HR,455263770,1.0,0.25,14,1812.850,2243.30,980,...,0.5,0.116775,HighTicket_Inefficient,False,1.166667,0.333333,-0.833333,-12.50,315.825,-263.187500
3,100046227,Barcelona,AR,454554895,4.0,2.00,16,4590.180,2273.12,640,...,2.0,0.000000,HighTicket_Efficient,False,1.333333,0.166667,-1.166667,-17.50,816.560,-952.653333
4,100125158,Cadiz,HR,483340469,3.0,3.00,25,1266.500,2204.24,625,...,4.0,0.971825,HighTicket_Efficient,False,2.083333,0.083333,-2.000000,-30.00,1579.240,-3158.480000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41945,999934164,Barcelona,HR,480416490,2.0,2.00,23,691.000,785.30,575,...,4.0,1.236033,LowTicket_Efficient,False,1.916667,0.083333,-1.833333,-27.50,210.300,-385.550000
41946,999940211,Barcelona,AR,908993212,1.0,20.00,3,557.820,260.55,30,...,20.0,0.000000,LowTicket_Efficient,True,0.250000,0.000000,-0.250000,-3.75,0.000,-0.000000
41947,999940578,Madrid,AR,275609911,1.0,0.50,13,1101.524,1044.61,520,...,1.0,0.202260,LowTicket_Inefficient,False,1.083333,0.166667,-0.916667,-13.75,262.305,-240.446250
41948,999941988,Madrid,AR,677360818,3.0,1.50,36,5415.150,3828.31,1410,...,2.0,0.369274,HighTicket_Efficient,False,3.000000,0.166667,-2.833333,-42.50,1209.155,-3425.939167


In [None]:
target = "class"
df.describe()

df_model, df_reserved = train_test_split(df, test_size=0.2, random_state=42)
print ("Sample size dataset reserved for prediction: ", df_reserved.shape[0], "records")

### Decision Tree

In [70]:
from sklearn.metrics import auc, roc_curve, f1_score, accuracy_score, precision_recall_curve, confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split
target = "class"

# Split the data into features and target
X = clients.drop(columns=['client_id', 'promotor_id', 'city', 'channel', target], axis=1)
y = clients[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training data set: ", X_train.shape)
print("Testing data set: ", X_test.shape)

Training data set:  (33560, 26)
Testing data set:  (8390, 26)


In [72]:
from sklearn.tree import DecisionTreeClassifier

eda_tree = DecisionTreeClassifier(criterion="entropy", random_state=42)
eda_tree.fit(X_train, y_train)

FileNotFoundError: [Errno 2] No such file or directory: '/opt/homebrew/Caskroom/miniconda/base/envs/capstone/lib/python3.11/site-packages/sklearn/utils/_repr_html/estimator.js'

FileNotFoundError: [Errno 2] No such file or directory: '/opt/homebrew/Caskroom/miniconda/base/envs/capstone/lib/python3.11/site-packages/sklearn/utils/_repr_html/estimator.js'

DecisionTreeClassifier(criterion='entropy', random_state=42)

In [None]:
import os
os.environ["PATH"] += os.pathsep + os.path.join(os.environ["CONDA_PREFIX"], "bin")

import graphviz
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

def plot_tree(tree, feature_names):
    dot_data = StringIO()
    export_graphviz(tree, out_file=dot_data, feature_names=feature_names,
                    filled=True, rounded=True,special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    graph.write_png("churn_train.png")
    return Image(graph.create_png())

In [None]:
plot_tree(eda_tree, X_train.columns)

In [None]:
pred_train = eda_tree.predict(X_train)
print("Accuracy of training set = {0:.2%}".format(accuracy_score(y_train, pred_train)))

pred_test = eda_tree.predict(X_test)
print("Accuracy of testing set = {0:.2%}".format(accuracy_score(y_test, pred_test))