In [13]:
import pandas as pd
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [14]:
# Custom colour cycle, installed below as the seaborn default palette.
my_palette = [
    '#10A19D',
    '#540375',
    '#FF7000',
    '#FFBF00',
    '#dd44aa',
    '#BEBEBE',
    '#112385',
]
sns.set_theme(style='whitegrid')
# set_palette accepts the list of hex codes directly.
sns.set_palette(my_palette)

# Reading data

In [15]:
# Load the raw bookings. The path is relative, so the working directory must
# contain the `data/` folder.
fitness = pd.read_csv('data/fitness_class_2212.csv')
# Bare expression so the notebook renders the frame.
fitness

Unnamed: 0,booking_id,months_as_member,weight,days_before,day_of_week,time,category,attended
0,1,17,79.56,8,Wed,PM,Strength,0
1,2,10,79.01,2,Mon,AM,HIIT,0
2,3,16,74.53,14,Sun,AM,Strength,0
3,4,5,86.12,10,Fri,AM,Cycling,0
4,5,15,69.29,8,Thu,AM,HIIT,0
...,...,...,...,...,...,...,...,...
1495,1496,21,79.51,10,Fri,AM,HIIT,0
1496,1497,29,89.55,2,Mon,AM,Strength,0
1497,1498,9,87.38,4,Tue,AM,HIIT,0
1498,1499,34,68.64,14,Sun,AM,Aqua,0


In [16]:
# Dtypes and non-null counts per column, to spot missing values and mistyped columns.
fitness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   booking_id        1500 non-null   int64  
 1   months_as_member  1500 non-null   int64  
 2   weight            1480 non-null   float64
 3   days_before       1500 non-null   object 
 4   day_of_week       1500 non-null   object 
 5   time              1500 non-null   object 
 6   category          1500 non-null   object 
 7   attended          1500 non-null   int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 93.9+ KB


The only missing values are in the column `weight`. So, there is no need to impute any other column.

# Auxiliary functions

In [17]:
def show_unique(column, df=None):
    """Return the sorted distinct values of one column.

    Parameters
    ----------
    column : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``fitness`` frame.

    Returns
    -------
    pandas.Series
        Distinct values of the column, sorted, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``column`` is not a column of the frame.
    """
    if df is None:
        df = fitness
    if column not in df.columns:
        # Report the available column names rather than interpolating the
        # frame itself, which would dump its whole repr into the message.
        raise ValueError(
            f'Column {column!r} not in dataframe; '
            f'available columns: {list(df.columns)}'
        )
    return df[column].drop_duplicates().sort_values().reset_index(drop=True)

In [18]:
def show_sensitivity(conf_mat):
    """Print the sensitivity (recall of class 1) taken from a 2x2 confusion matrix.

    The value is ``conf_mat[1, 1]`` divided by the sum of row 1. This assumes
    the scikit-learn layout (rows = actual class, columns = predicted class);
    confirm against the caller.
    """
    true_pos = conf_mat[1, 1]
    false_neg = conf_mat[1, 0]
    rate = true_pos / (false_neg + true_pos)
    print(f'Sensitivity: {rate}')

In [19]:
def show_specificity(conf_mat):
    """Print the specificity (recall of class 0) taken from a 2x2 confusion matrix.

    The value is ``conf_mat[0, 0]`` divided by the sum of row 0. This assumes
    the scikit-learn layout (rows = actual class, columns = predicted class);
    confirm against the caller.
    """
    true_neg = conf_mat[0, 0]
    false_pos = conf_mat[0, 1]
    rate = true_neg / (true_neg + false_pos)
    print(f'Specificity: {rate}')

In [20]:

def show_roc_score(estimator, X_test, y_test):
    """Print the ROC AUC of ``estimator`` on ``X_test`` / ``y_test``.

    Scores are column 1 of ``estimator.predict_proba(X_test)``, presumably the
    probability of the positive class; verify against ``estimator.classes_``.
    """
    positive_scores = estimator.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print(f'ROC AUC score: {auc}')

In [21]:
# Human-readable labels keyed by dataframe column name.
plot_name = dict(
    booking_id='Booking ID',
    months_as_member='Months as member',
    weight='Weight',
    days_before='Days before',
    day_of_week='Day of week',
    time='Time',
    category='Category',
    attended='Attended',
)

# Data validation
I will check every variable in its own subsection. At the end, I will add a summary of the results.

## `booking_id`
There are no duplicates, so every value is unique. Also, the values are integers (even though the method `describe` reports them as floats).

In [22]:
# Summary statistics for the identifier column; describe() reports them as floats.
fitness['booking_id'].describe()

count    1500.000000
mean      750.500000
std       433.157015
min         1.000000
25%       375.750000
50%       750.500000
75%      1125.250000
max      1500.000000
Name: booking_id, dtype: float64

In [23]:
# True if any booking_id occurs more than once.
fitness['booking_id'].duplicated().any()

False

## `months_as_member`
As mentioned above, there are no missing values, so no imputation is necessary. The minimum value is 1 as expected.

In [24]:
# Summary statistics for membership length.
fitness['months_as_member'].describe()

count    1500.000000
mean       15.628667
std        12.926543
min         1.000000
25%         8.000000
50%        12.000000
75%        19.000000
max       148.000000
Name: months_as_member, dtype: float64

In [25]:
# Guard: no membership length below 1.
assert fitness['months_as_member'].min() >= 1

## `weight`
There are missing values, so I will impute them with the mean. The minimum value is at least 40, as expected (the observed minimum is 55.41).

In [26]:
# Summary statistics for weight; describe() counts only the non-missing values.
fitness['weight'].describe()

count    1480.000000
mean       82.610378
std        12.765859
min        55.410000
25%        73.490000
50%        80.760000
75%        89.520000
max       170.520000
Name: weight, dtype: float64

In [27]:
# Guard: no weight below 40. min() skips NaN by default, so this check passes
# even while missing weights are still present in the column.
assert fitness['weight'].min() >= 40.0

In [28]:
# Number of missing weights.
fitness['weight'].isna().sum()

20

In [29]:
# Impute missing weights with the column mean. The result is assigned back
# instead of calling fillna(..., inplace=True) on the selected column: under
# pandas Copy-on-Write the in-place call only modifies a temporary object and
# leaves `fitness` unchanged (pandas 2.x warns about this chained assignment).
fitness['weight'] = fitness['weight'].fillna(fitness['weight'].mean())

In [30]:
# Confirm no missing weights remain.
assert fitness['weight'].isna().sum() == 0

## `days_before`
It is a discrete variable, but there is a problem with the values: some have the string `days` at the end, and some have spaces at the beginning. I will remove the string `days` and strip the spaces. The minimum value is 1 as expected.

In [31]:
# List the distinct raw values to spot formatting inconsistencies.
show_unique('days_before')

0           1
1      1 days
2          10
3     10 days
4          11
5          12
6     12 days
7          13
8     13 days
9          14
10    14 days
11         15
12         16
13         17
14          2
15     2 days
16         20
17         29
18          3
19     3 days
20          4
21     4 days
22          5
23     5 days
24          6
25     6 days
26          7
27     7 days
28          8
29     8 days
30          9
Name: days_before, dtype: object

In [32]:
# Normalise `days_before` to integers: drop the optional "days" suffix and any
# surrounding whitespace. Casting to str first keeps the cell re-runnable once
# the column is already integer (the `.str` accessor rejects integer columns).
fitness['days_before'] = (
    fitness['days_before']
    .astype(str)
    .str.replace(r'\s*days', '', regex=True)
    .str.strip()
    .astype(int)
)
show_unique('days_before')

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    20
18    29
Name: days_before, dtype: int64

In [33]:
# Guard: no value below 1 after conversion to int.
assert fitness['days_before'].min() >= 1

## `day_of_week`
There are some misspellings, and the days have a natural order.

In [34]:
# List the distinct raw labels to spot inconsistent spellings.
show_unique('day_of_week')

0          Fri
1         Fri.
2          Mon
3       Monday
4          Sat
5          Sun
6          Thu
7          Tue
8          Wed
9    Wednesday
Name: day_of_week, dtype: object

In [35]:
# Maps each non-standard weekday spelling found in the data to its
# three-letter abbreviation.
day_map = {
    'Fri.': 'Fri',
    'Monday': 'Mon',
    'Wednesday': 'Wed'
}

In [36]:
# Exact-value replacement: labels that are not keys of day_map are left unchanged.
fitness['day_of_week'] = fitness['day_of_week'].replace(day_map)
show_unique('day_of_week')

0    Fri
1    Mon
2    Sat
3    Sun
4    Thu
5    Tue
6    Wed
Name: day_of_week, dtype: object

In [37]:
# Encode weekdays as an ordered categorical running Mon through Sun; any label
# outside this list becomes NaN.
fitness['day_of_week'] = fitness['day_of_week'].astype(
    pd.CategoricalDtype(
        categories=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
        ordered=True,
    )
)
show_unique('day_of_week')

0    Mon
1    Tue
2    Wed
3    Thu
4    Fri
5    Sat
6    Sun
Name: day_of_week, dtype: category
Categories (7, object): ['Mon' < 'Tue' < 'Wed' < 'Thu' < 'Fri' < 'Sat' < 'Sun']

## `time`
There are no errors; all values are valid.

In [38]:
# List the distinct raw values.
show_unique('time')

0    AM
1    PM
Name: time, dtype: object

In [39]:
# Encode the time slot as an ordered categorical with AM before PM.
fitness['time'] = fitness['time'].astype(
    pd.CategoricalDtype(categories=['AM', 'PM'], ordered=True)
)
show_unique('time')

0    AM
1    PM
Name: time, dtype: category
Categories (2, object): ['AM' < 'PM']

## `category`
There is an unknown category `'-'`.

In [40]:
# List the distinct raw values to spot placeholder entries.
show_unique('category')

0           -
1        Aqua
2     Cycling
3        HIIT
4    Strength
5        Yoga
Name: category, dtype: object

In [41]:
# Relabel the '-' placeholder as 'unknown' and store the column as an
# (unordered) categorical with a fixed category list.
# An exact-value replace is used rather than a substring/regex replace, so a
# hyphen inside a real category name would not be rewritten. Casting to object
# first makes the replace behave the same if the cell is re-run on the
# already-categorical column.
fitness['category'] = (
    fitness['category']
    .astype(object)
    .replace({'-': 'unknown'})
    .astype('category')
    .cat.set_categories(['Yoga', 'Aqua', 'Strength', 'HIIT', 'Cycling', 'unknown'])
)
show_unique('category')

0        Yoga
1        Aqua
2    Strength
3        HIIT
4     Cycling
5     unknown
Name: category, dtype: category
Categories (6, object): ['Yoga', 'Aqua', 'Strength', 'HIIT', 'Cycling', 'unknown']

## `attended`
I prefer to use the values `Yes` and `No` instead of `1` and `0` in the analysis. In the model, I will use the original values.

In [42]:
# List the distinct raw values of the target column.
show_unique('attended')

0    0
1    1
Name: attended, dtype: int64

In [44]:
# Relabel the 0/1 target as 'No'/'Yes' and store it as an ordered categorical
# with 'Yes' listed first.
# NOTE(review): this overwrites the numeric encoding in place; any later step
# that needs the 0/1 values has to map them back — confirm downstream usage.
attended_labels = fitness['attended'].replace({0: 'No', 1: 'Yes'})
fitness['attended'] = attended_labels.astype(
    pd.CategoricalDtype(categories=['Yes', 'No'], ordered=True)
)
show_unique('attended')

0    Yes
1     No
Name: attended, dtype: category
Categories (2, object): ['Yes' < 'No']