In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt

# Load data
data = pd.read_csv("../../dataset/cleaned_bank.csv", delimiter=',')
data.info()

# Separate target from the features
X = data.copy()
y = X.pop("y")  # target feature

# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['contact'], drop_first=False) # Drop_first=False keeps all categories

# Check if all discrete features are integers
discrete_features = X.dtypes.apply(lambda dtype: np.issubdtype(dtype, np.integer))

print(data.columns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2403 entries, 0 to 2402
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        2403 non-null   int64  
 1   job        2403 non-null   object 
 2   marital    2403 non-null   object 
 3   education  2403 non-null   object 
 4   default    2403 non-null   object 
 5   balance    2403 non-null   float64
 6   housing    2403 non-null   object 
 7   loan       2403 non-null   object 
 8   contact    2403 non-null   object 
 9   day        2403 non-null   int64  
 10  month      2403 non-null   object 
 11  duration   2403 non-null   int64  
 12  campaign   2403 non-null   int64  
 13  pdays      2403 non-null   int64  
 14  previous   2403 non-null   int64  
 15  poutcome   2403 non-null   object 
 16  y          2403 non-null   object 
dtypes: float64(1), int64(6), object(10)
memory usage: 319.3+ KB
Index(['age', 'job', 'marital', 'education', 'default', 'balance', '

In [21]:
import seaborn as sns

# Create the box plot
plt.figure(figsize=(10, 6))
box_plot = sns.boxplot(x='contact', y='duration', data=data)

# Calculate medians for each contact method
medians = data.groupby(['contact'])['duration'].median()

# Add median values as text labels on the plot
for i, median in enumerate(medians):
    plt.text(i, median, f'{median:.2f}', horizontalalignment='center', color='white', weight='semibold')

# Set the title and show the plot
plt.title('Distribution of Contact Duration by Contact Method (with Medians)')
plt.show()



ValueError: Could not interpret value `contact` for `x`. An entry with this name does not appear in `data`.

<Figure size 1000x600 with 0 Axes>

In [None]:
import scipy.stats as stats

# Perform a test to check if duration varies by contact method
anova_result = stats.f_oneway(data[data['contact'] == 'cellular']['duration'],
                              data[data['contact'] == 'telephone']['duration'],
                              data[data['contact'] == 'unknown']['duration'])

print(' F-statistic:', anova_result.statistic)
print(' p-value:', anova_result.pvalue)


F-statistic: Measures how different the means are across the groups. A larger value indicates more variance between groups. The value in our case is relatively high which means that duration varies across different contact methods.
p-value: The p-value is less than 0.005 that suggests that there are significant differences in the duration across different contact methods.

Conclusion:
Distribution diagram as well as F-statistic and p-value shows that contact method impacts the duration of the call.

In [22]:
# Create interaction features between 'duration' and one-hot encoded contact columns
data['duration_contact_cellular'] = data['duration'] * data['contact_cellular']
data['duration_contact_telephone'] = data['duration'] * data['contact_telephone']
data['duration_contact_unknown'] = data['duration'] * data['contact_unknown']

# Calculate the median duration for each contact method
# For each one-hot encoded contact type, calculate the median of 'duration'
median_cellular = data[data['contact_cellular'] == 1]['duration'].median()
median_telephone = data[data['contact_telephone'] == 1]['duration'].median()
median_unknown = data[data['contact_unknown'] == 1]['duration'].median()

# Create the 'above_median_duration' feature based on one-hot encoded contact types
data['above_median_duration'] = data.apply(
    lambda row: 1 if (
        (row['contact_cellular'] == 1 and row['duration'] > median_cellular) or
        (row['contact_telephone'] == 1 and row['duration'] > median_telephone) or
        (row['contact_unknown'] == 1 and row['duration'] > median_unknown)
    ) else 0, axis=1
)
print(data.columns)

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome', 'y', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'duration_contact_cellular',
       'duration_contact_telephone', 'duration_contact_unknown',
       'above_median_duration'],
      dtype='object')
