In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys

In [None]:
# import reusable functions from utils directory
# The path is relative to the notebook's working directory. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate entry
# to sys.path each time.
UTILS_DIR = '../../utils'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)
import my_utils

# **Cleaning Data Conclusions**

## Day 2
- `merged_final_demo_final_experiment_clients_df`

# Final Demo + Final Experiment Clients (Merged DataFrame)

In [None]:
# Read the merged demo + experiment-clients CSV from the clean data folder,
# then preview its first rows as the cell output.
merged_clients_csv = '../../data/clean/merged_final_demo_final_experiment_clients_df.csv'
final_demo_df = pd.read_csv(merged_clients_csv)
final_demo_df.head()

In [None]:
# Run the shared inspect_dataframe helper from my_utils on the loaded frame.
# NOTE(review): the helper is defined outside this notebook; presumably it
# reports shape / dtypes / summary information — confirm against my_utils.
my_utils.inspect_dataframe(final_demo_df)

In [None]:
# Run the shared check_unique_and_empty helper from my_utils on the loaded frame.
# NOTE(review): the helper is defined outside this notebook; judging by its name
# it reports unique and empty/missing values per column — confirm against my_utils.
my_utils.check_unique_and_empty(final_demo_df)

### Table Overview

**Columns:**

- `client_id (int)`: A unique identifier for each client, used to distinguish one client from another in the dataset.
- `client_tenure_years (int)`: The number of years a client has been associated with the company. For example, a client with client_tenure_years = 6 has been with the company for 6 years.
- `client_tenure_months (int)`: The number of months a client has been associated with the company. This value is often more granular than client_tenure_years and could be used for more detailed analysis. For instance, a tenure of 6 years and 1 month would be represented as 73 months.
- `client_age (int)`: The age of the client in years.
- `gender`: The gender of the client. The value can be "Male," "Female," or "Unspecified"; "Unspecified" indicates that the gender was not recorded.
- `num_accounts (int)`: The number of accounts the client has with the company.
- `balance (float)`: The total balance of the client's accounts with the company. This is a monetary value, and the balance can indicate how much money the client holds across their accounts.
- `calls_last_6_months (int)`: The number of calls the client has made to the company in the past six months. This can give an idea of how actively the client has engaged with the company.
- `logons_last_6_months (int)`: The number of times the client has logged into their account or interacted with the company online in the past six months.
- `variation (object)`: This column likely indicates whether the client is part of a control group or a test group for an experiment. Each client is labeled as one of "Test", "Control", or "Unknown".

## Day 1 & 2 (Week 5)

### **Client behavior analysis**

Answer the following questions about demographics:

- Who are the primary clients using this online process?
- Are the primary clients younger or older, new or long-standing?
- Next, carry out a client behaviour analysis to answer any additional relevant questions you think are important.

In [None]:
# Demographic overview of the clients in final_demo_df: which clients count as
# "active", how client ages are distributed, and how long clients have been
# with the company. Produces two histograms and a short printed summary.

# Thresholds used in this cell. They are analyst assumptions (not derived from
# the data); naming them keeps the filter, the printed conclusions and any later
# tuning in sync instead of scattering magic numbers through the code.
ACTIVE_MIN_LOGONS = 10       # "active" = strictly more logons than this in the last 6 months
ACTIVE_MIN_ACCOUNTS = 3      # ... and strictly more accounts than this
AGE_THRESHOLD_YEARS = 40     # mean age below this -> "younger"
TENURE_THRESHOLD_YEARS = 3   # mean tenure below this -> "newer"
HIST_BINS = 10
FIGSIZE = (8, 6)

# Step 1: Analyze Active Clients using the online process
is_active = (
    (final_demo_df['logons_last_6_months'] > ACTIVE_MIN_LOGONS)
    & (final_demo_df['num_accounts'] > ACTIVE_MIN_ACCOUNTS)
)
active_clients = final_demo_df[is_active]

# Step 2: Are the primary clients younger or older?
# NOTE(review): this average (and the tenure average below) is taken over every
# row of final_demo_df, not only over active_clients — confirm that is intended.
average_age = final_demo_df['client_age'].mean()

# age distribution to visualize client ages
fig_age, ax_age = plt.subplots(figsize=FIGSIZE)
sns.histplot(final_demo_df['client_age'], kde=True, color="blue", bins=HIST_BINS, ax=ax_age)
ax_age.set_title('Client Age Distribution')
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Frequency')
plt.show()

# Step 3: Are the primary clients newer or long-standing?
# Calculate the average tenure (in years)
average_tenure = final_demo_df['client_tenure_years'].mean()

# Plot the tenure distribution to visualize how long clients have been with the service
fig_tenure, ax_tenure = plt.subplots(figsize=FIGSIZE)
sns.histplot(final_demo_df['client_tenure_years'], kde=True, color='green', bins=HIST_BINS, ax=ax_tenure)
ax_tenure.set_title('Client Tenure Distribution (in years)')
ax_tenure.set_xlabel('Year duration')
ax_tenure.set_ylabel('Frequency')
plt.show()

# Step 4: Drawing Conclusions

# 1. Active clients using the online process
# Report the size of the group and a small preview rather than printing the
# whole DataFrame, which floods the cell output without adding information.
print('Active clients (higher logons and accounts):')
print(f'{len(active_clients)} of {len(final_demo_df)} clients')
print(active_clients.head())

# 2. Average age of clients
print(f'Average client age: {average_age:.2f} years')

# 3. Average client tenure (years)
print(f'Average client tenure: {average_tenure:.2f} years')

# conclusions:
# The messages interpolate the thresholds so the text can never disagree with
# the comparison that selected it.
if average_age < AGE_THRESHOLD_YEARS:
    print(f'Conclusion: The primary clients are generally younger (under {AGE_THRESHOLD_YEARS} years old).')
else:
    print(f'Conclusion: The primary clients are generally older (above {AGE_THRESHOLD_YEARS} years old).')

if average_tenure < TENURE_THRESHOLD_YEARS:
    print(f'Conclusion: The primary clients are generally newer (under {TENURE_THRESHOLD_YEARS} years).')
else:
    print(f'Conclusion: The primary clients are generally long-standing (over {TENURE_THRESHOLD_YEARS} years).')

## Day 3 (Week 5)

### **Performance Metrics**

**Success Indicators**

You have now been asked to determine which key performance indicators (KPIs) will measure the success of the new design. Use at least completion rate, time spent on each step, and error rates, and add any other KPIs you find relevant.

- **Completion Rate:** The proportion of users who reach the final ‘confirm’ step.
- **Time Spent on Each Step:** The average duration users spend on each step.
- **Error Rates:** If there’s a step where users go back to a previous step, it may indicate confusion or an error. You should consider moving from a later step to an earlier one as an error.