<a href="https://colab.research.google.com/github/benjaminADMU/NBO_BPI/blob/main/P2_Communication_Profile_Synthetic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

### Generated Dataframe based on the Paper

In [2]:
cpq = pd.read_parquet('../Data/Other Data/CUSTOMER INFO.parquet')
cpq = cpq[0:101]

In [3]:
com_df = pd.DataFrame(cpq['CUST_NUM'])

In [4]:
# Generate counts ensuring they are positive integers and reasonable
com_df['count_of_successful_communications'] = np.random.randint(0, 11, size=len(com_df))  # Random successful communications (0 to 10)
com_df['count_of_unsuccessful_communications'] = np.random.randint(0, 11, size=len(com_df))  # Random unsuccessful communications (0 to 10)
com_df['count_of_communications_under_decision'] = np.random.randint(0, 11, size=len(com_df))  # Random under decision communications (0 to 10)
com_df['count_of_expired_offers'] = np.random.randint(0, 11, size=len(com_df))  # Random expired offers (0 to 10)


com_df['number_of_communications'] = (com_df['count_of_successful_communications'] +
                                       com_df['count_of_unsuccessful_communications'] +
                                       com_df['count_of_communications_under_decision'] +
                                       com_df['count_of_expired_offers'])

# Calculate the success rate
com_df['success_rate_of_communications'] = (com_df['count_of_successful_communications'] / com_df['number_of_communications']) * 100  # Success rate as a percentage

# Apply the classification as stated in the paper
def get_class(success_rate_of_communications):
    if success_rate_of_communications <= 20:
        return 'Cold'
    elif success_rate_of_communications <= 40:
        return 'Less Cold'
    elif success_rate_of_communications <= 60:
        return 'Moderate'
    elif success_rate_of_communications <= 80:
        return 'Receptive'
    else:
        return 'Highly Receptive'

com_df['class'] = com_df['success_rate_of_communications'].apply(get_class)

In [5]:
com_df.head()

Unnamed: 0,CUST_NUM,count_of_successful_communications,count_of_unsuccessful_communications,count_of_communications_under_decision,count_of_expired_offers,number_of_communications,success_rate_of_communications,class
0,1215.78123,3,6,9,8,26,11.538462,Cold
1,1357.977909,0,2,8,5,15,0.0,Cold
2,1437.499217,8,3,6,3,20,40.0,Less Cold
3,1606.444521,6,10,1,6,23,26.086957,Less Cold
4,2075.312988,9,5,9,8,31,29.032258,Less Cold


### Generate Multiple Dataframes for different Communication Channel


In [6]:
def generate_communications_data(client_df):
    data = pd.DataFrame({
        'CUST_NUM': client_df['CUST_NUM'],
        'count_of_successful_communications': np.random.randint(0, 11, size=len(client_df)),
        'count_of_unsuccessful_communications': np.random.randint(0, 11, size=len(client_df)),
        'count_of_communications_under_decision': np.random.randint(0, 11, size=len(client_df)),
        'count_of_expired_offers': np.random.randint(0, 11, size=len(client_df))
    })

    data['number_of_communications'] = (
        data['count_of_successful_communications'] +
        data['count_of_unsuccessful_communications'] +
        data['count_of_communications_under_decision'] +
        data['count_of_expired_offers']
    )

    # Calculate success rate
    data['success_rate_of_communications'] = (
        data['count_of_successful_communications'] / data['number_of_communications']
    ) * 100

    return data

    # Apply the classification based on the success rate
    def get_class(success_rate):
        if success_rate <= 20:
            return 'Cold'
        elif success_rate <= 40:
            return 'Less Cold'
        elif success_rate <= 60:
            return 'Moderate'
        elif success_rate <= 80:
            return 'Receptive'
        else:
            return 'Highly Receptive'

    existing_df['class'] = existing_df['success_rate_of_communications'].apply(get_class)

    return data



In [7]:
# Create artificially generated dataframes of success rates of different communication channel
sms_df = generate_communications_data(com_df)
sms_df['channel'] = 'sms'

email_df = generate_communications_data(com_df)
email_df['channel'] = 'email'

phone_df = generate_communications_data(com_df)
phone_df['channel'] = 'phone'

social_media_df = generate_communications_data(com_df)
social_media_df['channel'] = 'social_media'


In [8]:
phone_df

Unnamed: 0,CUST_NUM,count_of_successful_communications,count_of_unsuccessful_communications,count_of_communications_under_decision,count_of_expired_offers,number_of_communications,success_rate_of_communications,channel
0,1215.781230,0,5,7,6,18,0.000000,phone
1,1357.977909,5,4,1,10,20,25.000000,phone
2,1437.499217,10,7,7,10,34,29.411765,phone
3,1606.444521,0,2,4,8,14,0.000000,phone
4,2075.312988,4,8,1,3,16,25.000000,phone
...,...,...,...,...,...,...,...,...
96,307.252339,7,8,8,8,31,22.580645,phone
97,347.367241,1,9,3,0,13,7.692308,phone
98,587.964285,3,0,2,3,8,37.500000,phone
99,594.291174,9,9,6,0,24,37.500000,phone


### Get best channel

#### Best channel in general

In [9]:
combined_df = pd.concat([sms_df, email_df, phone_df, social_media_df], ignore_index=True)

# Group by 'channel' and calculate the mean success rate
average_success_rates = combined_df.groupby('channel')['success_rate_of_communications'].mean().reset_index()

# Identify the channel with the highest average success rate
best_channel = average_success_rates.loc[average_success_rates['success_rate_of_communications'].idxmax()]

# Display the results
print("Average Success Rates by Channel:")
print(average_success_rates)
print("\nBest Channel:")
print(best_channel)


Average Success Rates by Channel:
        channel  success_rate_of_communications
0         email                       26.580434
1         phone                       25.837461
2           sms                       23.168190
3  social_media                       26.782762

Best Channel:
channel                           social_media
success_rate_of_communications       26.782762
Name: 3, dtype: object


#### Best channel for each client

In [10]:
# Sort by client ID and success rate in descending order
combined_df_sorted = combined_df.sort_values(by=['CUST_NUM', 'success_rate_of_communications'], ascending=[True, False])

# Drop duplicates, keeping the first instance (highest success rate) for each client
best_channel_per_client = combined_df_sorted.drop_duplicates(subset='CUST_NUM', keep='first')

In [11]:
best_channel_per_client

Unnamed: 0,CUST_NUM,count_of_successful_communications,count_of_unsuccessful_communications,count_of_communications_under_decision,count_of_expired_offers,number_of_communications,success_rate_of_communications,channel
195,225.042218,7,1,3,0,11,63.636364,email
398,285.734142,10,4,3,3,20,50.000000,social_media
96,307.252339,7,1,9,1,18,38.888889,sms
97,347.367241,3,6,10,0,19,15.789474,sms
199,587.964285,5,1,0,0,6,83.333333,email
...,...,...,...,...,...,...,...,...
391,13870.576196,5,4,7,10,26,19.230769,social_media
392,13895.646944,6,3,4,6,19,31.578947,social_media
191,13908.280411,8,8,7,9,32,25.000000,email
293,14019.293991,10,6,5,6,27,37.037037,phone
