In [19]:
# Cell 1: Setup and Imports

import pandas as pd
import numpy as np
import random
import string

# Seed every random source this notebook draws from so that a fresh
# "Restart & Run All" reproduces the same dataset:
#   * NumPy's global generator drives the sampling calls (np.random.*).
#   * The standard-library `random` module builds the CustomerID suffixes,
#     so it has to be seeded as well or the IDs change on every run.
np.random.seed(42)
random.seed(42)


In [20]:
# Cell 2: Parameters and Age Generation

# Define parameters
num_customers = 30000

# Age bands: band i covers ages age_ranges[i] .. age_ranges[i+1] - 1
# (np.random.randint excludes its upper bound) and receives the fraction
# percentile_distribution[i] of all customers.
# NOTE(review): age_percentiles is not read by any code visible in this
# notebook; it is kept in case something else references it.
age_percentiles = [0.25, 0.5, 0.75, 1.0]
age_ranges = [18, 30, 45, 60, 70]
percentile_distribution = [0.3, 0.4, 0.2, 0.1]

# Customers per band. int() truncates, so for some values of num_customers the
# counts add up to fewer than num_customers; sampling without replacement below
# would then raise ValueError. Any shortfall is given to the band with the
# largest share, which distorts the intended proportions the least.
band_counts = [int(num_customers * share) for share in percentile_distribution]
shortfall = num_customers - sum(band_counts)
if shortfall > 0:
    band_counts[int(np.argmax(percentile_distribution))] += shortfall

# Generate ages band by band
ages = []
for i, band_count in enumerate(band_counts):
    ages += list(np.random.randint(age_ranges[i], age_ranges[i + 1], band_count))

# Draw num_customers ages without replacement. When the pool holds exactly
# num_customers values this is a shuffle (it removes the band ordering); when
# the pool is larger it also trims the pool down to num_customers.
ages = np.random.choice(ages, num_customers, replace=False)


In [21]:
# Cell 3: Generate Genders, Locations, and Customer IDs

# Gender label per customer, both labels equally likely.
genders = np.random.choice(
    a=['Male', 'Female'],
    size=num_customers,
    p=[0.5, 0.5],
)

# Parishes a customer can be located in.
parishes_of_jamaica = [
    'Kingston',
    'St. Andrew',
    'St. Thomas',
    'Portland',
    'St. Mary',
    'St. Ann',
    'Trelawny',
    'St. James',
    'Hanover',
    'Westmoreland',
    'St. Elizabeth',
    'Manchester',
    'Clarendon',
    'St. Catherine',
]

# Initial location draw: every parish equally likely.
# NOTE: a later cell reassigns `locations` with an age-dependent draw; this
# call is still kept because it advances NumPy's random state.
locations = np.random.choice(a=parishes_of_jamaica, size=num_customers)

# Function to generate CustomerID
def generate_customer_id():
    """Return a CustomerID: the 'sdv-id-' prefix plus six random characters.

    The six characters are drawn with replacement from ASCII letters and
    digits using the standard-library `random` module (not NumPy), so the
    result depends on that module's seed.
    """
    alphabet = string.ascii_letters + string.digits
    suffix = ''.join(random.choices(alphabet, k=6))
    return 'sdv-id-' + suffix

# Generate CustomerIDs
# A six-character random suffix can repeat across customers, and an identifier
# column must not contain duplicates. Keep drawing until there are
# num_customers distinct IDs. The dict preserves first-seen order, so the
# result equals a plain one-draw-per-customer list whenever no collision occurs.
_unique_ids = {}
while len(_unique_ids) < num_customers:
    _unique_ids.setdefault(generate_customer_id(), None)
customer_ids = list(_unique_ids)


In [22]:
# Cell 4: Generate Income Levels, Education Levels, and Employment Status

# Income level per customer: 40% Low, 40% Medium, 20% High.
income_levels = np.random.choice(
    a=['Low', 'Medium', 'High'],
    size=num_customers,
    p=[0.4, 0.4, 0.2],
)

# Education level per customer: 40% High School, 40% College, 20% Graduate.
education_levels = np.random.choice(
    a=['High School', 'College', 'Graduate'],
    size=num_customers,
    p=[0.4, 0.4, 0.2],
)

# Employment status per customer: 50% Employed, 20% Unemployed,
# 20% Student, 10% Retired.
employment_status = np.random.choice(
    a=['Employed', 'Unemployed', 'Student', 'Retired'],
    size=num_customers,
    p=[0.5, 0.2, 0.2, 0.1],
)


In [23]:
# Cell 5: Assign Products Based on Age and Tenure

# Function to assign products based on age and tenure
def assign_products_by_age(age, tenure):
    """Pick the list of products held by one customer.

    Customers younger than age_ranges[2] are considered for four products in
    a fixed order. Each product is kept when an independent uniform draw falls
    below that product's uptake probability for the customer's age band
    (below age_ranges[1], or from age_ranges[1] up to age_ranges[2]), so the
    list can be empty.

    All other customers receive a duplicate-free random sample from the
    module-level product_types list. The sample size is drawn uniformly from
    1 up to min(5, tenure).

    Returns:
        A list of product-name strings.
    """
    if age < age_ranges[1]:
        uptake = (
            ('Remittances', 0.8),
            ('PeerToPeer Sending', 0.9),
            ('MotorInsurance', 0.1),
            ('MarketPlace', 0.4),
        )
    elif age < age_ranges[2]:
        uptake = (
            ('Remittances', 0.9),
            ('PeerToPeer Sending', 0.6),
            ('MotorInsurance', 0.7),
            ('MarketPlace', 0.7),
        )
    else:
        # randint excludes its upper bound, hence tenure + 1 and the cap of 6.
        sample_size = np.random.randint(1, min(6, tenure + 1))
        return np.random.choice(product_types, sample_size, replace=False).tolist()

    # One uniform draw per candidate product, taken in table order.
    return [name for name, probability in uptake if np.random.rand() < probability]

# Catalogue of products that older customers are sampled from
product_types = ['PeerToPeer Sending', 'Remittances', 'BillPayments', 'MarketPlace', 'MotorInsurance']

# Customer tenure in whole years, 1 to 6 inclusive (randint's upper bound is exclusive)
customer_tenure = np.random.randint(low=1, high=7, size=num_customers)

# One product list per customer, pairing each age with the tenure at the same position
customer_products = list(map(assign_products_by_age, ages, customer_tenure))


In [24]:
# Cell 6: Adjust Locations Based on Age Groups

# Adjust locations based on age groups
def adjust_locations_by_age(age):
    """Draw a parish for one customer, skewing younger customers to three parishes.

    A customer younger than age_ranges[2] has a 60% chance of being placed in
    Kingston, St. Andrew or St. Catherine (equally likely). In every other
    case the parish is drawn uniformly from parishes_of_jamaica. The uniform
    "60%" draw is only made for customers below the age threshold.
    """
    if age < age_ranges[2] and np.random.rand() < 0.6:
        return np.random.choice(['Kingston', 'St. Andrew', 'St. Catherine'])
    return np.random.choice(parishes_of_jamaica)

# Age-dependent parish per customer; this replaces the value `locations`
# was given in the earlier cell.
locations = list(map(adjust_locations_by_age, ages))


In [25]:
# Cell 7: Generate Transaction Data

# Function to generate transaction data based on age group, income level, and tenure
def generate_transaction_data_by_age_income(age, product_name, income_level, tenure):
    """Draw one customer's monthly transaction value and frequency for a product.

    The customer falls into one of four age bands, split at age_ranges[1],
    age_ranges[2] and age_ranges[3]. Unless overridden, the value is uniform
    on [0, 2000) and the frequency is an integer in [0, 20):

    * 'Remittances' uses a band-specific frequency range.
    * 'PeerToPeer Sending' uses a value range that depends on the band and on
      income_level ('Low', 'Medium'; any other level uses the top range).
    * 'MotorInsurance' uses a band-specific frequency range.

    The value is drawn first, then the frequency. The value is finally scaled
    up by 10% for every year of tenure beyond the first.

    Returns:
        A (value, frequency) tuple.
    """
    # Resolve the age band index (0 = youngest ... 3 = oldest).
    if age < age_ranges[1]:
        band = 0
    elif age < age_ranges[2]:
        band = 1
    elif age < age_ranges[3]:
        band = 2
    else:
        band = 3

    # Per-band overrides; rows are indexed by band. randint bounds are
    # (inclusive low, exclusive high).
    remittance_freq_bounds = ((15, 33), (35, 61), (15, 33), (35, 61))
    motor_freq_bounds = ((0, 10), (0, 20), (0, 10), (0, 20))
    # Columns: Low income, Medium income, everything else.
    peer_to_peer_value_bounds = (
        ((0, 5000), (5000, 10000), (10000, 15000)),
        ((0, 10000), (10000, 20000), (20000, 25000)),
        ((0, 15000), (15000, 20000), (20000, 25000)),
        ((0, 15000), (15000, 20000), (20000, 25000)),
    )

    # Defaults shared by every product that has no override.
    value_low, value_high = 0, 2000
    freq_low, freq_high = 0, 20

    if product_name == 'Remittances':
        freq_low, freq_high = remittance_freq_bounds[band]
    elif product_name == 'PeerToPeer Sending':
        if income_level == 'Low':
            tier = 0
        elif income_level == 'Medium':
            tier = 1
        else:
            tier = 2
        value_low, value_high = peer_to_peer_value_bounds[band][tier]
    elif product_name == 'MotorInsurance':
        freq_low, freq_high = motor_freq_bounds[band]

    value = np.random.uniform(value_low, value_high)
    frequency = np.random.randint(freq_low, freq_high)

    # Adjust values based on tenure
    return value * (1 + 0.1 * (tenure - 1)), frequency

# Generate transaction values and frequencies based on age group, income level, and tenure
def generate_transaction_data(customers_ages, customers_products, product_name, income_levels, tenures):
    """Build per-customer value and frequency lists for a single product.

    The four per-customer sequences are walked in parallel. A customer whose
    product list contains product_name gets a (value, frequency) pair from
    generate_transaction_data_by_age_income; every other customer gets 0 for
    both.

    Returns:
        (values, frequencies): two lists with one entry per customer, in
        input order.
    """
    per_customer = [
        generate_transaction_data_by_age_income(age, product_name, income, tenure)
        if product_name in held_products
        else (0, 0)
        for age, held_products, income, tenure in zip(
            customers_ages, customers_products, income_levels, tenures
        )
    ]
    values = [value for value, _ in per_customer]
    frequencies = [frequency for _, frequency in per_customer]
    return values, frequencies

# Generate data for each product type.
# The call order matters: every call consumes NumPy random draws.
peer_to_peer_values, peer_to_peer_freqs = generate_transaction_data(
    customers_ages=ages,
    customers_products=customer_products,
    product_name='PeerToPeer Sending',
    income_levels=income_levels,
    tenures=customer_tenure,
)
remittances_values, remittances_freqs = generate_transaction_data(
    customers_ages=ages,
    customers_products=customer_products,
    product_name='Remittances',
    income_levels=income_levels,
    tenures=customer_tenure,
)
bill_payments_values, bill_payments_freqs = generate_transaction_data(
    customers_ages=ages,
    customers_products=customer_products,
    product_name='BillPayments',
    income_levels=income_levels,
    tenures=customer_tenure,
)
marketplace_values, marketplace_freqs = generate_transaction_data(
    customers_ages=ages,
    customers_products=customer_products,
    product_name='MarketPlace',
    income_levels=income_levels,
    tenures=customer_tenure,
)


In [26]:
# Cell 8: Adjust Linked Bank Account Values

# Adjust linked bank account values
def adjust_linked_bank_values(ages, income_levels, tenures):
    """Draw a linked-bank-account monthly value for every customer.

    One uniform draw is made per customer, with bounds chosen as follows:

    * age in [age_ranges[1], age_ranges[2]) with 'High' income and tenure of
      at least 4: 200000 to 300000
    * any other customer in that age band: 20000 to 100000
    * everyone outside that age band: 20000 to 50000

    Returns:
        A list of floats, one per customer, in input order.
    """
    drawn = []
    for customer_age, customer_income, customer_tenure_years in zip(ages, income_levels, tenures):
        in_focus_band = age_ranges[1] <= customer_age < age_ranges[2]
        if not in_focus_band:
            low, high = 20000, 50000
        elif customer_income == 'High' and customer_tenure_years >= 4:
            low, high = 200000, 300000
        else:
            low, high = 20000, 100000
        drawn.append(np.random.uniform(low, high))
    return drawn

# One linked-account value per customer, aligned with ages / income / tenure.
linked_bank_values = adjust_linked_bank_values(
    ages=ages,
    income_levels=income_levels,
    tenures=customer_tenure,
)


In [27]:
# Cell 9: Generate Activity Levels

# Generate activity level based on employment status
def generate_activity_level(employment_status):
    """Draw an integer activity level whose range depends on employment status.

    Ranges (upper bound exclusive): 'Employed' 10-30, 'Student' 15-25,
    'Unemployed' 5-15. Any other status, such as 'Retired', uses 5-20.
    """
    status_bounds = (
        ('Employed', (10, 30)),
        ('Student', (15, 25)),
        ('Unemployed', (5, 15)),
    )
    for status, (low, high) in status_bounds:
        if employment_status == status:
            return np.random.randint(low, high)
    # Retired, and anything not listed above
    return np.random.randint(5, 20)

# One activity level per customer, in the same order as employment_status.
activity_level = list(map(generate_activity_level, employment_status))


In [31]:
# Cell 10: Create and Save the DataFrame

# Create DataFrame
# (column name, per-customer values) pairs; their order fixes the column order
# of the DataFrame and of the CSV written below.
data = dict([
    ('CustomerID', customer_ids),
    ('Age', ages),
    ('Gender', genders),
    ('Location', locations),
    ('LinkedBankAccountMonthlyValue', linked_bank_values),
    ('ProductType', customer_products),
    ('PeerToPeer_MonthlyTransValue', peer_to_peer_values),
    ('Remittances_MonthlyTransValue', remittances_values),
    ('BillPayments_MonthlyTransValue', bill_payments_values),
    ('RemittancesFreq_Monthly', remittances_freqs),
    ('BillPaymentsFreq_Monthly', bill_payments_freqs),
    ('MarketPlaceFreq_Monthly', marketplace_freqs),
    ('PeerToPeerFreq_Monthly', peer_to_peer_freqs),
    ('IncomeLevel', income_levels),
    ('EducationLevel', education_levels),
    ('EmploymentStatus', employment_status),
    ('CustomerTenure', customer_tenure),
    ('ActivityLevel', activity_level),
])

df = pd.DataFrame(data=data)

# Save the DataFrame to a CSV file, without the row index.
# NOTE(review): the target is an absolute path; it looks like a Colab-style
# /content directory - confirm it exists wherever this notebook is run.
df.to_csv(path_or_buf='/content/Updated_GKONE_Synthetic_Dataset.csv', index=False)

# Display the first 50 rows
df.head(n=50)

Unnamed: 0,CustomerID,Age,Gender,Location,LinkedBankAccountMonthlyValue,ProductType,PeerToPeer_MonthlyTransValue,Remittances_MonthlyTransValue,BillPayments_MonthlyTransValue,RemittancesFreq_Monthly,BillPaymentsFreq_Monthly,MarketPlaceFreq_Monthly,PeerToPeerFreq_Monthly,IncomeLevel,EducationLevel,EmploymentStatus,CustomerTenure,ActivityLevel
0,sdv-id-IN7ITT,25,Female,Trelawny,46601.668557,"[Remittances, PeerToPeer Sending, MarketPlace]",5276.880318,1238.053843,0.0,31,0,9,4,Low,High School,Employed,4,17
1,sdv-id-4hnCCT,43,Female,St. Catherine,64044.610963,"[Remittances, PeerToPeer Sending, MarketPlace]",28503.972689,2965.286891,0.0,56,0,7,7,Medium,Graduate,Student,6,24
2,sdv-id-iOIToj,42,Female,St. Andrew,93085.837759,"[Remittances, PeerToPeer Sending, MotorInsurance]",11195.330762,1468.833594,0.0,41,0,0,5,Low,College,Employed,5,14
3,sdv-id-8TvnUL,69,Male,Kingston,22944.196442,[PeerToPeer Sending],89.461759,0.0,0.0,0,0,0,2,Low,Graduate,Unemployed,1,5
4,sdv-id-QijYGa,27,Female,St. Andrew,47014.788265,[PeerToPeer Sending],7674.463699,0.0,0.0,0,0,0,7,Medium,College,Employed,1,21
5,sdv-id-PpmeEb,44,Male,Kingston,56340.738768,"[Remittances, PeerToPeer Sending, MarketPlace]",5526.267647,1461.346428,0.0,56,0,19,15,Low,College,Employed,1,19
6,sdv-id-qnYvws,56,Female,St. Elizabeth,20663.810887,[Remittances],0.0,1852.559755,0.0,28,0,0,0,Medium,High School,Unemployed,1,5
7,sdv-id-EsLm1l,31,Male,Kingston,23845.991525,"[Remittances, MotorInsurance, MarketPlace]",0.0,1602.324331,0.0,50,0,10,0,High,College,Employed,3,22
8,sdv-id-BuRnMD,42,Female,St. Catherine,292507.231778,"[Remittances, PeerToPeer Sending, MotorInsuran...",30982.728497,871.447039,0.0,57,0,14,7,High,High School,Unemployed,4,13
9,sdv-id-b3VsIs,19,Male,St. Ann,36838.764974,"[Remittances, PeerToPeer Sending]",6871.776559,1891.206707,0.0,21,0,0,12,Medium,College,Retired,1,15


In [29]:
# Display the first 50 rows of the DataFrame
df.head(n=50)

Unnamed: 0,CustomerID,Age,Gender,Location,LinkedBankAccountMonthlyValue,ProductType,PeerToPeer_MonthlyTransValue,Remittances_MonthlyTransValue,BillPayments_MonthlyTransValue,RemittancesFreq_Monthly,BillPaymentsFreq_Monthly,MarketPlaceFreq_Monthly,PeerToPeerFreq_Monthly,IncomeLevel,EducationLevel,EmploymentStatus,CustomerTenure,ActivityLevel
0,sdv-id-IN7ITT,25,Female,Trelawny,46601.668557,"[Remittances, PeerToPeer Sending, MarketPlace]",5276.880318,1238.053843,0.0,31,0,9,4,Low,High School,Employed,4,17
1,sdv-id-4hnCCT,43,Female,St. Catherine,64044.610963,"[Remittances, PeerToPeer Sending, MarketPlace]",28503.972689,2965.286891,0.0,56,0,7,7,Medium,Graduate,Student,6,24
2,sdv-id-iOIToj,42,Female,St. Andrew,93085.837759,"[Remittances, PeerToPeer Sending, MotorInsurance]",11195.330762,1468.833594,0.0,41,0,0,5,Low,College,Employed,5,14
3,sdv-id-8TvnUL,69,Male,Kingston,22944.196442,[PeerToPeer Sending],89.461759,0.0,0.0,0,0,0,2,Low,Graduate,Unemployed,1,5
4,sdv-id-QijYGa,27,Female,St. Andrew,47014.788265,[PeerToPeer Sending],7674.463699,0.0,0.0,0,0,0,7,Medium,College,Employed,1,21
5,sdv-id-PpmeEb,44,Male,Kingston,56340.738768,"[Remittances, PeerToPeer Sending, MarketPlace]",5526.267647,1461.346428,0.0,56,0,19,15,Low,College,Employed,1,19
6,sdv-id-qnYvws,56,Female,St. Elizabeth,20663.810887,[Remittances],0.0,1852.559755,0.0,28,0,0,0,Medium,High School,Unemployed,1,5
7,sdv-id-EsLm1l,31,Male,Kingston,23845.991525,"[Remittances, MotorInsurance, MarketPlace]",0.0,1602.324331,0.0,50,0,10,0,High,College,Employed,3,22
8,sdv-id-BuRnMD,42,Female,St. Catherine,292507.231778,"[Remittances, PeerToPeer Sending, MotorInsuran...",30982.728497,871.447039,0.0,57,0,14,7,High,High School,Unemployed,4,13
9,sdv-id-b3VsIs,19,Male,St. Ann,36838.764974,"[Remittances, PeerToPeer Sending]",6871.776559,1891.206707,0.0,21,0,0,12,Medium,College,Retired,1,15
