### Generate Data

In [11]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the draws made through np.random are repeatable.
np.random.seed(42)

# Number of synthetic customer rows to generate.
number_samples = 3000

# Category vocabularies for the categorical features.
industry = [
    'Automotive',  # typo fix: this label was previously spelled 'Automive'
    'Finance',
    'Healthcare',
    'Retail',
    'Technology',
    'Government',
    'Education',
    'Manufacturing',
    'Food and Beverage',
    'Energy',
    'Business Services',
]
c_level_engagement = ['Low', 'Medium', 'High']
feature_use = ['low', 'medium', 'high']

def random_dates(start, end, n):
    """Draw ``n`` random business days between ``start`` and ``end``.

    The candidate pool is the business-day range (``freq='B'``) that
    ``pd.date_range`` builds from ``start`` to ``end``. Dates are sampled
    with replacement using NumPy's global random state, so results depend
    on the current seed and on any draws made earlier.

    Parameters
    ----------
    start, end
        Range bounds, passed straight through to ``pd.date_range``.
    n
        Number of dates to draw (the ``size`` given to ``np.random.choice``).

    Returns
    -------
    The array produced by ``np.random.choice`` over the candidate dates.
    """
    candidates = pd.date_range(start, end, freq='B')
    sampled = np.random.choice(candidates, n, replace=True)
    return sampled

# Example usage: draw five business days from the 2015-01-01 .. 2023-01-01 range.
# This call draws from NumPy's global RNG, like every other random_dates call.
random_dates(start=pd.to_datetime('2015-01-01'), end=pd.to_datetime('2023-01-01'), n=5)

array(['2018-04-19T00:00:00.000000000', '2019-12-18T00:00:00.000000000',
       '2019-05-02T00:00:00.000000000', '2019-03-14T00:00:00.000000000',
       '2021-04-13T00:00:00.000000000'], dtype='datetime64[ns]')

In [13]:
# Column-wise definition of the synthetic customer table: every value is an
# array of length `number_samples`. Entries are evaluated top to bottom and the
# random ones all draw from NumPy's global RNG, so their order determines the
# generated values.
customer_data = {
    # --- identity and firmographics ---
    'customer_id': np.arange(1, number_samples + 1),
    'customer_size': np.random.normal(loc=1500, scale=250, size=number_samples).astype(int),
    # NOTE: the key is spelled 'annueal_revenue' (sic); kept unchanged in case
    # other cells reference the column by this name.
    'annueal_revenue': np.random.normal(loc=1_000_000, scale=200_000, size=number_samples).round(2),
    'industry': np.random.choice(industry, size=number_samples),

    # --- relationship signals ---
    'c_level_engagement': np.random.choice(
        c_level_engagement, size=number_samples, p=[0.6, 0.2, 0.2]
    ),
    'open_deals': np.random.binomial(n=1, p=0.2, size=number_samples),  # 0/1 flag

    # --- contract dates and recurring revenue ---
    'customer_sign_up_date': random_dates(
        start=pd.to_datetime('2010-01-01'),
        end=pd.to_datetime('2022-12-31'),
        n=number_samples,
    ),
    'next_renewal_date': random_dates(
        start=pd.to_datetime('2023-01-01'),
        end=pd.to_datetime('2025-12-31'),
        n=number_samples,
    ),
    'amount_ARR_cloud': np.random.normal(loc=50_000, scale=10_000, size=number_samples).round(2),
    'amount_ARR_on_prem': np.random.normal(loc=30_000, scale=8_000, size=number_samples).round(2),
    'last_qbr_date': random_dates(
        start=pd.to_datetime('2019-01-01'),
        end=pd.to_datetime('2022-12-31'),
        n=number_samples,
    ),

    # --- product usage levels (probabilities follow the order of `feature_use`) ---
    'feature_1_usage': np.random.choice(feature_use, size=number_samples, p=[0.5, 0.3, 0.2]),
    'feature_2_usage': np.random.choice(feature_use, size=number_samples, p=[0.4, 0.4, 0.2]),
    'feature_3_usage': np.random.choice(feature_use, size=number_samples, p=[0.3, 0.5, 0.2]),

    # --- target: 0/1 churn flag drawn independently of the other columns ---
    'churned': np.random.binomial(n=1, p=0.17, size=number_samples),
}

In [14]:
# Materialise the generated columns as a DataFrame and add derived features.
df_customers = pd.DataFrame(customer_data)

# Total ARR is the sum of the cloud and on-prem components.
df_customers['amount_ARR_total'] = df_customers['amount_ARR_cloud'] + df_customers['amount_ARR_on_prem']

# Fixed "as of" date for the age calculation. This used to be
# pd.Timestamp.now(), which made customer_age change on every re-run and so
# broke reproducibility. 2025-11-15 reproduces the customer_age values in the
# saved output of this cell; change it deliberately if another snapshot is wanted.
snapshot_date = pd.Timestamp('2025-11-15')

# Customer tenure in approximate months: whole 30-day periods between sign-up
# and the snapshot date.
df_customers['customer_age'] = (snapshot_date - df_customers['customer_sign_up_date']).dt.days // 30
df_customers.head(20)

Unnamed: 0,customer_id,customer_size,annueal_revenue,industry,c_level_engagement,open_deals,customer_sign_up_date,next_renewal_date,amount_ARR_cloud,amount_ARR_on_prem,last_qbr_date,feature_1_usage,feature_2_usage,feature_3_usage,churned,amount_ARR_total,customer_age
0,1,761,843704.79,Manufacturing,Low,0,2014-10-09,2023-08-01,49285.05,35457.18,2019-11-26,low,high,high,1,84742.23,135
1,2,1369,1028642.62,Technology,Low,0,2012-02-16,2025-04-16,57604.95,26313.1,2020-05-05,medium,medium,low,0,83918.05,167
2,3,1283,1215118.98,Healthcare,Low,1,2020-04-22,2024-03-29,62109.51,32946.16,2020-08-05,low,medium,low,0,95055.67,67
3,4,1587,1062857.86,Technology,Medium,1,2015-01-05,2023-12-11,60175.79,15696.08,2022-03-24,high,high,medium,0,75871.87,132
4,5,2321,777766.94,Technology,Medium,0,2022-01-18,2024-03-15,38102.83,29842.12,2021-11-05,medium,medium,low,0,67944.95,46
5,6,1673,931130.82,Manufacturing,Low,0,2017-04-26,2023-08-15,42222.38,43695.51,2022-11-08,medium,medium,high,0,85917.89,104
6,7,2183,889694.52,Food and Beverage,Low,0,2015-05-12,2025-05-06,48333.94,25844.21,2019-01-17,low,medium,medium,0,74178.15,128
7,8,1642,1010587.89,Government,Medium,0,2010-12-29,2024-05-30,49233.09,29186.09,2022-05-17,low,medium,medium,0,78419.18,181
8,9,1342,1266090.33,Energy,Low,0,2019-07-23,2023-09-15,50303.73,38555.3,2022-02-01,medium,high,low,1,88859.03,76
9,10,1586,970223.44,Healthcare,Medium,0,2014-09-11,2024-01-22,61195.92,33767.81,2019-10-24,low,low,medium,0,94963.73,136
