### Generate Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as OneHotEncoder, OrdinalEncoder, StandardScaler
import sklearn.model_selection as train_test_split
import sklearn.linear_model as LogisticRegression
import sklearn.metrics as classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score


np.random.seed(42)

number_samples = 3000

# categorical features
industry = ['Automive', 'Finance', 'Healthcare', 'Retail', 'Technology', 'Government', 'Education', 'Manufacturing', 'Food and Beverage', 'Energy', 'Business Services']
c_level_engagement = ['Low', 'Medium', 'High']
feature_use = ['low', 'medium', 'high']

def random_dates(start, end, n):
    business_days = pd.date_range(start=start, end=end, freq='B')
    return np.random.choice(business_days, size=n, replace=True)

random_dates(pd.to_datetime('2015-01-01'), pd.to_datetime('2023-01-01'), 5)  # Example usage

array(['2018-04-19T00:00:00.000000000', '2019-12-18T00:00:00.000000000',
       '2019-05-02T00:00:00.000000000', '2019-03-14T00:00:00.000000000',
       '2021-04-13T00:00:00.000000000'], dtype='datetime64[ns]')

In [9]:
customer_data = {
    'customer_id': np.arange(1, number_samples + 1),
    'customer_size': np.random.normal(1500,250, number_samples).astype(int),
    'annueal_revenue': np.random.normal(1000000, 200000, number_samples).round(2),
    'industry': np.random.choice(industry, number_samples),
    'c_level_engagement': np.random.choice(c_level_engagement, number_samples, p=[0.6, 0.2, 0.2]),
    'open_deals': np.random.binomial(1, 0.2, number_samples),
    'customer_sign_up_date': random_dates(pd.to_datetime('2010-01-01'), pd.to_datetime('2022-12-31'), number_samples),
    'amount_ARR_cloud': np.random.normal(50000, 10000, number_samples).round(2),
    'amount_ARR_on_prem': np.random.normal(30000, 8000, number_samples).round(2),
    'feature_1_usage': np.random.choice(feature_use, number_samples, p=[0.5, 0.3, 0.2]),
    'feature_2_usage': np.random.choice(feature_use, number_samples, p=[0.4, 0.4, 0.2]),
    'feature_3_usage': np.random.choice(feature_use, number_samples, p=[0.3, 0.5, 0.2]),
    'churned': np.random.binomial(1, 0.17, number_samples)
}

In [10]:
df_customers = pd.DataFrame(customer_data)
df_customers['next_renewal_date'] = df_customers['customer_sign_up_date'] + pd.DateOffset(years=3)
df_customers['last_qbr_date'] = df_customers.apply(lambda df: random_dates(df['customer_sign_up_date'], 
                                                            df['next_renewal_date'], 1)[0], 
                                                            axis=1)
df_customers['amount_ARR_total'] = df_customers['amount_ARR_cloud'] + df_customers['amount_ARR_on_prem']
df_customers['customer_age'] = (pd.Timestamp.now() - df_customers['customer_sign_up_date']).dt.days // 30
df_customers.head()

Unnamed: 0,customer_id,customer_size,annueal_revenue,industry,c_level_engagement,open_deals,customer_sign_up_date,amount_ARR_cloud,amount_ARR_on_prem,feature_1_usage,feature_2_usage,feature_3_usage,churned,next_renewal_date,last_qbr_date,amount_ARR_total,customer_age
0,1,1270,1197736.11,Technology,Low,0,2022-09-07,20461.77,23748.19,medium,medium,medium,0,2025-09-07,2023-09-28,44209.96,38
1,2,1468,1063621.01,Food and Beverage,High,0,2016-05-24,44768.96,31145.7,high,low,medium,0,2019-05-24,2018-04-12,75914.66,115
2,3,997,1086966.24,Government,Low,0,2015-03-11,41334.29,38604.76,low,medium,medium,0,2018-03-11,2016-02-12,79939.05,130
3,4,1376,935449.79,Government,High,0,2017-10-20,53490.12,32514.31,medium,medium,medium,0,2020-10-20,2019-09-03,86004.43,98
4,5,1598,1135527.94,Business Services,Low,0,2021-01-15,82857.24,21110.68,high,low,high,0,2024-01-15,2022-02-04,103967.92,58


In [11]:
df_customers[['customer_sign_up_date','next_renewal_date','last_qbr_date']].head()

Unnamed: 0,customer_sign_up_date,next_renewal_date,last_qbr_date
0,2022-09-07,2025-09-07,2023-09-28
1,2016-05-24,2019-05-24,2018-04-12
2,2015-03-11,2018-03-11,2016-02-12
3,2017-10-20,2020-10-20,2019-09-03
4,2021-01-15,2024-01-15,2022-02-04


# EDA

In [12]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   customer_id            3000 non-null   int32         
 1   customer_size          3000 non-null   int32         
 2   annueal_revenue        3000 non-null   float64       
 3   industry               3000 non-null   object        
 4   c_level_engagement     3000 non-null   object        
 5   open_deals             3000 non-null   int32         
 6   customer_sign_up_date  3000 non-null   datetime64[ns]
 7   amount_ARR_cloud       3000 non-null   float64       
 8   amount_ARR_on_prem     3000 non-null   float64       
 9   feature_1_usage        3000 non-null   object        
 10  feature_2_usage        3000 non-null   object        
 11  feature_3_usage        3000 non-null   object        
 12  churned                3000 non-null   int32         
 13  nex

In [13]:
df_customers.describe()

Unnamed: 0,customer_id,customer_size,annueal_revenue,open_deals,customer_sign_up_date,amount_ARR_cloud,amount_ARR_on_prem,churned,next_renewal_date,last_qbr_date,amount_ARR_total,customer_age
count,3000.0,3000.0,3000.0,3000.0,3000,3000.0,3000.0,3000.0,3000,3000,3000.0,3000.0
mean,1500.5,1509.700333,999851.9,0.205333,2016-08-01 11:29:16.800000,50197.257,30156.464977,0.168,2019-08-02 05:43:40.800000,2018-02-10 08:54:14.400000,80353.721977,112.704333
min,1.0,653.0,332100.0,0.0,2010-01-04 00:00:00,13648.0,-851.0,0.0,2013-01-04 00:00:00,2010-03-04 00:00:00,28305.34,35.0
25%,750.75,1336.0,865477.3,0.0,2013-05-20 18:00:00,43321.78,24762.415,0.0,2016-05-20 18:00:00,2014-10-26 06:00:00,71841.185,72.0
50%,1500.5,1513.0,998384.9,0.0,2016-07-10 00:00:00,50171.265,30186.515,0.0,2019-07-10 00:00:00,2018-03-03 12:00:00,80336.925,113.5
75%,2250.25,1684.0,1137634.0,0.0,2019-12-06 00:00:00,57070.4375,35637.13,0.0,2022-12-06 00:00:00,2021-05-07 06:00:00,89035.0925,152.0
max,3000.0,2373.0,1810510.0,1.0,2022-12-30 00:00:00,94790.84,61538.65,1.0,2025-12-30 00:00:00,2025-12-15 00:00:00,126946.23,193.0
std,866.169729,254.689572,199391.1,0.404012,,10085.555741,7997.814602,0.373929,,,12844.149573,45.892515


In [21]:
df_customers['c_level_engagement'] = pd.Categorical(df_customers['c_level_engagement'], 
                                                    categories=['Low', 'Medium', 'High'], 
                                                    ordered=True)

In [25]:
df_customers['c_level_engagement'].cat.codes.mean()

0.612

In [26]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   customer_id            3000 non-null   int32         
 1   customer_size          3000 non-null   int32         
 2   annueal_revenue        3000 non-null   float64       
 3   industry               3000 non-null   object        
 4   c_level_engagement     3000 non-null   category      
 5   open_deals             3000 non-null   int32         
 6   customer_sign_up_date  3000 non-null   datetime64[ns]
 7   amount_ARR_cloud       3000 non-null   float64       
 8   amount_ARR_on_prem     3000 non-null   float64       
 9   feature_1_usage        3000 non-null   object        
 10  feature_2_usage        3000 non-null   object        
 11  feature_3_usage        3000 non-null   object        
 12  churned                3000 non-null   int32         
 13  nex

In [14]:
df_customers.isna().sum()

customer_id              0
customer_size            0
annueal_revenue          0
industry                 0
c_level_engagement       0
open_deals               0
customer_sign_up_date    0
amount_ARR_cloud         0
amount_ARR_on_prem       0
feature_1_usage          0
feature_2_usage          0
feature_3_usage          0
churned                  0
next_renewal_date        0
last_qbr_date            0
amount_ARR_total         0
customer_age             0
dtype: int64

In [18]:
df_customers.duplicated().sum()

0