In [51]:
#Libraries
import pandas as pd
import numpy as np

In [52]:
# Decide whether churn should be tracked by LOGO (customer count) or by ACV
# (contract value): for each segment, compute the churn rate within the segment
# and the segment's share of all churn, under both definitions.

df = pd.read_csv("dataset1.csv")

# Column to segment by. Other columns worth trying: "product_tier",
# "company_size_bucket", "region", "acquisition_channel", "is_eu", "industry".
SEG_COL = "sales_segment"

ID_COL    = "customer_id"
CHURN_COL = "is_churned"
ACV_COL   = "annual_contract_value"

# Normalise inputs: missing segments get their own "Unknown" bucket; churn flags
# and ACV values that are missing or unparseable are treated as 0.
df[SEG_COL] = df[SEG_COL].fillna("Unknown")
df[CHURN_COL] = pd.to_numeric(df[CHURN_COL], errors="coerce").fillna(0).astype(int)
df[ACV_COL] = pd.to_numeric(df[ACV_COL], errors="coerce").fillna(0.0)

# Overall churn totals: the denominators of the per-segment "share" columns.
churned = df[df[CHURN_COL] == 1].copy()
total_churned_logos = churned[ID_COL].nunique()
total_churned_acv   = churned[ACV_COL].sum()

# Per-segment totals over ALL customers.
summary = (
    df.groupby(SEG_COL, dropna=False)
      .agg(customers=(ID_COL, "nunique"),
           churned_customers=(CHURN_COL, "sum"),
           total_acv=(ACV_COL, "sum"))
      .reset_index()
)

# Per-segment ACV over churned customers only.
churn_by_seg = (
    churned.groupby(SEG_COL, dropna=False)
           .agg(churned_acv=(ACV_COL, "sum"))
           .reset_index()
)

# Left merge keeps segments without any churn; their churned ACV is 0.
summary = summary.merge(churn_by_seg, on=SEG_COL, how="left").fillna({"churned_acv": 0.0})

# Rates: how much of the segment itself churned.
summary["logo_churn_rate"] = summary["churned_customers"] / summary["customers"]
# A segment with zero total ACV has an undefined rate. np.nan (rather than
# pd.NA) keeps the column numeric so sorting and formatting are unaffected.
summary["acv_churn_rate"]  = summary["churned_acv"] / summary["total_acv"].replace(0, np.nan)

# Shares: how much of all churn the segment accounts for. The `else 1` guards
# against division by zero when nothing churned at all.
summary["logo_churn_share"] = summary["churned_customers"] / (total_churned_logos if total_churned_logos else 1)
summary["acv_churn_share"]  = summary["churned_acv"] / (total_churned_acv if total_churned_acv else 1)

logo_top = summary.sort_values("logo_churn_share", ascending=False).head(10)
acv_top  = summary.sort_values("acv_churn_share",  ascending=False).head(10)

print("Top by LOGO churn share:")
print(logo_top[[SEG_COL, "customers", "churned_customers", "logo_churn_share", "logo_churn_rate"]].to_string(index=False))

print("\nTop by ACV churn share:")
print(acv_top[[SEG_COL, "total_acv", "churned_acv", "acv_churn_share", "acv_churn_rate"]].to_string(index=False))

# Select the column before the row so the segment label keeps its own dtype
# (taking the whole row first can upcast a numeric label such as is_eu).
top_logo_seg = logo_top[SEG_COL].iloc[0] if len(logo_top) else None
top_acv_seg  = acv_top[SEG_COL].iloc[0]  if len(acv_top)  else None
print("\nDecision helper:")
print("Top logo segment:", top_logo_seg)
print("Top ACV segment :", top_acv_seg)


Top by LOGO churn share:
sales_segment  customers  churned_customers  logo_churn_share  logo_churn_rate
   SMB_Inside       1320                217          0.596154         0.164394
    SMB_Field        574                 97          0.266484         0.168990
    MidMarket        632                 29          0.079670         0.045886
   Enterprise        474                 21          0.057692         0.044304

Top by ACV churn share:
sales_segment   total_acv  churned_acv  acv_churn_share  acv_churn_rate
   Enterprise 84265245.37   3666669.15         0.701746        0.043513
    MidMarket 15870773.45    743722.20         0.142337        0.046861
   SMB_Inside  5436775.50    595852.90         0.114037        0.109597
    SMB_Field  2260570.77    218822.49         0.041879        0.096800

Decision helper:
Top logo segment: SMB_Inside
Top ACV segment : Enterprise


In [55]:
# Dataset 1 cleaning and preprocessing
df1 = pd.read_csv("dataset1.csv")

# Checking missingness: absolute counts and percentage of rows per column,
# largest first. missing_rates is not printed here; it is kept for inspection.
missing_counts = df1.isna().sum().sort_values(ascending=False)
missing_rates = (df1.isna().mean()*100).round(2).sort_values(ascending=False)

print("Missing counts (top):")
print(missing_counts.head(5))

# Observed on the current file: contract_end_date has 1428 missing values and
# industry has 862.

# Sample of the rows that lack an industry. Bound to a name because a bare
# expression in the middle of a cell is evaluated and discarded without being
# rendered; put the name on the last line of a cell to view it.
industry_missing_preview = df1.loc[
    df1["industry"].isna(),
    ["customer_id","company_name","country","region","industry"],
].head(20)

# Keep a flag for the originally-missing rows before filling, so the imputed
# "Unknown" bucket can still be told apart from real industries.
df1["industry_is_missing"] = df1["industry"].isna()
df1["industry"] = df1["industry"].fillna("Unknown")

print("\nIndustry missing after fix:", df1["industry"].isna().sum())
print(df1["industry"].value_counts().head(10))

Missing counts (top):
contract_end_date           1428
industry                     862
customer_id                    0
sales_segment                  0
initial_onboarding_score       0
dtype: int64

Industry missing after fix: 0
industry
Unknown                  862
Ecommerce                278
Hospitality              275
Healthcare               273
Logistics                268
Professional Services    264
Wholesale                261
Manufacturing            261
Retail                   258
Name: count, dtype: int64


In [56]:
# fixing company_size_bucket: common Excel auto-date formatting issues in this file
# ("1-10" was turned into "10-Jan" and "11-50" into "Nov-50").
print(df1["company_size_bucket"].value_counts())

SIZE_BUCKET_FIX = {"10-Jan": "1-10", "Nov-50": "11-50"}
VALID_BUCKETS = {"1-10","11-50","51-200","201-1000","1000+"}

# Snapshot the raw values only once: re-running this cell must not overwrite
# the snapshot with already-repaired values (which would report 0 fixed rows).
if "company_size_bucket_original" not in df1.columns:
    df1["company_size_bucket_original"] = df1["company_size_bucket"]
df1["company_size_bucket"] = df1["company_size_bucket"].replace(SIZE_BUCKET_FIX)

# True where a non-missing original value was changed by the mapping above.
df1["company_size_bucket_was_fixed"] = (
    df1["company_size_bucket_original"].notna()
    & (df1["company_size_bucket_original"] != df1["company_size_bucket"])
)

print("\nFixed rows:", int(df1["company_size_bucket_was_fixed"].sum()))
print(df1["company_size_bucket"].value_counts())

# Validate the repair: every non-missing bucket must now be a known label.
# Runs after the prints so the value counts are visible if this fails.
unexpected_buckets = set(df1["company_size_bucket"].dropna().unique()) - VALID_BUCKETS
assert not unexpected_buckets, f"Unexpected company_size_bucket values: {unexpected_buckets}"


company_size_bucket
10-Jan      1035
Nov-50       859
51-200       632
201-1000     307
1000+        167
Name: count, dtype: int64

Fixed rows: 1894
company_size_bucket
1-10        1035
11-50        859
51-200       632
201-1000     307
1000+        167
Name: count, dtype: int64


In [57]:
# ---- A1) Parse dates
# Values that cannot be parsed become NaT (errors="coerce") and are therefore
# counted as missing below.
df1["contract_start_dt"] = pd.to_datetime(df1["contract_start_date"], errors="coerce")
df1["contract_end_dt"]   = pd.to_datetime(df1["contract_end_date"], errors="coerce")

print("Missing start dates:", df1["contract_start_dt"].isna().sum())
print("Missing end dates:", df1["contract_end_dt"].isna().sum())

# Sanity check: a contract cannot end before it starts.
df1["end_before_start"] = (
    df1["contract_start_dt"].notna()
    & df1["contract_end_dt"].notna()
    & (df1["contract_end_dt"] < df1["contract_start_dt"])
)
print("End before start:", int(df1["end_before_start"].sum()))

df1["has_missing_contract_end_date"] = df1["contract_end_dt"].isna()

# How a missing end date lines up with the renewal and churn flags.
print(pd.crosstab(df1["renewed_flag"], df1["has_missing_contract_end_date"]))
print(pd.crosstab(df1["is_churned"], df1["has_missing_contract_end_date"]))

# Contract term in days (only where end exists; NaN otherwise)
df1["contract_term_days"] = (df1["contract_end_dt"] - df1["contract_start_dt"]).dt.days

print("\nContract term (days) stats:")
print(df1["contract_term_days"].describe())

print("\nMost common terms:", df1["contract_term_days"].value_counts().head(10))

df1["violation_churn_missing_end_date"] = (df1["is_churned"] == 1) & df1["contract_end_dt"].isna()

# Terms of at least ~12 months count as a full term, not an early termination.
FULL_TERM_MIN_DAYS = 365

# If churned but term >= ~12 months, that contradicts "early termination"
df1["violation_churn_not_early_term"] = (
    (df1["is_churned"] == 1)
    & df1["contract_term_days"].notna()
    & (df1["contract_term_days"] >= FULL_TERM_MIN_DAYS)
)

print("Churned but missing end date:", int(df1["violation_churn_missing_end_date"].sum()))
print("Churned but not early termination:", int(df1["violation_churn_not_early_term"].sum()))

# Rows behind the "not early termination" violation. Bound to a name because a
# bare expression in the middle of a cell is evaluated and discarded without
# being rendered; put the name on the last line of a cell to view it.
churn_not_early_rows = df1.loc[
    df1["violation_churn_not_early_term"],
    ["customer_id","contract_start_date","contract_end_date","contract_term_days","renewed_flag","is_churned"],
].head(20)

# print violation counts with renewed_flag==0 and is_churned==1
# (the violation flag already requires is_churned == 1, so only renewed_flag
# needs to be filtered on here)
print("Violation counts with renewed_flag==0 and is_churned==1:",
      int((df1["violation_churn_not_early_term"] & (df1["renewed_flag"] == 0)).sum())
     )

Missing start dates: 0
Missing end dates: 1428
End before start: 0
has_missing_contract_end_date  False  True 
renewed_flag                               
0                               1572      0
1                                  0   1428
has_missing_contract_end_date  False  True 
is_churned                                 
0                               1208   1428
1                                364      0

Contract term (days) stats:
count    1572.000000
mean      299.603053
std        99.361937
min        91.000000
25%       183.000000
50%       366.000000
75%       366.000000
max       366.000000
Name: contract_term_days, dtype: float64

Most common terms: contract_term_days
366.0    1068
183.0     372
91.0      132
Name: count, dtype: int64
Churned but missing end date: 0
Churned but not early termination: 69
Violation counts with renewed_flag==0 and is_churned==1: 69


In [58]:
# Categorical, range and uniqueness checks. Each check stores a boolean flag
# column on df1 and prints the number of offending rows.

# Acquisition channel must be one of the known labels (missing values are not
# counted as invalid here).
valid_channels = {"Inbound","Outbound","Partner","SelfServe"}
df1["acquisition_channel_invalid"] = df1["acquisition_channel"].notna() & ~df1["acquisition_channel"].isin(valid_channels)
print("Invalid acquisition_channel:", int(df1["acquisition_channel_invalid"].sum()))

# Offending rows, if any. Bound to a name because a bare expression in the
# middle of a cell is evaluated and discarded without being rendered; put the
# name on the last line of a cell to view it.
invalid_channel_rows = df1.loc[df1["acquisition_channel_invalid"], ["customer_id","acquisition_channel"]].head(10)

# EU consistency check: is_eu == 1 should coincide with region == "Europe".
# Rows with a missing region are skipped.
df1["eu_region_mismatch"] = (
    (df1["region"].notna()) &
    (((df1["is_eu"] == 1) & (df1["region"] != "Europe")) |
     ((df1["is_eu"] == 0) & (df1["region"] == "Europe")))
)
print("EU-region mismatches:", int(df1["eu_region_mismatch"].sum()))

# Numeric range checks; missing values are not flagged.
# ACV must be strictly positive, discount within [0, 1], onboarding score within [0, 10].
df1["acv_invalid"] = df1["annual_contract_value"].notna() & (df1["annual_contract_value"] <= 0)
df1["discount_invalid"] = df1["discount_pct"].notna() & ~df1["discount_pct"].between(0, 1)
df1["onboarding_invalid"] = df1["initial_onboarding_score"].notna() & ~df1["initial_onboarding_score"].between(0, 10)

print("ACV invalid:", int(df1["acv_invalid"].sum()))
print("Discount invalid:", int(df1["discount_invalid"].sum()))
print("Onboarding invalid:", int(df1["onboarding_invalid"].sum()))

# Repeated customer_id values (every occurrence after the first is counted).
print("Duplicate customer_id:", int(df1["customer_id"].duplicated().sum()))


Invalid acquisition_channel: 0
EU-region mismatches: 0
ACV invalid: 0
Discount invalid: 0
Onboarding invalid: 0
Duplicate customer_id: 0


In [63]:
# Drop the temporary audit/flag columns created by the validation cells above,
# leaving the source columns plus the derived features that are kept.
cols_to_drop = [
    "company_size_bucket_original",
    "company_size_bucket_was_fixed",
    "end_before_start",
    "violation_churn_missing_end_date",
    "acquisition_channel_invalid",
    "eu_region_mismatch",
    "acv_invalid",
    "discount_invalid",
    "onboarding_invalid",
]

# errors="ignore" skips names that are not present, so the cell can be re-run
# safely after the columns have already been removed.
df1 = df1.drop(columns=cols_to_drop, errors="ignore")

df1.columns

Index(['customer_id', 'company_name', 'country', 'region', 'is_eu', 'industry',
       'company_size_bucket', 'annual_contract_value', 'product_tier',
       'sales_segment', 'acquisition_channel', 'contract_start_date',
       'contract_end_date', 'renewed_flag', 'discount_pct',
       'initial_onboarding_score', 'is_churned', 'industry_is_missing',
       'contract_start_dt', 'contract_end_dt', 'has_missing_contract_end_date',
       'contract_term_days', 'violation_churn_not_early_term'],
      dtype='object')