In [14]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [15]:
class Config:
    data_dir = Path("../data/raw")
    processed_dir = Path("../data/processed")
    features_dir = Path("../data/features")


for d in [Config.processed_dir, Config.features_dir]:
    d.mkdir(parents=True, exist_ok=True)

In [16]:
customers = pd.read_csv(Config.data_dir / "customers.csv")
noncustomers = pd.read_csv(Config.data_dir / "noncustomers.csv")
usage = pd.read_csv(Config.data_dir / "usage_actions.csv")

print(f"Customers: {customers.shape}, Noncustomers: {noncustomers.shape}, Usage: {usage.shape}")

customers["is_customer"] = 1
noncustomers["is_customer"] = 0

base = pd.concat([customers, noncustomers], axis=0).reset_index(drop=True)
print("Unified dataset:", base.shape)
base.head()

Customers: (200, 6), Noncustomers: (5003, 4), Usage: (25387, 10)
Unified dataset: (5203, 7)


Unnamed: 0,CLOSEDATE,MRR,ALEXA_RANK,EMPLOYEE_RANGE,INDUSTRY,id,is_customer
0,2019-06-20,290.0,309343.0,201 to 1000,Other,199,1
1,2019-03-22,392.54,16000001.0,51 to 200,,147,1
2,2020-01-08,-61.15,60502.0,1001 to 10000,HIGHER_EDUCATION,118,1
3,2019-09-19,400.0,3575321.0,201 to 1000,CONSUMER_SERVICES,126,1
4,2019-01-27,209.98,273063.0,26 to 50,Technology - Software,174,1


3 Summarize and Clean Raw Data

In [17]:
dup_customers = customers["id"].duplicated().sum()
dup_noncustomers = noncustomers["id"].duplicated().sum()
print(f"Duplicate customer IDs: {dup_customers}, Duplicate non-customer IDs: {dup_noncustomers}")

base = base.drop_duplicates(subset="id", keep="first")

for col in ["INDUSTRY", "EMPLOYEE_RANGE"]:
    if col in base.columns:
        base[col] = base[col].fillna("Unknown")

if "MRR" in base.columns:
    base["MRR"] = base["MRR"].fillna(0).clip(lower=0)

print("Cleaned and unified static dataset")
base.head()

Duplicate customer IDs: 0, Duplicate non-customer IDs: 3
Cleaned and unified static dataset


Unnamed: 0,CLOSEDATE,MRR,ALEXA_RANK,EMPLOYEE_RANGE,INDUSTRY,id,is_customer
0,2019-06-20,290.0,309343.0,201 to 1000,Other,199,1
1,2019-03-22,392.54,16000001.0,51 to 200,Unknown,147,1
2,2020-01-08,0.0,60502.0,1001 to 10000,HIGHER_EDUCATION,118,1
3,2019-09-19,400.0,3575321.0,201 to 1000,CONSUMER_SERVICES,126,1
4,2019-01-27,209.98,273063.0,26 to 50,Technology - Software,174,1


In [18]:
usage["WHEN_TIMESTAMP"] = pd.to_datetime(usage["WHEN_TIMESTAMP"])

current_date = usage["WHEN_TIMESTAMP"].max()
usage["days_since_action"] = (current_date - usage["WHEN_TIMESTAMP"]).dt.days

usage_summary = (
    usage.groupby("id")
    .agg(
        {
            "ACTIONS_CRM_CONTACTS": "sum",
            "ACTIONS_CRM_COMPANIES": "sum",
            "ACTIONS_CRM_DEALS": "sum",
            "ACTIONS_EMAIL": "sum",
            "USERS_CRM_CONTACTS": "mean",
            "USERS_CRM_COMPANIES": "mean",
            "USERS_CRM_DEALS": "mean",
            "USERS_EMAIL": "mean",
            "days_since_action": "min",  # recency → smaller = more recent
        }
    )
    .reset_index()
)

print("Aggregated usage features:", usage_summary.shape)
display(usage_summary.head())

Aggregated usage features: (3569, 10)


Unnamed: 0,id,ACTIONS_CRM_CONTACTS,ACTIONS_CRM_COMPANIES,ACTIONS_CRM_DEALS,ACTIONS_EMAIL,USERS_CRM_CONTACTS,USERS_CRM_COMPANIES,USERS_CRM_DEALS,USERS_EMAIL,days_since_action
0,1,14845,1889,5253,70,4.985714,3.214286,3.8,0.285714,0
1,2,4970,798,790,247,1.931507,0.863014,1.013699,0.479452,0
2,3,43467,8219,18797,498,16.581081,10.635135,9.959459,1.459459,0
3,4,54329,77,867,0,8.0,0.486111,2.666667,0.0,7
4,5,5172,1450,3551,70,5.347222,2.444444,4.111111,0.375,0


In [19]:
final_df = base.merge(usage_summary, on="id", how="left")

usage_num_cols = [
    "ACTIONS_CRM_CONTACTS",
    "ACTIONS_CRM_COMPANIES",
    "ACTIONS_CRM_DEALS",
    "ACTIONS_EMAIL",
    "USERS_CRM_CONTACTS",
    "USERS_CRM_COMPANIES",
    "USERS_CRM_DEALS",
    "USERS_EMAIL",
]
final_df[usage_num_cols] = final_df[usage_num_cols].fillna(0)

max_days = usage["days_since_action"].max()
final_df["days_since_action"] = final_df["days_since_action"].fillna(max_days + 1)

print("Merged dataset with usage:")
display(final_df.head())

Merged dataset with usage:


Unnamed: 0,CLOSEDATE,MRR,ALEXA_RANK,EMPLOYEE_RANGE,INDUSTRY,id,is_customer,ACTIONS_CRM_CONTACTS,ACTIONS_CRM_COMPANIES,ACTIONS_CRM_DEALS,ACTIONS_EMAIL,USERS_CRM_CONTACTS,USERS_CRM_COMPANIES,USERS_CRM_DEALS,USERS_EMAIL,days_since_action
0,2019-06-20,290.0,309343.0,201 to 1000,Other,199,1,5438.0,24.0,125.0,2464.0,7.985714,0.314286,0.528571,7.528571,0.0
1,2019-03-22,392.54,16000001.0,51 to 200,Unknown,147,1,6069.0,2935.0,3461.0,108.0,4.65625,4.25,5.328125,0.671875,0.0
2,2020-01-08,0.0,60502.0,1001 to 10000,HIGHER_EDUCATION,118,1,65199.0,8855.0,21252.0,742.0,14.558824,9.779412,10.720588,2.338235,0.0
3,2019-09-19,400.0,3575321.0,201 to 1000,CONSUMER_SERVICES,126,1,8574.0,728.0,64.0,94.0,2.442623,0.836066,0.360656,0.622951,0.0
4,2019-01-27,209.98,273063.0,26 to 50,Technology - Software,174,1,16964.0,5230.0,8360.0,372.0,8.898551,6.217391,4.797101,0.913043,0.0


In [20]:
final_df["total_actions"] = final_df[
    ["ACTIONS_CRM_CONTACTS", "ACTIONS_CRM_COMPANIES", "ACTIONS_CRM_DEALS", "ACTIONS_EMAIL"]
].sum(axis=1)

final_df["total_users"] = final_df[["USERS_CRM_CONTACTS", "USERS_CRM_COMPANIES", "USERS_CRM_DEALS", "USERS_EMAIL"]].sum(
    axis=1
)

final_df["email_action_ratio"] = final_df["ACTIONS_EMAIL"] / (final_df["total_actions"] + 1e-5)
final_df["deal_action_ratio"] = final_df["ACTIONS_CRM_DEALS"] / (final_df["total_actions"] + 1e-5)

final_df["log_alexa_rank"] = np.log1p(final_df["ALEXA_RANK"])
final_df["log_total_actions"] = np.log1p(final_df["total_actions"])

# Drop post-conversion / redundant columns (no target leakage!)
drop_cols = ["CLOSEDATE", "MRR", "ALEXA_RANK"]
final_df = final_df.drop(columns=drop_cols, errors="ignore")

print("Feature-engineered dataset:")
display(final_df.head())

Feature-engineered dataset:


Unnamed: 0,EMPLOYEE_RANGE,INDUSTRY,id,is_customer,ACTIONS_CRM_CONTACTS,ACTIONS_CRM_COMPANIES,ACTIONS_CRM_DEALS,ACTIONS_EMAIL,USERS_CRM_CONTACTS,USERS_CRM_COMPANIES,USERS_CRM_DEALS,USERS_EMAIL,days_since_action,total_actions,total_users,email_action_ratio,deal_action_ratio,log_alexa_rank,log_total_actions
0,201 to 1000,Other,199,1,5438.0,24.0,125.0,2464.0,7.985714,0.314286,0.528571,7.528571,0.0,8051.0,16.357143,0.306049,0.015526,12.642209,8.993676
1,51 to 200,Unknown,147,1,6069.0,2935.0,3461.0,108.0,4.65625,4.25,5.328125,0.671875,0.0,12573.0,14.90625,0.00859,0.275272,16.588099,9.439386
2,1001 to 10000,HIGHER_EDUCATION,118,1,65199.0,8855.0,21252.0,742.0,14.558824,9.779412,10.720588,2.338235,0.0,96048.0,37.397059,0.007725,0.221264,11.010448,11.472614
3,201 to 1000,CONSUMER_SERVICES,126,1,8574.0,728.0,64.0,94.0,2.442623,0.836066,0.360656,0.622951,0.0,9460.0,4.262295,0.009937,0.006765,15.089566,9.154933
4,26 to 50,Technology - Software,174,1,16964.0,5230.0,8360.0,372.0,8.898551,6.217391,4.797101,0.913043,0.0,30926.0,20.826087,0.012029,0.270323,12.517461,10.339385


In [21]:
num_cols = final_df.select_dtypes(include=np.number).columns.drop("is_customer")

corr = final_df[num_cols].corrwith(final_df["is_customer"]).abs().sort_values(ascending=False)
print("Correlations with is_customer (abs):")
display(corr)

group_means = final_df.groupby("is_customer")[num_cols].mean().diff().iloc[1].abs().sort_values(ascending=False)
print("Mean differences (customers - noncustomers, abs):")
display(group_means)

corr_matrix = final_df[num_cols].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr = [column for column in upper.columns if any(upper[column] > 0.9)]
print("Highly correlated features to drop:", high_corr)

final_df = final_df.drop(columns=high_corr, errors="ignore")

num_cols = final_df.select_dtypes(include=np.number).columns.drop("is_customer")
corr = final_df[num_cols].corrwith(final_df["is_customer"]).abs().sort_values(ascending=False)
group_means = final_df.groupby("is_customer")[num_cols].mean().diff().iloc[1].abs().sort_values(ascending=False)

rank_corr = corr.rank(ascending=False)
rank_mean = group_means.rank(ascending=False)
avg_rank = (rank_corr + rank_mean) / 2

top_numeric = avg_rank.sort_values().index[:15].tolist()
top_features = top_numeric + ["INDUSTRY", "EMPLOYEE_RANGE"]

print("Selected top features:")
print(top_features)

Correlations with is_customer (abs):


log_total_actions        0.595525
total_users              0.480855
USERS_CRM_DEALS          0.469252
USERS_EMAIL              0.404566
USERS_CRM_COMPANIES      0.395456
total_actions            0.379711
USERS_CRM_CONTACTS       0.360257
ACTIONS_EMAIL            0.349402
id                       0.333087
ACTIONS_CRM_CONTACTS     0.314380
ACTIONS_CRM_COMPANIES    0.279472
days_since_action        0.264800
deal_action_ratio        0.264672
ACTIONS_CRM_DEALS        0.252595
email_action_ratio       0.059706
log_alexa_rank           0.051519
dtype: float64

Mean differences (customers - noncustomers, abs):


total_actions            14604.560400
ACTIONS_CRM_CONTACTS      9392.042600
ACTIONS_CRM_DEALS         3136.067200
id                        2600.000000
ACTIONS_CRM_COMPANIES     1935.747600
days_since_action          298.799600
ACTIONS_EMAIL              140.703000
total_users                  9.295926
log_total_actions            6.677345
USERS_CRM_CONTACTS           4.329865
USERS_CRM_DEALS              2.650336
USERS_CRM_COMPANIES          1.889725
log_alexa_rank               0.558362
USERS_EMAIL                  0.426001
deal_action_ratio            0.200783
email_action_ratio           0.021737
Name: 1, dtype: float64

Highly correlated features to drop: ['total_actions']
Selected top features:
['log_total_actions', 'total_users', 'ACTIONS_CRM_CONTACTS', 'id', 'ACTIONS_EMAIL', 'USERS_CRM_DEALS', 'ACTIONS_CRM_COMPANIES', 'ACTIONS_CRM_DEALS', 'USERS_CRM_CONTACTS', 'USERS_CRM_COMPANIES', 'days_since_action', 'USERS_EMAIL', 'deal_action_ratio', 'log_alexa_rank', 'email_action_ratio', 'INDUSTRY', 'EMPLOYEE_RANGE']


In [22]:
X = final_df[top_features + ["id"]]  # keep id for later tracking
y = final_df["is_customer"]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, stratify=y_temp, test_size=0.5, random_state=42)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print("Train class balance:", np.bincount(y_train))

Train: (3640, 18), Val: (780, 18), Test: (780, 18)
Train class balance: [3500  140]


In [23]:
top_features = [c for c in top_features if c not in ["id", "is_customer"]]


class Preprocessor:
    def __init__(self, feature_names, full_df):
        # numeric vs categorical features based on dtypes
        numeric_features = [col for col in feature_names if full_df[col].dtype in [np.float64, np.int64]]
        categorical_features = [col for col in feature_names if col in ["INDUSTRY", "EMPLOYEE_RANGE"]]

        numeric_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]
        )
        categorical_transformer = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("encoder", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        self.preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ]
        )

    def fit_transform(self, X):
        # Drop id if present, just in case
        return self.preprocessor.fit_transform(X.drop(columns="id", errors="ignore"))

    def transform(self, X):
        return self.preprocessor.transform(X.drop(columns="id", errors="ignore"))

    def save(self, path):
        joblib.dump(self.preprocessor, path)


prep = Preprocessor(top_features, final_df)
X_train_processed = prep.fit_transform(X_train)
X_val_processed = prep.transform(X_val)
X_test_processed = prep.transform(X_test)

print("Preprocessed shapes:", X_train_processed.shape, X_val_processed.shape, X_test_processed.shape)

Preprocessed shapes: (3640, 174) (780, 174) (780, 174)


In [24]:
prep.save(Config.features_dir / "preprocessor.pkl")

sparse.save_npz(Config.features_dir / "X_train.npz", sparse.csr_matrix(X_train_processed))
sparse.save_npz(Config.features_dir / "X_val.npz", sparse.csr_matrix(X_val_processed))
sparse.save_npz(Config.features_dir / "X_test.npz", sparse.csr_matrix(X_test_processed))

y_train.to_csv(Config.features_dir / "y_train.csv", index=False)
y_val.to_csv(Config.features_dir / "y_val.csv", index=False)
y_test.to_csv(Config.features_dir / "y_test.csv", index=False)

print("✅ Clean features and splits saved to:", Config.features_dir)

✅ Clean features and splits saved to: ../data/features


In [25]:
logreg = LogisticRegression(class_weight="balanced", max_iter=1000)
logreg.fit(X_train_processed, y_train)

y_proba = logreg.predict_proba(X_val_processed)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_proba)
auc_pr = auc(recall, precision)
f1 = f1_score(y_val, (y_proba > 0.5).astype(int))

print(f"Sanity check – Val AUC-PR: {auc_pr:.3f}, F1: {f1:.3f}")

Sanity check – Val AUC-PR: 0.763, F1: 0.538
