In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_ind

In [2]:
# Seed NumPy's global random generator so every random draw below is repeatable.
np.random.seed(seed=42)

In [3]:
# Number of synthetic users to generate.
n_users = 10_000

Generate a DataFrame of synthetic user characteristics:

- user_id
- age (years old)
- income (user income)
- page_views (number of pages viewed)
- sessions (number of sessions)
- received_push (whether the user received a push notification: 1 = yes, 0 = no)
- conversion (whether the user converted to a purchase: the target variable)

In [4]:
# Synthetic user table with one row per user.
# The columns are drawn in the order listed, which fixes how the seeded random
# stream is consumed. The "conversion" target is not created here; it is
# derived from page_views in a later cell.
data = pd.DataFrame(
    dict(
        user_id=np.arange(n_users),
        age=np.random.randint(18, 60, size=n_users),
        income=np.random.randint(20000, 100000, size=n_users),
        page_views=np.random.randint(1, 50, size=n_users),
        sessions=np.random.randint(1, 20, size=n_users),
        # About 60% of users are flagged as having received a push.
        received_push=np.random.choice([0, 1], size=n_users, p=[0.4, 0.6]),
    )
)

Let's introduce an imbalance between the user groups: users who received push notifications get a greater number of viewed pages. (Only `page_views` is shifted in the code below; `income` is left as generated.)

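Conversion depends on the number of pages viewed: the conversion probability is a 5% base rate plus 1 percentage point for every 10 page views, capped at 100%.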

In [5]:
# Give users who received a push a random number of extra page views (0 to 49).
# NOTE: this updates `data` in place, so re-running the cell adds another
# increment on top of the previous one.
push_mask = data["received_push"] == 1
extra_views = np.random.randint(0, 50, size=int(push_mask.sum()))
data.loc[push_mask, "page_views"] += extra_views

In [6]:
def calculate_conversion(page_views: int) -> int:
    """Draw a random 0/1 conversion outcome for a single user.

    The conversion probability is a base rate of 0.05 plus 0.01 for every
    10 page views, clamped to the interval [0, 1].

    Args:
        page_views: Number of pages the user viewed.

    Returns:
        1 if the simulated user converted, 0 otherwise (a NumPy integer
        scalar, as produced by ``np.random.choice``).
    """
    base_conversion_rate = 0.05
    conversion_rate = base_conversion_rate + (page_views / 10) * 0.01
    # Clamp on both sides: np.random.choice rejects probabilities outside
    # [0, 1], which an out-of-range page_views value would otherwise produce.
    conversion_rate = min(max(conversion_rate, 0.0), 1.0)

    return np.random.choice([0, 1], p=[1 - conversion_rate, conversion_rate])

In [7]:
# One random conversion draw per user, based on that user's page_views.
conversion_draws = data["page_views"].apply(calculate_conversion)
data["conversion"] = conversion_draws

We split users into two groups depending on whether they received a push notification

In [8]:
# Independent copies, so later changes to either group do not touch `data`.
treated = data.loc[data["received_push"].eq(1)].copy()  # received a push
control = data.loc[data["received_push"].eq(0)].copy()  # did not receive a push

Estimate the target variable (conversion rate) in each group before balancing the features

In [9]:
# Number of users in the treated group.
treated.shape[0]

5973

In [10]:
# Number of users in the control group.
control.shape[0]

4027

In [11]:
# Raw (unadjusted) conversion rate of the treated group.
treated.loc[:, "conversion"].mean()

0.09409007199062448

In [12]:
# Raw (unadjusted) conversion rate of the control group.
control.loc[:, "conversion"].mean()

0.06977899180531413

In [13]:
# Two-sample t-test on the unbalanced groups (no adjustment for covariates).
treated_outcomes = treated["conversion"]
control_outcomes = control["conversion"]
stat, p_value = ttest_ind(treated_outcomes, control_outcomes)
print(f"T-stat: {stat:.3f}, P-value: {p_value:.3f}")

T-stat: 4.295, P-value: 0.000


Selecting Important Features for Balancing

In [14]:
# Covariates on which treated users are matched to control users.
features = [
    "age",
    "income",
    "page_views",
    "sessions",
]

In [15]:
# The scaler is fitted on the treated group only and then reused for the
# control group, so both groups are expressed in the same standardized units.
scaler = StandardScaler()
scaler.fit(treated[features])
treated_scaled = scaler.transform(treated[features])
control_scaled = scaler.transform(control[features])

In [16]:
# Index the control users, then look up the single closest control user
# (Manhattan distance in scaled feature space) for every treated user.
knn = NearestNeighbors(n_neighbors=1, metric="manhattan")
knn.fit(control_scaled)
distances, indices = knn.kneighbors(treated_scaled)

In [17]:
# Collapse the neighbour index array to 1-D positional indices into `control`.
matched_control_indices = indices.reshape(-1).copy()
# One matched control row per treated user. Nothing here prevents the same
# control row from being picked for several treated users, so this is
# matching with replacement.
matched_control = control.iloc[matched_control_indices].copy()

In [18]:
# Report both group sizes after matching, one per line.
print(
    f"Size test group: {len(treated)}",
    f"Size control group: {len(matched_control)}",
    sep="\n",
)

Size test group: 5973
Size control group: 5973


In [19]:
# Mean of the 0/1 conversion flag = conversion rate of each group after matching.
matched_control_conversion_rate = matched_control.loc[:, "conversion"].mean()
treated_conversion_rate = treated.loc[:, "conversion"].mean()

In [20]:
# Show the post-matching conversion rates to five decimal places, one per line.
print(
    f"Conversion in test group: {treated_conversion_rate:.5f}",
    f"Conversion in control group: {matched_control_conversion_rate:.5f}",
    sep="\n",
)

Conversion in test group: 0.09409
Conversion in control group: 0.08522


In [21]:
# Same two-sample t-test as before balancing, now against the matched controls.
# NOTE(review): ttest_ind treats the two samples as independent; confirm that
# this is acceptable for a matched control sample.
treated_outcomes = treated["conversion"]
matched_outcomes = matched_control["conversion"]
stat, p_value = ttest_ind(treated_outcomes, matched_outcomes)
print(f"T-stat: {stat:.3f}, P-value: {p_value:.3f}")

T-stat: 1.697, P-value: 0.090
