In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from evidently import Report
from evidently.presets import DataDriftPreset



In [8]:
def generate_churn_data(n_rows=10000, seed=42):
    """Build a synthetic bank-customer churn dataset.

    Eight feature columns are drawn independently at random and a binary
    ``Exited`` label is derived from ``Age``, ``Balance`` and
    ``NumOfProducts`` plus Gaussian noise.

    Parameters
    ----------
    n_rows : int, default 10000
        Number of customers (rows) to generate.
    seed : int or None, default 42
        Seed handed to ``np.random.RandomState``. The default reproduces the
        values previously obtained via ``np.random.seed(42)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CreditScore``, ``Age``, ``Tenure``, ``Balance``,
        ``NumOfProducts``, ``HasCrCard``, ``IsActiveMember``,
        ``EstimatedSalary`` and the 0/1 target ``Exited``.
    """
    # A private generator keeps the function reproducible without reseeding
    # NumPy's process-wide RNG, so unrelated random code in the notebook is
    # not reset every time a dataset is generated.
    rng = np.random.RandomState(seed)

    # NOTE: the draw order below determines the generated values; keep it.
    # ``randint`` upper bounds are exclusive (e.g. Age is 18..89).
    data = pd.DataFrame({
        'CreditScore': rng.randint(300, 850, n_rows),
        'Age': rng.randint(18, 90, n_rows),
        'Tenure': rng.randint(0, 10, n_rows),
        'Balance': rng.uniform(0, 250000, n_rows),
        'NumOfProducts': rng.randint(1, 4, n_rows),
        'HasCrCard': rng.randint(0, 2, n_rows),
        'IsActiveMember': rng.randint(0, 2, n_rows),
        'EstimatedSalary': rng.uniform(10000, 400000, n_rows),
    })

    # Raw churn score: rises with age and balance, falls with product count.
    churn_prob = (data['Age'] * 0.01) + (data['Balance'] * 0.000001) - (data['NumOfProducts'] * 0.1)
    # Min-max scale the score to [0, 1] so the 0.6 threshold is meaningful.
    churn_prob = (churn_prob - churn_prob.min()) / (churn_prob.max() - churn_prob.min())
    # Add noise (sd 0.1) before thresholding so the label is not a
    # deterministic function of the features.
    data['Exited'] = (churn_prob + rng.normal(0, 0.1, n_rows) > 0.6).astype(int)

    return data

In [9]:
print("Generating Dataset")
df = generate_churn_data(20000)

# First half acts as the reference (training-time) sample; the second half is
# the "current" sample that receives the artificial drift below.
ref_data = df[:10000].copy()
curr_data = df[10000:].copy()


target = 'Exited'
features = [col for col in df.columns if col != target]

print("Training Model")
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(ref_data[features], ref_data[target])


# Scenario: A marketing campaign targeted University Students.
# Result: The new customers are much younger and have lower salaries.
#
# The drift is injected BEFORE scoring so that curr_data['prediction'] is the
# model's output for the drifted features stored in the same row. Scoring
# first would leave predictions that belong to the un-drifted features and
# would hide any prediction drift from the report.
# NOTE(review): curr_data['Exited'] is not recomputed here, so the labels
# still correspond to the un-shifted features.
curr_data['Age'] = (curr_data['Age'] - 30).clip(18, 90)  # shift 30 years younger, floor at 18
curr_data['EstimatedSalary'] = curr_data['EstimatedSalary'] * 0.4  # 60% lower salaries


ref_data['prediction'] = model.predict(ref_data[features])
curr_data['prediction'] = model.predict(curr_data[features])

print(f"Reference Data (Normal): {ref_data.shape} rows")
print(f"Current Data (Drifted):  {curr_data.shape} rows")
print("Artificial Drift Introduced: 'Age' and 'EstimatedSalary' have been altered in the Current data.")

Generating Dataset
Training Model
Reference Data (Normal): (10000, 10) rows
Current Data (Drifted):  (10000, 10) rows
Artificial Drift Introduced: 'Age' and 'EstimatedSalary' have been altered in the Current data.


In [13]:
# Build an Evidently report from DataDriftPreset, run it with ref_data as the
# reference sample and curr_data as the current sample, then write the result
# to an HTML file in the working directory.
print(" Starting Drift Analysis...")

drift_report = Report(metrics=[DataDriftPreset()])

drift_results = drift_report.run(
    current_data=curr_data,
    reference_data=ref_data,
)
drift_results.save_html('drift_report_churn.html')

print("Report Generated Successfully!")


 Starting Drift Analysis...
Report Generated Successfully!
