In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
def load_data(filepath='../Data/Camp_Market_Cleaned.csv'):
    """Read the campaign CSV and return it with trimmed column names.

    Parameters
    ----------
    filepath : str
        Path of the comma-separated file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents; leading/trailing whitespace is removed from every
        column label.

    Side effects
    ------------
    Prints the number of rows and columns that were loaded.
    """
    frame = pd.read_csv(filepath, sep=",")
    # Header cells padded with spaces would otherwise break later name lookups.
    frame.columns = frame.columns.str.strip()

    n_rows, n_cols = len(frame), len(frame.columns)
    print(f"Données chargées: {n_rows} lignes, {n_cols} colonnes")
    return frame

In [None]:
def get_column_lists(df):
    """Return the purchase-count and spend-amount columns present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(PURCHASE_COLS, SPEND_COLS)``: the known candidate names, in their
        fixed candidate order, restricted to those that exist in ``df``.
    """
    def _keep_existing(candidates):
        # Preserve candidate order; drop names the frame does not have.
        return [name for name in candidates if name in df.columns]

    purchase_candidates = [
        "NumWebPurchases",
        "NumStorePurchases",
        "NumCatalogPurchases",
        "NumDealsPurchases",
    ]
    spend_candidates = [
        "MntWines",
        "MntFruits",
        "MntMeatProducts",
        "MntFishProducts",
        "MntSweetProducts",
        "MntGoldProds",
    ]

    return _keep_existing(purchase_candidates), _keep_existing(spend_candidates)

In [None]:
def prepare_data(df, PURCHASE_COLS, SPEND_COLS):
    """Build the working frame used by every KPI and plotting function.

    The input is copied, so ``df`` itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw campaign data.
    PURCHASE_COLS : list[str]
        Purchase-count columns to sum into ``total_purchases``.
    SPEND_COLS : list[str]
        Spend-amount columns to sum into ``total_spend``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with:
        * purchase/spend columns coerced to numeric (unparseable -> NaN),
        * ``total_spend`` / ``total_purchases`` row sums (0 when every
          component of the row is missing),
        * campaign flag columns forced to integer 0/1,
        * ``_response_any`` (1 if any flag is set) and ``_total_responses``
          (number of flags set); both are 0 when no flag column exists.

    Side effects
    ------------
    Prints which flag columns were used and a completion message.
    """
    prepared = df.copy()

    numeric_targets = [name for name in PURCHASE_COLS + SPEND_COLS if name in prepared.columns]
    for name in numeric_targets:
        prepared[name] = pd.to_numeric(prepared[name], errors="coerce")

    def _row_total(columns):
        # min_count=1 yields NaN for rows with no valid value; those become 0.
        return prepared[columns].sum(axis=1, min_count=1).fillna(0)

    prepared["total_spend"] = _row_total(SPEND_COLS)
    prepared["total_purchases"] = _row_total(PURCHASE_COLS)

    flag_names = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
    available_cols = [name for name in flag_names if name in prepared.columns]

    for name in available_cols:
        as_number = pd.to_numeric(prepared[name], errors="coerce")
        # Missing/unparseable flags count as "not accepted"; clip guards against values > 1.
        prepared[name] = as_number.fillna(0).astype(int).clip(0, 1)

    if not available_cols:
        prepared["_response_any"] = 0
        prepared["_total_responses"] = 0
    else:
        flags = prepared[available_cols]
        prepared["_response_any"] = flags.max(axis=1)
        prepared["_total_responses"] = flags.sum(axis=1)
        print(f"Variables de réponse créées à partir de: {available_cols}")

    print("Préparation des données terminée")
    return prepared

In [None]:
def calculate_base_kpis(dfw):
    """Compute headline KPIs, restricted to customers who answered a campaign.

    Parameters
    ----------
    dfw : pandas.DataFrame
        Prepared frame. ``total_spend`` and ``total_purchases`` are required;
        ``_response_any`` and ``_total_responses`` are used when present.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(kpi_base, responders)`` where ``kpi_base`` has the columns
        ``Metric`` / ``Value`` and ``responders`` is the subset of rows with
        ``_response_any == 1``. When ``_response_any`` is absent, ``dfw``
        itself is returned as ``responders`` and the response rate is NaN.

    Side effects
    ------------
    Prints the KPI table.
    """
    n_customers = len(dfw)

    if "_response_any" not in dfw.columns:
        # Without a response flag every customer is treated as a responder.
        responders = dfw
        n_responders = n_customers
        response_rate_any = np.nan
    else:
        responders = dfw[dfw["_response_any"] == 1]
        n_responders = len(responders)
        response_rate_any = np.nan if n_customers <= 0 else n_responders / n_customers * 100

    avg_responses_responders = np.nan
    if len(responders) > 0 and "_total_responses" in responders.columns:
        avg_responses_responders = responders["_total_responses"].mean()

    purchases_series = responders["total_purchases"]
    total_revenue = responders["total_spend"].sum()
    total_purchases = purchases_series.sum()
    avg_freq = purchases_series.mean()
    # Average order value is undefined when nothing was purchased.
    aov = total_revenue / total_purchases if total_purchases > 0 else np.nan

    def _two_decimals_or_nan(value):
        return np.nan if np.isnan(value) else round(value, 2)

    rows = [
        ("Total Clients (tous)", n_customers),
        ("Clients Ayant Participé à ≥1 Campagne", n_responders),
        ("Taux de Réponse (%)", _two_decimals_or_nan(response_rate_any)),
        ("Nb Moyen de Campagnes (parmi répondants)", _two_decimals_or_nan(avg_responses_responders)),
        ("Revenu Total (répondants uniquement)", round(total_revenue, 2)),
        ("Achats Totaux (répondants uniquement)", round(total_purchases, 2)),
        ("Fréquence Moyenne d'Achat (répondants)", round(avg_freq, 2)),
        ("Valeur Moyenne d'Achat / AOV (répondants)", _two_decimals_or_nan(aov)),
    ]
    kpi_base = pd.DataFrame({
        "Metric": [label for label, _ in rows],
        "Value": [value for _, value in rows],
    })

    print("\n=== KPIs de base (calculés sur les répondants uniquement) ===")
    print(kpi_base)

    return kpi_base, responders

In [None]:
def plot_kpi_correlation(responders, output_dir='plots'):
    """Print and plot the correlation matrix of responder-level KPIs.

    Parameters
    ----------
    responders : pandas.DataFrame
        Rows to analyse. Must contain ``total_spend``, ``total_purchases``
        and ``_total_responses``; ``Income`` is included when present.
    output_dir : str
        Folder (created if needed) receiving ``kpi_correlation_matrix.png``.

    Notes
    -----
    Rows with a missing value in any KPI are dropped before correlating, so
    customers with zero purchases (undefined AOV) are excluded.
    """
    os.makedirs(output_dir, exist_ok=True)

    spend = responders["total_spend"]
    purchases = responders["total_purchases"]
    kpi_columns = {
        'Total_Spend': spend,
        'Total_Purchases': purchases,
        'Nb_Campagnes': responders["_total_responses"],
        # Zero purchases become NaN so the ratio is missing instead of infinite.
        'AOV': spend / purchases.replace(0, np.nan),
    }
    if "Income" in responders.columns:
        kpi_columns['Income'] = pd.to_numeric(responders["Income"], errors="coerce")

    corr_matrix = pd.DataFrame(kpi_columns).dropna().corr()

    print("\n=== Matrice de Corrélation des KPIs (Répondants uniquement) ===")
    print(corr_matrix)

    heatmap_options = dict(
        annot=True,
        fmt=".3f",
        cmap="coolwarm",
        center=0,
        square=True,
        linewidths=2,
        cbar_kws={"shrink": 0.8},
        vmin=-1,
        vmax=1,
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title("Matrice de Corrélation - KPIs des Répondants", fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/kpi_correlation_matrix.png", dpi=150)
    plt.show()

In [None]:
def analyze_channels(dfw, PURCHASE_COLS, output_dir='plots'):
    """Rank purchase channels by volume and save a bar chart of the result.

    Parameters
    ----------
    dfw : pandas.DataFrame
        Frame holding one numeric purchase-count column per channel.
    PURCHASE_COLS : list[str]
        Channel columns to summarise. May be empty, in which case an empty
        table is returned.
    output_dir : str
        Folder (created if needed) receiving ``purchases_by_channel.png``.

    Returns
    -------
    pandas.DataFrame
        Columns ``channel``, ``purchases`` (column sum) and ``clients``
        (number of rows with a strictly positive count), sorted by
        ``purchases`` in descending order.

    Side effects
    ------------
    Prints the table, shows the figure and writes the PNG.
    """
    # Same guarantee as plot_kpi_correlation: saving must not depend on the
    # folder already existing.
    os.makedirs(output_dir, exist_ok=True)

    # One record per channel, assembled into a single frame in one pass.
    records = [
        {
            "channel": c,
            "purchases": dfw[c].sum(),
            "clients": (dfw[c] > 0).sum(),
        }
        for c in PURCHASE_COLS
    ]
    by_channel_volume = pd.DataFrame(
        records, columns=["channel", "purchases", "clients"]
    ).sort_values("purchases", ascending=False)

    print("\n=== Meilleurs canaux (volume) ===")
    print(by_channel_volume)

    plt.figure(figsize=(10, 6))
    plt.bar(by_channel_volume['channel'], by_channel_volume['purchases'])
    plt.title("Achats par Canal", fontsize=14, fontweight='bold')
    plt.xlabel("Canal")
    plt.ylabel("Nombre d'achats")
    plt.xticks(rotation=30, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "purchases_by_channel.png"), dpi=150)
    plt.show()

    return by_channel_volume

In [None]:
def plot_distributions(dfw, output_dir='plots'):
    """Save histograms of total spend and total purchases per customer.

    Parameters
    ----------
    dfw : pandas.DataFrame
        Frame containing the ``total_spend`` and ``total_purchases`` columns.
    output_dir : str
        Folder (created if needed) receiving ``dist_total_spend.png`` and
        ``dist_total_purchases.png``.

    Side effects
    ------------
    Shows each figure and writes one PNG per histogram.
    """
    # Same guarantee as plot_kpi_correlation: saving must not depend on the
    # folder already existing.
    os.makedirs(output_dir, exist_ok=True)

    # (column, title, x-axis label, output file) for each histogram.
    histograms = [
        ("total_spend", "Distribution des Dépenses Totales", "Dépense totale", "dist_total_spend.png"),
        ("total_purchases", "Distribution des Achats Totaux", "Nombre d'achats", "dist_total_purchases.png"),
    ]

    for column, title, xlabel, filename in histograms:
        plt.figure(figsize=(10, 6))
        plt.hist(dfw[column].dropna().values, bins=30, edgecolor='black')
        plt.title(title, fontsize=14, fontweight='bold')
        plt.xlabel(xlabel)
        plt.ylabel("Nombre de clients")
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename), dpi=150)
        plt.show()

In [None]:
def analyze_aov_by_segment(dfw, segments=None, output_dir='plots'):
    """Summarise spend, purchases, response and AOV for each segment value.

    Parameters
    ----------
    dfw : pandas.DataFrame
        Prepared frame containing ``total_spend``, ``total_purchases``,
        ``_response_any`` and ``_total_responses``.
    segments : list[str] | None
        Columns to group by. Defaults to ``['Education', 'Childrens']``.
        Names that are not columns of ``dfw`` are skipped.
    output_dir : str
        Not used by this function (nothing is written to disk); kept so the
        signature matches the other analysis helpers.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One table per segment column found, sorted by ``AOV`` descending, with
        the columns: the segment itself, ``clients``, ``total_spend``,
        ``total_purchases``, ``response_rate`` (percent), ``avg_campaigns``
        and ``AOV``. Missing segment values form their own group.

    Side effects
    ------------
    Prints the first 10 rows of every table.
    """
    if segments is None:
        segments = ['Education', 'Childrens']

    aov_by_segment_tables = {}

    for seg in segments:
        if seg not in dfw.columns:
            continue

        tmp = dfw.groupby(seg, dropna=False).agg(
            clients=(seg, "size"),
            total_spend=("total_spend", "sum"),
            total_purchases=("total_purchases", "sum"),
            response_rate=("_response_any", "mean"),
            avg_campaigns=("_total_responses", "mean")
        ).reset_index()

        # A segment with zero purchases has no defined order value: report NaN
        # (as the other AOV computations in this notebook do) rather than an
        # infinite value that would float to the top of the ranking.
        safe_purchases = tmp["total_purchases"].replace(0, np.nan)
        tmp["AOV"] = (tmp["total_spend"] / safe_purchases).round(2)
        tmp["response_rate"] = (tmp["response_rate"] * 100).round(2)
        tmp["avg_campaigns"] = tmp["avg_campaigns"].round(2)
        aov_by_segment_tables[seg] = tmp.sort_values("AOV", ascending=False)

    if aov_by_segment_tables:
        print("\n=== AOV par segment ===")
        for seg, tab in aov_by_segment_tables.items():
            print(f"\nSegment: {seg}")
            print(tab.head(10))

    return aov_by_segment_tables

In [None]:
def _overlay_response_hist(ax, non_responders, responders, title, xlabel, title_size):
    """Draw non-responder and responder histograms on ``ax`` and label it."""
    if non_responders is not None and len(non_responders) > 0:
        ax.hist(non_responders, bins=20, alpha=0.6,
                label=f"Non-répondants (n={len(non_responders)})", color='coral', zorder=1)
    if responders is not None and len(responders) > 0:
        ax.hist(responders, bins=20, alpha=0.6,
                label=f"Répondants (n={len(responders)})", color='steelblue', zorder=2)
    ax.set_title(title, fontsize=title_size, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Nombre de clients")
    ax.legend()


def plot_age_income_distributions(dfw, output_dir='plots'):
    """Compare age and income distributions of responders vs non-responders.

    Parameters
    ----------
    dfw : pandas.DataFrame
        Prepared frame. ``_response_any`` is required. Age is read from
        ``Age`` when it holds at least one value, otherwise derived from the
        first available year-of-birth column using the current year. Income
        is read from the first of ``Income``, ``income_imputed``, ``income``
        that exists.
    output_dir : str
        Folder (created if needed, and only when a figure is produced)
        receiving one of ``age_income_distribution.png``,
        ``age_distribution.png`` or ``income_distribution.png``.

    Notes
    -----
    Only ages between 18 and 100 (inclusive) are plotted. When neither age
    nor income data is available a message is printed and nothing is saved.
    """
    income_candidates_list = ["Income", "income_imputed", "income"]
    yob_candidates = ["Year_Birth", "year_birth", "YEAR_BIRTH"]

    income_col_plot = next((c for c in income_candidates_list if c in dfw.columns), None)

    if "Age" in dfw.columns and dfw["Age"].notna().any():
        age_all = pd.to_numeric(dfw["Age"], errors="coerce")
    else:
        yob_col = next((c for c in yob_candidates if c in dfw.columns), None)
        age_all = (pd.Timestamp.now().year - pd.to_numeric(dfw[yob_col], errors="coerce")) if yob_col else None

    mask_resp = (dfw["_response_any"] == 1)
    mask_non = (dfw["_response_any"] == 0)

    r_age = nr_age = None
    if age_all is not None:
        # between() is False for NaN, so missing ages are discarded together
        # with implausible ones.
        plausible = age_all.between(18, 100)
        r_age = age_all[mask_resp & plausible]
        nr_age = age_all[mask_non & plausible]

    r_inc = nr_inc = None
    if income_col_plot:
        income_all = pd.to_numeric(dfw[income_col_plot], errors="coerce")
        r_inc = income_all[mask_resp].dropna()
        nr_inc = income_all[mask_non].dropna()

    make_age = r_age is not None and (len(r_age) > 0 or len(nr_age) > 0)
    make_income = r_inc is not None and (len(r_inc) > 0 or len(nr_inc) > 0)

    if not (make_age or make_income):
        print("Aucune donnée d'âge ou de revenu disponible pour le graphique")
        return

    # Same guarantee as plot_kpi_correlation: saving must not depend on the
    # folder already existing.
    os.makedirs(output_dir, exist_ok=True)

    age_title = "Distribution de l'Âge"
    income_title = f"Distribution du Revenu ({income_col_plot})"

    if make_age and make_income:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        _overlay_response_hist(axes[0], nr_age, r_age, age_title, "Âge", 12)
        _overlay_response_hist(axes[1], nr_inc, r_inc, income_title, "Revenu", 12)
        filename = "age_income_distribution.png"
    elif make_age:
        fig, ax = plt.subplots(figsize=(10, 6))
        _overlay_response_hist(ax, nr_age, r_age, age_title, "Âge", 14)
        filename = "age_distribution.png"
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        _overlay_response_hist(ax, nr_inc, r_inc, income_title, "Revenu", 14)
        filename = "income_distribution.png"

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, filename), dpi=150)
    plt.show()

In [None]:
def plot_full_correlation_matrix(dfw, PURCHASE_COLS, SPEND_COLS, output_dir='plots'):
    """Plot the correlation matrix of purchase, spend and campaign variables.

    Parameters
    ----------
    dfw : pandas.DataFrame
        Prepared frame. It is not modified: the ordinal age-group encoding is
        computed on an internal copy.
    PURCHASE_COLS, SPEND_COLS : list[str]
        Purchase-count and spend-amount columns to include.
    output_dir : str
        Folder (created if needed) receiving ``correlation_matrix.png``.

    Notes
    -----
    Besides the purchase/spend columns, the matrix includes ``total_spend``,
    ``total_purchases``, every campaign flag present, ``Income`` (coerced to
    numeric) and, when ``Age_Group`` exists, ``Age_Group_encoded``: the rank
    of each group label in sorted order (missing labels stay missing).
    Columns absent from ``dfw`` are silently left out.
    """
    # Same guarantee as plot_kpi_correlation: saving must not depend on the
    # folder already existing.
    os.makedirs(output_dir, exist_ok=True)

    campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']

    candidates = list(PURCHASE_COLS) + list(SPEND_COLS) + ["total_spend", "total_purchases"]
    candidates += [c for c in campaign_cols if c not in candidates]
    if "Income" not in candidates:
        candidates.append("Income")
    cols_for_corr = [c for c in candidates if c in dfw.columns]

    # Work on a private copy so the caller's frame never gains helper columns.
    corr_input = dfw[cols_for_corr].copy()

    if "Income" in corr_input.columns:
        # Mirrors the other functions of this notebook, which always coerce
        # Income before using it numerically.
        corr_input["Income"] = pd.to_numeric(corr_input["Income"], errors="coerce")

    if "Age_Group" in dfw.columns:
        # Missing labels are excluded: NaN cannot be ordered against strings.
        age_order = sorted(dfw["Age_Group"].dropna().unique())
        age_mapping = {age: idx for idx, age in enumerate(age_order)}
        corr_input["Age_Group_encoded"] = dfw["Age_Group"].map(age_mapping).astype(float)
        cols_for_corr.append("Age_Group_encoded")
        print(f"\nAge_Group encodée: {age_mapping}")

    print(f"\n📊 Variables incluses dans la matrice: {cols_for_corr}")

    corr_matrix_full = corr_input[cols_for_corr].corr()

    plt.figure(figsize=(16, 12))
    sns.heatmap(corr_matrix_full, annot=True, fmt=".2f", cmap="coolwarm", center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title("Matrice de Corrélation - Variables Marketing & Campagnes", fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "correlation_matrix.png"), dpi=150)
    plt.show()

In [None]:
# ========================================
# MAIN EXECUTION
# ========================================

# Single place to change where figures are written and which columns are
# used as customer segments.
OUTPUT_DIR = '../KPI'
SEGMENTS = ['Education', 'Childrens']

# Make sure the output folder exists before any plotting step tries to save.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the data
df = load_data()

# Resolve the purchase and spend columns available in the file
PURCHASE_COLS, SPEND_COLS = get_column_lists(df)

# Build the working frame (totals and campaign response flags)
dfw = prepare_data(df, PURCHASE_COLS, SPEND_COLS)

# Compute the base KPIs
kpi_base, responders = calculate_base_kpis(dfw)

# Generate the charts and analyses
print("\n=== Génération des graphiques ===")

# 1. KPI correlation matrix
plot_kpi_correlation(responders, output_dir=OUTPUT_DIR)

# 2. Channel analysis
by_channel_volume = analyze_channels(dfw, PURCHASE_COLS, output_dir=OUTPUT_DIR)

# 3. Distributions
plot_distributions(dfw, output_dir=OUTPUT_DIR)

# 4. AOV by segment
aov_by_segment_tables = analyze_aov_by_segment(dfw, segments=SEGMENTS, output_dir=OUTPUT_DIR)

# 5. Age and income distributions
plot_age_income_distributions(dfw, output_dir=OUTPUT_DIR)

# 6. Full correlation matrix
plot_full_correlation_matrix(dfw, PURCHASE_COLS, SPEND_COLS, output_dir=OUTPUT_DIR)

print("\n✅ Toutes les analyses et graphiques ont été générés avec succès !")