# Amazon Customer Segment: Actionable Insights Report


## Introduction
This report analyzes pre-defined customer segments to answer critical business questions. The goal is to provide targeted, data-driven strategies for marketing, customer retention, and operational improvements.


In [None]:
# --- Setup and Imports ---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Set plot style: apply seaborn's "whitegrid" theme so the figures below share one look.
sns.set_theme(style="whitegrid")

# --- Output Directory Setup ---
# Folder that receives every CSV and PNG artifact written by this report.
output_dir = 'output_2'
# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)



In [None]:
# --- Load Data ---
# Paths to the two input CSVs. Judging by the file names, the first holds the
# clustered customers and the second the raw survey export (confirm upstream).
clustered_data_file = 'outputs/amazon_customer_clusters.csv'
original_data_file = 'outputs/Amazon Customer Behavior Survey.csv'

# Read both inputs up front. If either file is missing, print the error and
# re-raise so the run stops here rather than continuing without its data.
try:
    df = pd.read_csv(clustered_data_file)
    df_original = pd.read_csv(original_data_file)
except FileNotFoundError as e:
    print(f"File not found: {e}")
    raise

# Display preview of main dataframe (trailing expression, shown as cell output)
df.head()


## Analysis 1: Prime Membership Candidates Identifier
**Business Question:** Which non-Prime members are the most valuable targets for a Prime membership marketing campaign?

**Methodology:** We identify customers who are not currently Prime members but behave like loyal customers, i.e., whose purchase frequency meets or exceeds a chosen threshold.


In [None]:
# --- Analysis 1: Prime Candidates Function ---
from typing import Tuple

def identify_prime_candidates(data: pd.DataFrame, frequency_threshold: int = 3) -> pd.DataFrame:
    """Select non-Prime customers whose purchase frequency meets a threshold.

    Parameters
    ----------
    data : pd.DataFrame
        Customer table; must contain the ``Membership_Status`` and
        ``Purchase_Frequency`` columns.
    frequency_threshold : int, default 3
        Lowest ``Purchase_Frequency`` value (inclusive) that still qualifies.

    Returns
    -------
    pd.DataFrame
        Independent copy of the rows where ``Membership_Status`` equals
        ``"Not a Member"`` and ``Purchase_Frequency`` is at least the threshold.

    Raises
    ------
    KeyError
        If either required column is absent from ``data``.
    """
    needed = {"Membership_Status", "Purchase_Frequency"}
    absent = needed - set(data.columns)
    if absent:
        raise KeyError(f"Missing required columns for Analysis 1: {absent}")

    # Build the two conditions separately so each reads as a business rule.
    is_non_member = data["Membership_Status"] == "Not a Member"
    buys_often = data["Purchase_Frequency"] >= frequency_threshold

    # Copy so later edits to the result never touch the caller's dataframe.
    return data[is_non_member & buys_often].copy()

# Execute Analysis 1
# Select non-member customers with Purchase_Frequency >= 3 from the loaded data.
prime_candidates = identify_prime_candidates(df, frequency_threshold=3)
print(f"Prime candidates found: {len(prime_candidates)}")
# Save the candidate list as CSV (index column omitted) in the output folder.
prime_candidates_path = os.path.join(output_dir, 'prime_candidates.csv')
prime_candidates.to_csv(prime_candidates_path, index=False)
# Trailing expression: previews the first rows as the cell output.
prime_candidates.head()


## Analysis 2: "At-Risk" Churn Scorecard
**Business Question:** Which customer segment is most likely to stop using our service (churn)?

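**Methodology:** We build a scorecard for each cluster from its average shopping satisfaction, average return frequency, and average number of customer service interactions. A cluster that pairs low satisfaction with frequent returns and many service contacts is flagged as at risk of churn.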


In [None]:
# --- Analysis 2: At-Risk Scorecard Function ---
def calculate_at_risk_scorecard(data: pd.DataFrame) -> pd.DataFrame:
    """Average the three churn-risk indicators for every cluster.

    Parameters
    ----------
    data : pd.DataFrame
        Customer table containing ``Cluster``, ``Shopping_Satisfaction``,
        ``Return_Frequency`` and ``Customer_Service_Interactions``.

    Returns
    -------
    pd.DataFrame
        One row per ``Cluster`` with the columns ``Avg_Shopping_Satisfaction``,
        ``Avg_Return_Frequency`` and ``Avg_Customer_Service_Interactions``.

    Raises
    ------
    KeyError
        If any of the required columns is absent from ``data``.
    """
    needed = {"Cluster", "Shopping_Satisfaction", "Return_Frequency", "Customer_Service_Interactions"}
    absent = needed - set(data.columns)
    if absent:
        raise KeyError(f"Missing required columns for Analysis 2: {absent}")

    indicators = ["Shopping_Satisfaction", "Return_Frequency", "Customer_Service_Interactions"]

    # as_index=False keeps Cluster as an ordinary column of the result.
    cluster_means = data.groupby("Cluster", as_index=False)[indicators].mean()

    # Prefix every indicator with "Avg_" to make clear the values are means.
    averaged_names = {name: f"Avg_{name}" for name in indicators}
    return cluster_means.rename(columns=averaged_names)

# Execute Analysis 2
# Compute the per-cluster averages of the three risk indicators.
at_risk_scorecard = calculate_at_risk_scorecard(df)
print(at_risk_scorecard)
# Save the scorecard as CSV (index column omitted) in the output folder.
at_risk_path = os.path.join(output_dir, 'at_risk_scorecard.csv')
at_risk_scorecard.to_csv(at_risk_path, index=False)
# Trailing expression: previews the first rows as the cell output.
at_risk_scorecard.head()


## Analysis 3: Targeted Advertising Channel Advisor
**Business Question:** Where should we spend our advertising budget to most effectively reach each customer segment?

**Methodology:** We analyze the most commonly used product search method for each cluster.


In [None]:
# --- Analysis 3: Ad Advisor Function and Visualization ---
from statistics import mode

def get_top_search_methods(
    data: pd.DataFrame, original_data: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Attach Product_Search_Method to the clustered rows and find each cluster's mode.

    The two frames share no join key, so rows are paired purely by position:
    row *i* of ``data`` receives the search method from row *i* of
    ``original_data``. This is only meaningful when both frames list the same
    customers in the same order.

    Parameters
    ----------
    data : pd.DataFrame
        Clustered customer table; must contain a ``Cluster`` column.
    original_data : pd.DataFrame
        Table that provides the ``Product_Search_Method`` column.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``top_methods`` has one row per ``Cluster`` with its most common
        method in ``Top_Product_Search_Method`` (ties resolve to the first
        value returned by ``Series.mode``; ``None`` if the cluster has no
        non-null values). ``merged`` is a re-indexed copy of ``data`` with the
        ``Product_Search_Method`` column attached.

    Raises
    ------
    KeyError
        If ``Product_Search_Method`` is missing from ``original_data`` or
        ``Cluster`` is missing from ``data``.

    Warns
    -----
    UserWarning
        If the two frames have different row counts, because the positional
        pairing is then likely to be wrong.
    """
    import warnings  # local import: only needed for the row-count check

    if "Product_Search_Method" not in original_data.columns:
        raise KeyError("'Product_Search_Method' not found in original_data")
    if "Cluster" not in data.columns:
        raise KeyError("Missing required columns for Analysis 3: {'Cluster'}")

    # Drop any existing index so both frames line up on 0..n-1 row positions.
    left = data.reset_index(drop=True)
    right = original_data.reset_index(drop=True)

    if len(left) != len(right):
        # Keep the best-effort positional join, but make the risk visible.
        warnings.warn(
            f"Row counts differ (data={len(left)}, original_data={len(right)}); "
            "the positional join may pair rows incorrectly.",
            stacklevel=2,
        )

    # reset_index already returned a new frame, so the caller's data is untouched.
    merged = left
    merged["Product_Search_Method"] = right["Product_Search_Method"]

    def _first_mode(values: pd.Series):
        """Return the first mode of *values*, or None when there is none."""
        modes = values.mode()
        return modes.iat[0] if not modes.empty else None

    top_methods = (
        merged.groupby("Cluster")["Product_Search_Method"]
        .agg(_first_mode)
        .reset_index()
        .rename(columns={"Product_Search_Method": "Top_Product_Search_Method"})
    )

    return top_methods, merged

# Execute Analysis 3
top_methods, df_with_search = get_top_search_methods(df, df_original)
print(top_methods)

# Plot how often each search method is used, with one bar colour per cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_with_search, x="Product_Search_Method", hue="Cluster", ax=ax)
ax.set_title("Distribution of Product Search Methods by Cluster")
ax.set_xlabel("Product Search Method")
ax.set_ylabel("Count")
# Tilt the category labels so long method names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
fig.tight_layout()

# Write the chart to disk, then release the figure.
advertising_plot_path = os.path.join(output_dir, 'advertising_channels.png')
fig.savefig(advertising_plot_path, dpi=300)
plt.close(fig)

top_methods.head()


## Analysis 4: Recommendation Engine Effectiveness Check
**Business Question:** Does using our personalized recommendations correlate with higher customer satisfaction?

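**Methodology:** We compare the average shopping satisfaction of customers who use recommendations heavily ('Often', 'Frequently', or 'Always') with that of customers who use them lightly ('Sometimes', 'Rarely', 'Never', or any unrecognized response).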


In [None]:
# --- Analysis 4: Recommendation Effectiveness Function and Visualization ---

def check_recommendation_effectiveness(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Compare mean Shopping_Satisfaction between heavy and light recommendation users.

    Each ``Personalized_Recommendation_Frequency`` value is bucketed into
    ``"High Usage"`` ('Often', 'Frequently', 'Always') or ``"Low Usage"``
    ('Sometimes', 'Rarely', 'Never'). Any other value, including missing
    ones, also falls into ``"Low Usage"``.

    Parameters
    ----------
    data : pd.DataFrame
        Customer table containing ``Personalized_Recommendation_Frequency``
        and ``Shopping_Satisfaction``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``comparison`` has one row per ``Recommendation_Usage`` group with its
        mean satisfaction in ``Avg_Shopping_Satisfaction``. ``labelled`` is a
        copy of ``data`` with the ``Recommendation_Usage`` column added.

    Raises
    ------
    KeyError
        If either required column is absent from ``data``.
    """
    required_cols = {"Personalized_Recommendation_Frequency", "Shopping_Satisfaction"}
    missing = required_cols - set(data.columns)
    if missing:
        raise KeyError(f"Missing required columns for Analysis 4: {missing}")

    usage_map = {
        "Often": "High Usage",
        "Frequently": "High Usage",
        "Always": "High Usage",
        "Sometimes": "Low Usage",
        "Rarely": "Low Usage",
        "Never": "Low Usage",
    }

    # Work on a copy so the caller's dataframe does not gain the helper column.
    labelled = data.copy()
    # Values absent from usage_map map to NaN; fillna folds them into "Low Usage".
    labelled["Recommendation_Usage"] = (
        labelled["Personalized_Recommendation_Frequency"].map(usage_map).fillna("Low Usage")
    )

    comparison = (
        labelled.groupby("Recommendation_Usage", as_index=False)["Shopping_Satisfaction"]
        .mean()
        .rename(columns={"Shopping_Satisfaction": "Avg_Shopping_Satisfaction"})
    )

    return comparison, labelled

# Execute Analysis 4
rec_effectiveness, df_with_usage = check_recommendation_effectiveness(df)
print(rec_effectiveness)

# One bar per usage group, showing that group's mean satisfaction.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=rec_effectiveness,
    x="Recommendation_Usage",
    y="Avg_Shopping_Satisfaction",
    palette="viridis",
    ax=ax,
)
ax.set_title("Shopping Satisfaction by Recommendation Usage")
ax.set_xlabel("Recommendation Usage")
ax.set_ylabel("Average Shopping Satisfaction")
fig.tight_layout()

# Write the chart to disk, then release the figure.
rec_plot_path = os.path.join(output_dir, 'recommendation_effectiveness.png')
fig.savefig(rec_plot_path, dpi=300)
plt.close(fig)

rec_effectiveness


## Analysis 5: Customer Service Demand Forecaster
**Business Question:** Which customer segment requires the most attention from our customer service team?

**Methodology:** We calculate the average number of customer service interactions for each cluster.


In [None]:
# --- Analysis 5: Service Demand Function and Visualization ---

def forecast_service_demand(data: pd.DataFrame) -> pd.DataFrame:
    """Return the mean number of customer service interactions for each cluster.

    The result has one row per ``Cluster`` and stores the mean in
    ``Avg_Customer_Service_Interactions``. Raises ``KeyError`` when ``Cluster``
    or ``Customer_Service_Interactions`` is missing from ``data``.
    """
    needed = {"Cluster", "Customer_Service_Interactions"}
    absent = needed - set(data.columns)
    if absent:
        raise KeyError(f"Missing required columns for Analysis 5: {absent}")

    # as_index=False keeps Cluster as an ordinary column of the result.
    grouped = data.groupby("Cluster", as_index=False)
    cluster_means = grouped["Customer_Service_Interactions"].mean()

    new_names = {"Customer_Service_Interactions": "Avg_Customer_Service_Interactions"}
    return cluster_means.rename(columns=new_names)

# Execute Analysis 5
service_demand = forecast_service_demand(df)
print(service_demand)

# One bar per cluster, showing its mean number of service interactions.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=service_demand,
    x="Cluster",
    y="Avg_Customer_Service_Interactions",
    palette="mako",
    ax=ax,
)
ax.set_title("Forecast: Customer Service Interactions by Cluster")
ax.set_xlabel("Cluster")
ax.set_ylabel("Average Customer Service Interactions")
fig.tight_layout()

# Write the chart to disk, then release the figure.
service_plot_path = os.path.join(output_dir, 'service_demand_forecast.png')
fig.savefig(service_plot_path, dpi=300)
plt.close(fig)

service_demand.head()


## Report Conclusion
The analyses above provide several actionable insights for enhancing business performance.
