In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set visual style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

def create_plots_folder():
    if not os.path.exists("plots"):
        os.makedirs("plots")
        print("Created 'plots' directory to save visualizations.")

def load_and_clean_data(filepath):
    print(f"Loading data from {filepath}...")
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: {filepath} not found.")
        return None

    # 1. Date Conversions
    date_cols = ['Issue_reported at', 'issue_responded']
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # 2. Calculate Response Time (Minutes)
    if 'issue_responded' in df.columns and 'Issue_reported at' in df.columns:
        df['response_time_minutes'] = (df['issue_responded'] - df['Issue_reported at']).dt.total_seconds() / 60.0

    # 3. Numeric Conversions
    if 'Item_price' in df.columns:
        df['Item_price'] = pd.to_numeric(df['Item_price'], errors='coerce')

    return df

def analyze_target_variable(df):
    """Visualizes the distribution of CSAT Scores."""
    print("Analyzing Target Variable (CSAT Score)...")
    plt.figure(figsize=(8, 5))
    ax = sns.countplot(x='CSAT Score', data=df, palette='viridis')
    plt.title('Distribution of Customer Satisfaction Scores')
    plt.xlabel('CSAT Score')
    plt.ylabel('Count')
    
    # Add percentage labels
    total = len(df)
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_height()
        ax.annotate(percentage, (x, y), ha='center', va='bottom')
        
    plt.savefig('plots/01_csat_distribution.png')
    plt.close()

def analyze_channel_performance(df):
    """Analyzes CSAT Score by Channel."""
    print("Analyzing Channel Performance...")
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='channel_name', y='CSAT Score', data=df, palette='Set2')
    plt.title('CSAT Score by Channel')
    plt.savefig('plots/02_csat_by_channel.png')
    plt.close()

def analyze_tenure_impact(df):
    """Analyzes if agent tenure impacts CSAT."""
    print("Analyzing Agent Tenure Impact...")
    if 'Tenure Bucket' in df.columns:
        order = ['On Job Training', '0-30', '31-60', '61-90', '>90']
        # Filter only existing categories in data
        existing_order = [o for o in order if o in df['Tenure Bucket'].unique()]
        
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Tenure Bucket', y='CSAT Score', data=df, order=existing_order, palette='magma', errorbar=None)
        plt.title('Average CSAT Score by Agent Tenure')
        plt.ylim(1, 5.5) # Zoom in to see differences
        plt.savefig('plots/03_csat_by_tenure.png')
        plt.close()

def analyze_response_time(df):
    """Analyzes relationship between Response Time and CSAT."""
    print("Analyzing Response Time...")
    if 'response_time_minutes' in df.columns:
        # Filter extreme outliers for better visualization (e.g., < 24 hours)
        subset = df[df['response_time_minutes'] < 1440] 
        
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='response_time_minutes', y='CSAT Score', data=subset, alpha=0.5)
        plt.title('Response Time vs CSAT Score')
        plt.xlabel('Response Time (Minutes)')
        plt.savefig('plots/04_response_time_vs_csat.png')
        plt.close()

def perform_eda():
    create_plots_folder()
    
    # --- ROBUST PATH FINDING LOGIC ---
    filename = "eCommerce_Customer_support_data.csv"
    
    # Places to look for the 'data' folder
    search_roots = [
        os.getcwd(),                                  # Current CLI location
        os.path.dirname(os.path.abspath(__file__)),   # Script location
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # Parent of script
    ]
    
    file_path = None
    
    for root in search_roots:
        possible_path = os.path.join(root, "data", filename)
        if os.path.exists(possible_path):
            file_path = possible_path
            break
            
    # Fallback: Check strictly current directory if flat structure
    if file_path is None:
        if os.path.exists(filename):
            file_path = filename
    
    if file_path is None or not os.path.exists(file_path):
        print("❌ Error: Dataset not found.")
        print(f"   Looking for: {filename}")
        print("   Please ensure the CSV file is in a 'data' folder or the current directory.")
        return

    # Load Data
    df = load_and_clean_data(file_path)
    
    if df is not None:
        print("\n--- Dataset Info ---")
        print(df.info())
        print("\n--- Missing Values ---")
        print(df.isnull().sum())
        
        # Run Visualizations
        analyze_target_variable(df)
        analyze_channel_performance(df)
        analyze_tenure_impact(df)
        analyze_response_time(df)
        
        print("\n✅ EDA Complete! Check the 'plots' folder for images.")

if __name__ == "__main__":
    perform_eda()

NameError: name '__file__' is not defined