In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from datetime import datetime
from scipy import stats # This import is not used in the provided code, but kept as it was in the original.
import matplotlib.ticker as ticker

class InsuranceDataAnalyzer:
    """Comprehensive insurance data analysis and reporting tool.

    This class provides functionalities to load, clean, analyze,
    visualize, and report on insurance data from a CSV file.
    """
    
    def __init__(self, file_path='insurance.csv'):
        """
        Initializes the InsuranceDataAnalyzer with a given file path.

        Args:
            file_path (str): The path to the insurance CSV file.
        """
        self.file_path = file_path
        self.df = None
        self.current_date = datetime.now().strftime('%Y-%m-%d')
        self.output_dir = self._get_output_directory()
        # Define a color palette for consistent plotting
        self.palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"]
        self.initialize_plot_settings()

        # Initialize attributes for cleaning metrics
        self.initial_count = 0
        self.duplicates_removed = 0
        self.missing_removed = 0
        self.range_removed = 0
        
    def _get_output_directory(self):
        """
        Determines and creates the output directory for saving plots and reports.
        If the file_path contains a directory, it uses that; otherwise, it uses the current working directory.

        Returns:
            str: The absolute path to the output directory.
        """
        if os.path.dirname(self.file_path):
            output_dir = os.path.dirname(os.path.abspath(self.file_path))
        else:
            output_dir = os.getcwd()
        os.makedirs(output_dir, exist_ok=True)
        print(f"Output directory: {output_dir}")
        return output_dir
    
    def initialize_plot_settings(self):
        """
        Configures consistent plot styling using seaborn and matplotlib rcParams.
        Sets font family, sizes, figure dimensions, and DPI for high-quality outputs.
        """
        sns.set_style("whitegrid")
        plt.rcParams.update({
            'font.family': 'sans-serif',
            'font.size': 12,
            'axes.titlesize': 14,
            'axes.titleweight': 'bold',
            'figure.figsize': (10, 6),
            'figure.dpi': 300,
            'savefig.dpi': 300
        })
    
    @staticmethod
    def safe_format_float(value, format_spec=".0f"):
        """
        Safely formats float values, handling NaN/Inf by returning "N/A".

        Args:
            value (float): The float value to format.
            format_spec (str): The format specification string (e.g., ".2f" for two decimal places).

        Returns:
            str: The formatted string or "N/A" if the value is NaN/Inf.
        """
        if pd.isna(value) or np.isinf(value):
            return "N/A"
        try:
            return f"{value:{format_spec}}"
        except (ValueError, TypeError):
            return "N/A"
    
    def load_data(self):
        """
        Loads data from the specified CSV file into a pandas DataFrame.
        Raises FileNotFoundError if the file does not exist or a general Exception if loading fails.
        """
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"Input file not found: {self.file_path}")
        
        try:
            self.df = pd.read_csv(self.file_path)
            self.initial_count = len(self.df)
            print(f"Data loaded successfully. Initial records: {len(self.df)}")
        except Exception as e:
            raise Exception(f"Failed to load data: {str(e)}")
    
    def clean_data(self):
        """
        Performs comprehensive data cleaning on the loaded DataFrame.
        Steps include:
        1. Removing duplicate rows.
        2. Removing rows with any missing values.
        3. Filtering rows based on valid data ranges for 'age', 'bmi', 'children', and 'charges'.
        4. Converting specified categorical columns to 'category' dtype.
        5. Saves the cleaned data to a new CSV file.
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")
            
        initial_count_for_cleaning = len(self.df)

        # Remove duplicates
        self.df = self.df.drop_duplicates()
        self.duplicates_removed = initial_count_for_cleaning - len(self.df)
        print(f"Removed {self.duplicates_removed} duplicate rows.")
        
        # Handle missing values
        missing_before = len(self.df)
        self.df = self.df.dropna()
        self.missing_removed = missing_before - len(self.df)
        print(f"Removed {self.missing_removed} rows with missing values.")
        
        # Validate data ranges
        range_before = len(self.df)
        self.df = self.df[
            (self.df['age'].between(18, 100)) &
            (self.df['bmi'].between(10, 60)) &
            (self.df['children'].between(0, 10)) &
            (self.df['charges'] > 0)
        ]
        self.range_removed = range_before - len(self.df)
        print(f"Removed {self.range_removed} rows with invalid data ranges.")
        
        # Convert categorical columns to 'category' dtype for memory efficiency and better plotting
        categorical_cols = ['sex', 'smoker', 'region']
        for col in categorical_cols:
            if col in self.df.columns:
                self.df[col] = self.df[col].astype('category')
        
        # Save cleaned data
        cleaned_path = os.path.join(self.output_dir, "Star_cleaned_insurance.csv")
        self.df.to_csv(cleaned_path, index=False)
        print(f"Saved cleaned data to: {cleaned_path}")
        print(f"Final cleaned records: {len(self.df)}")
    
    def generate_visualizations(self):
        """
        Generates all predefined visualizations if data is available.
        Prints a message if visualizations cannot be generated due to missing data.
        """
        print("\n--- Generating Visualizations ---")
        if self.df is None or self.df.empty:
            print("Cannot generate visualizations: No data available after cleaning.")
            return

        self.generate_distribution_plots()
        self.generate_relationship_plots()
        self.generate_categorical_plots()
        self.generate_smoker_effect_plot()
        self.generate_correlation_matrix()
    
    def generate_distribution_plots(self):
        """
        Generates and saves histogram plots for numerical features: age, bmi, children, and charges.
        Includes KDE for continuous variables and appropriate binning.
        """
        if not all(col in self.df.columns for col in ['age', 'bmi', 'children', 'charges']):
            print("Skipping distribution plots: Missing required columns")
            return
            
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('Distribution Analysis of Key Numerical Features', fontsize=16, y=1.02)
        
        # Age distribution
        sns.histplot(self.df['age'], bins=30, kde=True, ax=axes[0, 0], color=self.palette[0])
        axes[0, 0].set_title('Age Distribution')
        axes[0, 0].set_xlabel('Age (years)')
        axes[0, 0].set_ylabel('Count')
        
        # BMI distribution
        sns.histplot(self.df['bmi'], bins=30, kde=True, ax=axes[0, 1], color=self.palette[1])
        axes[0, 1].set_title('BMI Distribution')
        axes[0, 1].set_xlabel('BMI (kg/m²)')
        axes[0, 1].set_ylabel('Count')
        
        # Children distribution
        sns.histplot(self.df['children'], bins=6, discrete=True, ax=axes[1, 0], color=self.palette[2])
        axes[1, 0].set_title('Number of Children Distribution')
        axes[1, 0].set_xlabel('Number of Children')
        axes[1, 0].set_ylabel('Count')
        
        # Charges distribution
        sns.histplot(self.df['charges'], bins=50, kde=True, ax=axes[1, 1], color=self.palette[3])
        axes[1, 1].set_title('Insurance Charges Distribution')
        axes[1, 1].set_xlabel('Charges ($)')
        axes[1, 1].set_ylabel('Count')
        axes[1, 1].xaxis.set_major_formatter(ticker.ScalarFormatter(useOffset=False, useMathText=False))
        axes[1, 1].ticklabel_format(style='plain', axis='x')
        
        plt.tight_layout(rect=[0, 0.03, 1, 0.98]) # Adjust rect to prevent suptitle overlap
        self.save_plot('Distribution_Analysis_of_Key_Numerical_Features')
        plt.close()
    
    def generate_relationship_plots(self):
        """
        Generates and saves scatter plots and box plots to show relationships between
        numerical features (age, bmi, children) and insurance charges.
        """
        if not all(col in self.df.columns for col in ['age', 'bmi', 'children', 'charges']):
            print("Skipping relationship plots: Missing required columns")
            return
            
        plt.figure(figsize=(18, 6))
        plt.suptitle('Relationships Between Key Features and Medical Charges', fontsize=16)
        
        # Age vs Charges
        plt.subplot(1, 3, 1)
        sns.regplot(x='age', y='charges', data=self.df, color=self.palette[0], scatter_kws={'alpha':0.5})
        plt.title('Age vs. Charges')
        plt.xlabel('Age (years)')
        plt.ylabel('Charges ($)')
        
        # BMI vs Charges
        plt.subplot(1, 3, 2)
        sns.regplot(x='bmi', y='charges', data=self.df, color=self.palette[1], scatter_kws={'alpha':0.5})
        plt.title('BMI vs. Charges')
        plt.xlabel('BMI (kg/m²)')
        plt.ylabel('Charges ($)')
        
        # Children vs Charges
        plt.subplot(1, 3, 3)
        sns.boxplot(x='children', y='charges', data=self.df, hue='children', palette=self.palette, legend=False)
        plt.title('Number of Children vs. Charges')
        plt.xlabel('Number of Children')
        plt.ylabel('Charges ($)')
        
        plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust rect to prevent suptitle overlap
        self.save_plot('Relationships_Between_Key_Features_and_Medical_Charges')
        plt.close()
    
    def generate_categorical_plots(self):
        """
        Generates and saves box plots to show insurance charges distribution
        across categorical variables: sex, smoker status, and region.
        """
        if not all(col in self.df.columns for col in ['sex', 'smoker', 'region', 'charges']):
            print("Skipping categorical plots: Missing required columns")
            return
            
        plt.figure(figsize=(18, 6))
        plt.suptitle('Insurance Charges by Categorical Variables', fontsize=16)
        
        # Charges by Sex
        plt.subplot(1, 3, 1)
        sns.boxplot(x='sex', y='charges', data=self.df, hue='sex', palette=self.palette[:2], legend=False)
        plt.title('Charges by Sex')
        plt.xlabel('Sex')
        plt.ylabel('Charges ($)')
        
        # Charges by Smoker
        plt.subplot(1, 3, 2)
        sns.boxplot(x='smoker', y='charges', data=self.df, hue='smoker', palette=self.palette[2:4], legend=False)
        plt.title('Charges by Smoker Status')
        plt.xlabel('Smoker')
        plt.ylabel('Charges ($)')
        
        # Charges by Region
        plt.subplot(1, 3, 3)
        sns.boxplot(x='region', y='charges', data=self.df, hue='region', palette=self.palette[1:5], legend=False)
        plt.title('Charges by Region')
        plt.xlabel('Region')
        plt.ylabel('Charges ($)')
        
        plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust rect to prevent suptitle overlap
        self.save_plot('Insurance_Charges_by_Categorical_Variables')
        plt.close()
    
    def generate_smoker_effect_plot(self):
        """
        Generates and saves a scatter plot illustrating the interaction effect of
        age and smoking status on insurance charges.
        """
        if not all(col in self.df.columns for col in ['age', 'smoker', 'charges']):
            print("Skipping smoker effect plot: Missing required columns")
            return
            
        plt.figure(figsize=(12, 7))
        sns.scatterplot(x='age', y='charges', hue='smoker', data=self.df,
                        palette={'yes': self.palette[3], 'no': self.palette[0]}, 
                        alpha=0.7, s=50)
        plt.title('Impact of Age and Smoking Status on Insurance Charges', fontsize=16)
        plt.xlabel('Age (years)')
        plt.ylabel('Charges ($)')
        plt.grid(True, alpha=0.2)
        plt.legend(title='Smoker')
        plt.tight_layout()
        self.save_plot('Impact_of_Age_and_Smoking_Status_on_Insurance_Charges')
        plt.close()
    
    def generate_correlation_matrix(self):
        """
        Generates and saves a heatmap of the correlation matrix for numerical features.
        """
        numeric_df = self.df.select_dtypes(include=np.number)
        if len(numeric_df.columns) < 2 or 'charges' not in numeric_df.columns or numeric_df.empty:
            print("Skipping correlation matrix: Not enough numeric columns or data is empty")
            return
            
        plt.figure(figsize=(9, 7))
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', 
                    fmt=".2f", linewidths=0.5, linecolor='black')
        plt.title('Correlation Matrix of Numerical Features', fontsize=16)
        plt.tight_layout()
        self.save_plot('Correlation_Matrix_of_Numerical_Features')
        plt.close()
    
    def save_plot(self, title):
        """
        Saves the current matplotlib figure to the output directory with a standardized filename.

        Args:
            title (str): The base title for the plot, used to create the filename.
        """
        filename = "Star_" + title.replace(" ", "_").replace(":", "") + ".png"
        path = os.path.join(self.output_dir, filename)
        plt.savefig(path, bbox_inches='tight')
        print(f"Saved plot: {filename}")
    
    def generate_report(self):
        """
        Generates a comprehensive text report summarizing data cleaning,
        descriptive statistics, and key insights from the analysis.
        The report is saved to a text file in the output directory.
        """
        if self.df is None or self.df.empty:
            print("Cannot generate report: No data available")
            return
            
        report_path = os.path.join(self.output_dir, "Star_insurance_analysis_report.txt")
        
        with open(report_path, 'w') as f:
            f.write("=== INSURANCE DATA ANALYSIS REPORT ===\n")
            f.write(f"Generated on: {self.current_date}\n\n")
            
            # Data cleaning summary
            f.write("=== DATA CLEANING SUMMARY ===\n")
            f.write(f"Initial records: {self.initial_count}\n")
            f.write(f"Duplicate records removed: {self.duplicates_removed}\n")
            f.write(f"Records with missing values removed: {self.missing_removed}\n")
            f.write(f"Records with invalid ranges removed: {self.range_removed}\n")
            f.write(f"Final cleaned records: {len(self.df)}\n")
            # Avoid division by zero if initial_count is 0
            retention_rate = len(self.df) / self.initial_count if self.initial_count > 0 else 0
            f.write(f"Data retention rate: {retention_rate:.1%}\n\n")
            
            # Descriptive statistics
            f.write("=== DESCRIPTIVE STATISTICS ===\n")
            f.write("Numerical features:\n")
            # Ensure describe() output is not truncated and is properly formatted
            f.write(self.df.describe().to_string())
            f.write("\n\nCategorical features:\n")
            f.write(self.df.describe(include='category').to_string())
            f.write("\n\n")
            
            # Key insights
            f.write("=== KEY INSIGHTS ===\n")
            # Age analysis
            if 'age' in self.df.columns:
                age_stats = self.df['age'].describe()
                f.write(f"Age Distribution:\n")
                f.write(f"- Average age: {age_stats['mean']:.1f} years\n")
                f.write(f"- Age range: {age_stats['min']:.0f} to {age_stats['max']:.0f} years\n")
                f.write(f"- 25% of customers are under {age_stats['25%']:.0f} years\n")
                f.write(f"- 75% of customers are under {age_stats['75%']:.0f} years\n\n")
            
            # Charges analysis
            if 'charges' in self.df.columns:
                charges_stats = self.df['charges'].describe()
                f.write(f"Insurance Charges:\n")
                f.write(f"- Average charge: ${charges_stats['mean']:,.2f}\n")
                f.write(f"- Minimum charge: ${charges_stats['min']:,.2f}\n")
                f.write(f"- Maximum charge: ${charges_stats['max']:,.2f}\n")
                f.write(f"- Standard deviation: ${charges_stats['std']:,.2f}\n\n")
            
            # Smoker analysis
            if 'smoker' in self.df.columns and not self.df['smoker'].empty:
                smoker_counts = self.df['smoker'].value_counts()
                smoker_charges = self.df.groupby('smoker')['charges'].mean()
                f.write(f"Smoker Analysis:\n")
                
                yes_count = smoker_counts.get('yes', 0)
                no_count = smoker_counts.get('no', 0)
                total_count = len(self.df)

                f.write(f"- Smokers: {yes_count} ({yes_count/total_count:.1%})\n")
                f.write(f"- Non-smokers: {no_count} ({no_count/total_count:.1%})\n")
                
                avg_charges_yes = smoker_charges.get('yes', 0)
                avg_charges_no = smoker_charges.get('no', 0)

                f.write(f"- Average charges for smokers: ${avg_charges_yes:,.2f}\n")
                f.write(f"- Average charges for non-smokers: ${avg_charges_no:,.2f}\n")
                
                # Handle division by zero for the ratio if no non-smokers exist
                if avg_charges_no > 0:
                    f.write(f"- Smokers pay {avg_charges_yes/avg_charges_no:.1f}x more on average\n\n")
                else:
                    f.write("- Cannot calculate smoker charge ratio (no non-smoker data).\n\n")
            
            # Regional analysis
            if 'region' in self.df.columns and not self.df['region'].empty:
                region_counts = self.df['region'].value_counts()
                region_charges = self.df.groupby('region')['charges'].mean()
                f.write(f"Regional Analysis:\n")
                for region in region_counts.index:
                    f.write(f"- {region}: {region_counts[region]} customers ({region_counts[region]/len(self.df):.1%}), ")
                    f.write(f"avg charges ${region_charges[region]:,.2f}\n")
                f.write("\n") # Add a newline for better spacing
            
            f.write("=== ANALYSIS COMPLETED ===\n")
        
        print(f"Report generated successfully: {report_path}")

# Example usage:
if __name__ == "__main__":
    # Ensure 'insurance.csv' is in the same directory as this script, or provide a full path.
    analyzer = InsuranceDataAnalyzer('insurance.csv')
    try:
        analyzer.load_data()
        analyzer.clean_data()
        analyzer.generate_visualizations()
        analyzer.generate_report()
    except Exception as e:
        print(f"An error occurred during analysis: {e}")

Output directory: c:\Users\Younu\Desktop\CornerstoneProject\data-analytics-template\Notebooks_and_PytonFiles_With_Raw_Insurancecsv
Data loaded successfully. Initial records: 1338
Removed 1 duplicate rows.
Removed 0 rows with missing values.
Removed 0 rows with invalid data ranges.
Saved cleaned data to: c:\Users\Younu\Desktop\CornerstoneProject\data-analytics-template\Notebooks_and_PytonFiles_With_Raw_Insurancecsv\Star_cleaned_insurance.csv
Final cleaned records: 1337

--- Generating Visualizations ---
Saved plot: Star_Distribution_Analysis_of_Key_Numerical_Features.png
Saved plot: Star_Relationships_Between_Key_Features_and_Medical_Charges.png
Saved plot: Star_Insurance_Charges_by_Categorical_Variables.png
Saved plot: Star_Impact_of_Age_and_Smoking_Status_on_Insurance_Charges.png
Saved plot: Star_Correlation_Matrix_of_Numerical_Features.png
Report generated successfully: c:\Users\Younu\Desktop\CornerstoneProject\data-analytics-template\Notebooks_and_PytonFiles_With_Raw_Insurancecsv\St

  smoker_charges = self.df.groupby('smoker')['charges'].mean()
  region_charges = self.df.groupby('region')['charges'].mean()
