# U.S. Medical Insurance Costs

In [21]:
# Import necessary libraries
import csv
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, f_oneway
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

# Read the insurance records (relative path, resolved against the working directory)
dataframe = pd.read_csv(filepath_or_buffer='insurance.csv')

## Compute basic descriptive statistics

In [None]:
# Summary statistics as returned by DataFrame.describe() with default settings
descriptives = dataframe.describe()

# Most frequent value of every column (not only the categorical ones);
# .iloc[0] keeps the first row of the mode table, i.e. one mode per column
mode = dataframe.mode().iloc[0]

# Display descriptive statistics and mode
print("Descriptive Statistics:")
print(descriptives)
print("\nMode:")
print(mode)

# Average charges by different categories
avg_charges_by_smoker = dataframe.groupby('smoker')['charges'].mean()

# BMI is a continuous measurement, so grouping on the raw value produces
# roughly one group per distinct reading rather than one per bracket.
# Bin it into the conventional weight categories first so the table printed
# under "BMI Bracket" really is an average per bracket.
# Intervals are left-closed: [0, 18.5), [18.5, 25), [25, 30), [30, inf).
bmi_bracket = pd.cut(
    dataframe['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=[
        'Underweight (<18.5)',
        'Normal (18.5-24.9)',
        'Overweight (25-29.9)',
        'Obese (>=30)',
    ],
    right=False,
).rename('bmi_bracket')
# observed=False keeps every bracket in the output, even an empty one
avg_charges_by_bmi = dataframe['charges'].groupby(bmi_bracket, observed=False).mean()

avg_charges_by_children = dataframe.groupby('children')['charges'].mean()
avg_charges_by_region = dataframe.groupby('region')['charges'].mean()
avg_charges_by_sex = dataframe.groupby('sex')['charges'].mean()

print("\nAverage Charges by Smoker Status:")
print(avg_charges_by_smoker)

print("\nAverage Charges by BMI Bracket:")
print(avg_charges_by_bmi)

print("\nAverage Charges by Number of Children:")
print(avg_charges_by_children)

print("\nAverage Charges by Region:")
print(avg_charges_by_region)

print("\nAverage Charges by Sex:")
print(avg_charges_by_sex)

## Create visualizations: histograms and box plots

In [None]:
# Distribution plots for medical charges

# Histogram of charges: 30 bins with a kernel density estimate overlaid
_, hist_ax = plt.subplots(figsize=(8, 6))
sns.histplot(dataframe['charges'], bins=30, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Medical Charges')
hist_ax.set_xlabel('Charges')
hist_ax.set_ylabel('Frequency')
plt.show()

# Box plot of charges, one box per smoker status
_, box_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='smoker', y='charges', data=dataframe, ax=box_ax)
box_ax.set_title('Box Plot of Charges by Smoker Status')
box_ax.set_xlabel('Smoker')
box_ax.set_ylabel('Charges')
plt.show()

## Implement functionality to calculate confidence intervals for sample means

In [None]:
# Implement functionality to calculate confidence intervals for sample means

def calculate_confidence_interval(data, confidence=0.95):
    """
    Calculate a two-sided Student's t confidence interval for the sample mean.

    Parameters:
        data (array-like): The sample from which to calculate the interval.
            Must contain at least two observations so that the sample
            standard deviation (ddof=1) is defined.
        confidence (float): The level of confidence for the interval,
            strictly between 0 and 1. Default is 0.95.

    Returns:
        tuple: (lower bound, upper bound) of the confidence interval for the
            sample mean, as plain floats.

    Raises:
        ValueError: If `confidence` is not strictly between 0 and 1, or if
            `data` has fewer than two observations.
    """
    # Imported locally: the module-level import only brings ttest_ind and
    # f_oneway into scope, so the name `stats` is otherwise undefined here.
    from scipy import stats

    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    n = len(data)
    if n < 2:
        raise ValueError("data must contain at least two observations")

    mean = np.mean(data)
    std_err = np.std(data, ddof=1) / np.sqrt(n)  # ddof=1 for sample standard deviation
    # Two-sided interval: split the excluded probability equally between the
    # tails, hence the (1 + confidence) / 2 quantile with n - 1 degrees of freedom.
    margin_of_error = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return float(mean - margin_of_error), float(mean + margin_of_error)

# Worked example: interval for the mean of the charges column at the default 95% level
charges = dataframe.loc[:, 'charges']
confidence_interval = calculate_confidence_interval(data=charges)
print("\nConfidence Interval for Charges (95%):", confidence_interval)

## Perform inferential statistics for group comparisons

### T-test: Comparing charges between smokers and non-smokers

In [None]:
# Perform inferential statistics for group comparisons

# T-test: Comparing charges between smokers and non-smokers
smoker_charges = dataframe.loc[dataframe['smoker'] == 'yes', 'charges']
non_smoker_charges = dataframe.loc[dataframe['smoker'] == 'no', 'charges']

# equal_var=False selects Welch's t-test. The default pooled-variance test
# assumes both groups share one variance; Welch's version drops that
# assumption, which is the safer choice when the groups may differ in size
# and spread.
t_stat, p_value = ttest_ind(smoker_charges, non_smoker_charges, equal_var=False)
print(f'\nT-test: t-statistic = {t_stat}, p-value = {p_value}')

In [None]:
# Bar chart of the two group means compared by the t-test
group_labels = ['Smoker', 'Non-Smoker']
group_means = [np.mean(smoker_charges), np.mean(non_smoker_charges)]

_, means_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=group_labels, y=group_means, ax=means_ax)
means_ax.set_title('Average Charges: Smoker vs Non-Smoker')
means_ax.set_ylabel('Average Charges')
plt.show()

### ANOVA: Comparing charges across regions

In [16]:
# One-way ANOVA: does mean charge differ between the four regions?
# The regions are listed explicitly and passed to f_oneway in this order.
region_names = ('southwest', 'southeast', 'northwest', 'northeast')
charges_per_region = [
    dataframe['charges'][dataframe['region'] == region_name]
    for region_name in region_names
]
f_stat, p_value_anova = f_oneway(*charges_per_region)
print(f'ANOVA: F-statistic = {f_stat}, p-value = {p_value_anova}')

ANOVA: F-statistic = 2.96962669358912, p-value = 0.0308933560705201


In [None]:
# Box plot of charges per region, to accompany the ANOVA result
_, region_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='region', y='charges', data=dataframe, ax=region_ax)
region_ax.set_title('Charges by Region')
region_ax.set_ylabel('Charges')
region_ax.set_xlabel('Region')
plt.show()

### Linear regression with age, BMI, children, and smoking status predicting medical charges

In [20]:
# Regression Analysis
# Linear regression with age, BMI, children, and smoking status predicting medical charges

# Define predictors and target variable.
# The smoker column holds the strings 'yes'/'no', which OLS cannot use
# directly: the design matrix would end up with object dtype. Encode it as a
# 0/1 indicator (1 = smoker) so its coefficient reads as the expected
# difference in charges for smokers, holding the other predictors fixed.
X = dataframe[['age', 'bmi', 'children']].copy()
X['smoker'] = (dataframe['smoker'] == 'yes').astype(int)
X = X.astype(float)
y = dataframe['charges']

# Add constant to predictor variables (the intercept term)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print model summary
print("\nRegression Analysis Summary:")
print(model.summary())

In [None]:
# Diagnostic plots for the fitted regression model

# Actual charges (y-axis) against the model's in-sample predictions (x-axis)
_, fit_ax = plt.subplots(figsize=(8, 6))
fit_ax.scatter(model.predict(), y, alpha=0.5)
fit_ax.set_title('Actual Charges vs. Predicted Charges (Regression Model)')
fit_ax.set_xlabel('Predicted Charges')
fit_ax.set_ylabel('Actual Charges')
plt.show()

# Residuals against predictions, with a dashed red reference line at zero
residuals = model.resid
_, resid_ax = plt.subplots(figsize=(8, 6))
resid_ax.scatter(model.predict(), residuals, alpha=0.5)
resid_ax.set_title('Residuals vs. Predicted Charges (Regression Model)')
resid_ax.set_xlabel('Predicted Charges')
resid_ax.set_ylabel('Residuals')
resid_ax.axhline(y=0, color='r', linestyle='--')
plt.show()

# Code Explanation

In [None]:
# Print a plain-text recap of what each analysis step contributes.
# Each entry is emitted on its own line, framed by divider lines.
explanation_lines = (
    "\n---------------------------------------------",
    "Explanation:",
    "- Descriptive statistics, including mean, median, quartiles, standard deviation, and count, provide a summary of the distribution of each numerical variable in the dataset.",
    "- Mode represents the most frequent value in each column.",
    "- Visualizations such as histograms and box plots help visualize the distribution and identify potential outliers.",
    "- Confidence intervals for sample means provide an estimate of the range in which the population mean is likely to fall, given a certain level of confidence.",
    "- T-tests and ANOVA are used for group comparisons, assessing differences in charges based on different categorical variables.",
    "- Regression analysis allows us to understand the relationship between predictors (age, BMI, children, smoker status) and the target variable (charges), and to make predictions based on these relationships.",
    "---------------------------------------------",
)
print(*explanation_lines, sep="\n")