## Import Libraries and Load Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os

In [2]:
# Show the contents of the working directory so the input CSV can be located
print(os.listdir(os.curdir))

['.ipynb_checkpoints', 'DataCleaning.ipynb', 'insurance.csv']


In [3]:
# Read the insurance records from insurance.csv into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="insurance.csv")

In [4]:
# Display the loaded DataFrame to inspect its columns and values
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Convert to Numeric Values

In [5]:
# Integer codes that replace the text categories in the region, smoker and sex columns
regionConvert = dict(northwest=0, northeast=1, southwest=2, southeast=3)
smokerConvert = dict(yes=1, no=0)
sexConvert = dict(male=1, female=0)

In [6]:
# Converts categorical values to numeric values for region, smoker, sex
def encodeCategory(values, codes):
    """Return `values` with each category label replaced by its numeric code.

    Parameters
    ----------
    values : pd.Series
        Column holding the category labels (the keys of `codes`).
    codes : dict
        Mapping from category label to numeric code.

    Returns
    -------
    pd.Series
        The encoded column. Missing entries stay missing. If every non-missing
        entry is already one of the codes, `values` is returned unchanged.

    Raises
    ------
    ValueError
        If a non-missing entry is neither a known label nor a known code.
    """
    present = values.dropna()
    # Series.map turns anything that is not a key of `codes` into NaN, so
    # re-running this cell on an already-encoded column would blank it out.
    # Detect that case and leave the column as it is.
    if present.isin(list(codes.values())).all():
        return values
    # Fail loudly on unexpected labels instead of silently producing NaN.
    unknown = present[~present.isin(list(codes))].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Column '{values.name}' has values with no numeric code: {list(unknown)}"
        )
    return values.map(codes)


df['region'] = encodeCategory(df['region'], regionConvert)
df['smoker'] = encodeCategory(df['smoker'], smokerConvert)
df['sex'] = encodeCategory(df['sex'], sexConvert)

## Identify and Remove Duplicates

In [7]:
# Collect every row that has an exact copy elsewhere in the frame;
# keep=False flags all occurrences, including the first one.
dupInfo = df.loc[df.duplicated(keep=False)]

In [8]:
# Display the rows flagged as duplicates (every occurrence, since keep=False was used)
dupInfo

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
195,19,1,30.59,0,0,0,1639.5631
581,19,1,30.59,0,0,0,1639.5631


In [9]:
# Keep only the first occurrence of each fully identical row
df = df[~df.duplicated()]

## Descriptive Statistics

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# measurement columns 'age', 'bmi', 'children' and 'charges'.
# The columns are selected before describing, rather than describing everything
# with include='all' and dropping sex/smoker/region afterwards: that form adds
# unique/top/freq rows to the table whenever a column is still non-numeric.
descStats = df[['age', 'bmi', 'children', 'charges']].describe()

In [11]:
# Display the summary statistics table
descStats

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [12]:
# Mean, smallest and largest value of the charges column
chargesMean, chargesMin, chargesMax = (
    df['charges'].mean(),
    df['charges'].min(),
    df['charges'].max(),
)

In [13]:
# Display the mean of the charges column
chargesMean

13279.121486655948

In [14]:
# Display the minimum of the charges column
chargesMin

1121.8739

In [15]:
# Display the maximum of the charges column
chargesMax

63770.42801

## Output Results to File

In [16]:
# Assemble the plain-text report: duplicate rows, summary table, charges figures.
# dupInfo was built with keep=False, so it lists every occurrence of a
# duplicated row, while drop_duplicates() keeps the first occurrence. The
# heading therefore says "found" rather than "removed".
OutputFile = (
    "Duplicate Rows Found (all occurrences; the first of each was kept):\n" +
    dupInfo.to_string(index=False) +
    "\n\nDescriptive Statistics Overview:\n" + descStats.to_string() +
    "\n\nAdditional Statistics for Charges:\n" +
    f"Mean Charges: {chargesMean}\n" +
    f"Minimum Charges: {chargesMin}\n" +
    f"Maximum Charges: {chargesMax}\n"
)

In [17]:
# Write the results to a text file.
# The encoding is given explicitly so the file does not depend on the
# platform's default text encoding.
with open('descStats.txt', 'w', encoding='utf-8') as file:
    file.write(OutputFile)

In [18]:
# Tell the reader where the report was written
print("Analysis results have been written to 'descStats.txt'.")

Analysis results have been written to 'descStats.txt'.


In [19]:
# Export the de-duplicated, numerically encoded frame to cleanedInsurance.csv;
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='cleanedInsurance.csv', index=False)
print("Cleaned data has been saved to 'cleanedInsurance.csv'.")

Cleaned data has been saved to 'cleanedInsurance.csv'.
