In [None]:
import pandas as pd

# Absolute location of the health-insurance CSV that the rest of the notebook works on
file_path = '/content/Health_insurance.csv'

# Parse the CSV into a DataFrame; later cells read and transform `data`
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading five rows as a quick sanity check of columns and values
print(data.head(5))


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [None]:
import plotly.express as px  # Importing Plotly Express for interactive visualizations

# Histogram with one bar per sex, colored by smoker status, to compare smoker counts
figure = px.histogram(data, x="sex", color="smoker", title="Number of Smokers")
figure.show()

# Encode the two binary categorical columns as 0/1 integers.
# Series.map() turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the columns already hold 0/1) would wipe
# both columns to NaN. The dtype guard skips columns that are already numeric,
# which keeps the cell safe to re-run without changing the first-run result.
binary_encodings = {
    "sex": {"female": 0, "male": 1},
    "smoker": {"no": 0, "yes": 1},
}
for column, encoding in binary_encodings.items():
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(encoding)

print(data.head())  # Preview the dataset with the encoded columns


   age  sex     bmi  children  smoker     region      charges
0   19    0  27.900         0       1  southwest  16884.92400
1   18    1  33.770         1       0  southeast   1725.55230
2   28    1  33.000         3       0  southeast   4449.46200
3   33    1  22.705         0       0  northwest  21984.47061
4   32    1  28.880         0       0  northwest   3866.85520


In [None]:
# The chart and the encoding both need the raw 'region' column. Once the column has
# been one-hot encoded it no longer exists, and re-running this cell would fail with
# a KeyError on data["region"]. The guard makes a re-run a harmless no-op instead.
if "region" in data.columns:
    # Pie chart of how many records fall into each region; the slice labels and
    # sizes are passed as arrays taken from value_counts()
    pie = data["region"].value_counts()
    regions = pie.index
    population = pie.values
    fig = px.pie(data, values=population, names=regions, title="Region Distribution")
    fig.show()

    # One-hot encode 'region'; drop_first=True omits the first category's indicator
    # column, so that category is represented by all remaining indicators being False
    data = pd.get_dummies(data, columns=["region"], drop_first=True)

print(data.head())  # Preview the dataset after one-hot encoding


   age  sex     bmi  children  smoker      charges  region_northwest  \
0   19    0  27.900         0       1  16884.92400             False   
1   18    1  33.770         1       0   1725.55230             False   
2   28    1  33.000         3       0   4449.46200             False   
3   33    1  22.705         0       0  21984.47061              True   
4   32    1  28.880         0       0   3866.85520              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


In [None]:
from sklearn.ensemble import RandomForestRegressor  # Model used to predict the charges
from sklearn.model_selection import train_test_split  # Helper for the train/test split

# Features (X) are every column except the target; the target (y) is 'charges'
X = data.drop(columns=["charges"])
y = data["charges"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# A random forest is itself randomized (each tree is fit on a random bootstrap
# sample), so without a seed the predictions change on every run even though the
# split is fixed. Seeding the model makes the whole cell reproducible.
forest = RandomForestRegressor(random_state=42)
forest.fit(xtrain, ytrain)  # Train on the training portion only

# Predict the insurance charges for the held-out test rows
ypred = forest.predict(xtest)

# Wrap the predictions in a DataFrame for display. The frame gets a fresh 0..n-1
# index, so its rows follow the order of xtest but do not carry xtest's index labels.
predicted_data = pd.DataFrame(data={"Predicted Premium Amount": ypred})
print(predicted_data.head())  # Show the first few predicted premium amounts

   Predicted Premium Amount
0               9499.403498
1               5639.209358
2              28599.672176
3              11175.035375
4              34842.986178
