In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print("TF Version: " + tf.__version__)

TF Version: 2.16.1


In [2]:
#Read data from file

raw_dataset = pd.read_csv("insurance_dataset.csv", sep=',',decimal='.')


In [3]:
dataset = raw_dataset.copy()
dataset.tail()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
999995,59,male,46.67,2,no,northeast,High blood pressure,,Frequently,Student,Basic,11584.1349
999996,33,male,36.83,2,no,northeast,,High blood pressure,Frequently,Unemployed,Basic,9834.871456
999997,39,male,39.84,0,yes,northeast,Heart disease,High blood pressure,Rarely,Blue collar,Standard,22076.63286
999998,37,female,45.06,4,yes,northeast,High blood pressure,Diabetes,Occasionally,Unemployed,Premium,20297.61873
999999,41,male,30.02,2,yes,northeast,Diabetes,Heart disease,Occasionally,Blue collar,Standard,23429.72503


In [4]:
from sklearn.preprocessing import LabelEncoder



# Define ordinal mappings for ordinal variables
ordinal_mappings = {
    "occupation": {"Unemployed": 0, "Student": 1, "Blue collar": 2, "White collar": 3},
    "coverage_level": {"Basic": 0, "Standard": 1, "Premium": 2}
}

# Apply ordinal encoding to ordinal variables
dataset.replace(ordinal_mappings, inplace=True)



# For non-ordinal variables, you can use regular label encoding
label_encoder = LabelEncoder()
non_ordinal_columns = ['gender', 'smoker', 'medical_history', 'family_medical_history','exercise_frequency']
for column in non_ordinal_columns:
    dataset[column] = label_encoder.fit_transform(dataset[column])


dataset.tail()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
999995,59,1,46.67,2,0,northeast,2,3,0,1,0,11584.1349
999996,33,1,36.83,2,0,northeast,3,2,0,0,0,9834.871456
999997,39,1,39.84,0,1,northeast,1,2,3,2,1,22076.63286
999998,37,0,45.06,4,1,northeast,2,0,2,0,2,20297.61873
999999,41,1,30.02,2,1,northeast,0,1,2,2,1,23429.72503



Lower the amount of data in the dataset

In [16]:
dataset = dataset.sample(frac=0.025, random_state=42)
dataset.tail()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
491103,58,0,19.77,0,0,northwest,1,1,1,1,1,16748.11039
74472,62,1,37.65,4,0,northwest,2,2,0,0,0,11340.75798
159813,44,1,34.65,1,1,southeast,2,0,2,2,2,21550.98873
467441,57,0,25.74,1,1,southwest,2,3,1,3,2,17682.42709
387605,45,0,28.16,1,0,northwest,0,2,1,0,2,13105.12302


In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error







# Define features (X) and target variables (y_charges, y_coverage)
#Define features and target variables
X = dataset[['age', 'gender', 'bmi', 'children', 'smoker', 'medical_history', 
             'family_medical_history', 'exercise_frequency', 'occupation']]
y_charges = dataset['charges']




In [18]:
# Split the data into training and testing sets
#X_train, X_test, y_charges_train, y_charges_test, y_coverage_train, y_coverage_test = train_test_split(X, y_charges, y_coverage, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y_charges, test_size=0.2, random_state=42)


In [9]:
# Define parameter grid for hyperparameter tuning
##param_grid = {
#    'n_estimators': [100, 200, 300],  # Number of trees in the forest
#    'max_depth': [None, 10, 20],  # Maximum depth of the tree
#    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
#    'max_features': ['auto', 'sqrt']  # Number of features to consider when looking for the best split
#}

param_grid_rf = {
    'estimator__estimator__max_depth': [10, 20, 30],
    'estimator__estimator__n_estimators': [100, 200, 300]
}


# Instantiate Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Wrap the Random Forest regressor with MultiOutputRegressor
#multioutput_regressor = MultiOutputRegressor(rf_regressor)

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_charges, y_train_charges)

# Get the best model from grid search
best_rf_model = grid_search.best_estimator_



ValueError: Invalid parameter 'estimator' for estimator RandomForestRegressor(random_state=42). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

In [None]:
# Make predictions on the testing data
y_charges_pred, y_coverage_pred = best_rf_model.predict(X_test)

# Evaluate the model
mse_charges = mean_squared_error(y_charges_test, y_charges_pred)
mse_coverage = mean_squared_error(y_coverage_test, y_coverage_pred)
print("Mean Squared Error (Charges):", mse_charges)
print("Mean Squared Error (Coverage Level):", mse_coverage)

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit scaler on training data and transform both training and testing data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
from sklearn.svm import SVR

# 2. Instantiate the SVM Regressor
svm_regressor = SVR()

# 3. Hyperparameter Tuning
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'epsilon': [0.1, 0.2, 0.5]
}
grid_search = GridSearchCV(estimator=svm_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# 4. Train the Model
grid_search.fit(X_train, y_train)

# 5. Evaluate the Model
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# 6. Make Predictions (if needed)
# y_pred_new = best_svm_model.predict(X_new_data)

Mean Squared Error: 9419857.296335671
