In [None]:
# Clone the course repository; the CSV loaded later in this notebook is read
# from a path inside the resulting `credit_scoring_7904_Q4_2024/` directory.
!git clone https://github.com/cesarlegendre/credit_scoring_7904_Q4_2024

# Health insurance cost

### Introduction

This notebook aims to predict whether an individual's medical costs billed by health insurance are high (an "expensive client"), based on several factors, using LogisticRegression, KNeighborsClassifier, RandomForestClassifier, SVC (Support Vector Classifier), and GradientBoostingClassifier. We will clean the data, perform feature engineering, split the data for cross-validation, tune hyperparameters, and select among the models.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Location of the insurance CSV inside the cloned repository, relative to the
# notebook's working directory.
file = '/'.join((
    'credit_scoring_7904_Q4_2024',
    'data_sets',
    'health_cost',
    'insurance.csv',
))


# Loading the Data
We load the dataset which contains information about age, sex, BMI, smoking habits, number of children, region, and medical charges.

In [None]:
# Load the insurance dataset from the path configured in `file`.
data = pd.read_csv(file)

# Preview a few random rows. The fixed seed keeps the displayed sample
# identical every time the notebook is re-run from a fresh kernel.
data.sample(4, random_state=42)


## Step 3: Understanding the Data

## Column Descriptions:

* **age**: Age of the primary beneficiary.
* **sex**: Gender of the beneficiary.
* **bmi**: Body Mass Index, a measure of body fat based on height and weight.
* **children**: Number of dependents.
* **smoker**: Whether the beneficiary is a smoker.
* **region**: Region in the US where the beneficiary resides.
* **charges**: Medical costs billed by the health insurance (target variable).

We inspect the structure of the dataset and calculate basic statistics.

# This problem is transformed into a classification one (target engineering)

In [None]:
# prompt: plot the distribution of the charges

# Histogram (30 bins) of the raw `charges` column with a KDE overlay, drawn
# through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['charges'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Medical Charges')
ax.set_xlabel('Medical Charges')
ax.set_ylabel('Frequency')
plt.show()


In [None]:

# Charges above this amount label a client as "expensive". Named so the cut-off
# is visible and adjustable in one place instead of being an inline magic number.
EXPENSIVE_CHARGE_THRESHOLD = 10000

# Target engineering: replace the continuous `charges` column with a boolean
# `Expensive Client` flag, which turns the task into binary classification.
# NOTE: this cell consumes `charges`, so re-running it without reloading `data`
# raises a KeyError.
data['Expensive Client'] = data['charges'] > EXPENSIVE_CHARGE_THRESHOLD
data = data.drop('charges', axis=1)


In [None]:
# Show only the first rows to confirm `charges` was replaced by
# `Expensive Client`, instead of emitting the whole frame as cell output.
data.head()

# Feature Engineering

## Encoding the 'sex' Variable
The sex variable has two categories: 'female' and 'male'. We'll map these to numerical values.

Checking Unique Values

In [None]:
# List the distinct labels present in `sex` before choosing an encoding.
pd.unique(data['sex'])


In [None]:
# Binary-encode `sex` as the indicator column `sex_male`
# (female -> 0, male -> 1) and remove the original text column.
sex_codes = {'female': 0, 'male': 1}
data = data.assign(sex_male=data['sex'].map(sex_codes)).drop(columns='sex')


## Encoding the 'smoker' Variable
The smoker variable has two categories: 'yes' and 'no'.

Checking Unique Values

In [None]:
# List the distinct labels present in `smoker` before choosing an encoding.
pd.unique(data['smoker'])


In [None]:
# Binary-encode `smoker` as the indicator column `smoker_yes`
# (no -> 0, yes -> 1) and remove the original text column.
smoker_codes = {'no': 0, 'yes': 1}
data = data.assign(smoker_yes=data['smoker'].map(smoker_codes)).drop(columns='smoker')



## Encoding the 'region' Variable
The region variable has four categories:

* '**southwest**'
* '**southeast**'
* '**northwest**'
* '**northeast**'

We need to create dummy variables for these, dropping one category to avoid the dummy variable trap.

Checking Unique Values

In [None]:
# List the distinct labels present in `region` before building dummy columns.
pd.unique(data['region'])


In [None]:
# One-hot encode `region`. 'northeast' is left out as the reference category,
# so a row with all three indicators equal to 0 stands for that region.
regions = ['southwest', 'southeast', 'northwest']  # Excluding 'northeast'

for region in regions:
    # Vectorized comparison instead of a per-row `.apply(lambda ...)`: it yields
    # the same 0/1 values without calling a Python function for every row.
    data['region_' + region] = data['region'].eq(region).astype(int)

# The text column is no longer needed once the indicators exist.
data = data.drop('region', axis=1)


## Converting 'Expensive Client' to Numerical
If the Expensive Client column is of boolean type, we'll convert it to integers.

In [None]:
# Inspect the storage type of the target column before converting it.
data.dtypes['Expensive Client']

In [None]:
# Cast the target column to integers; every other column keeps its dtype.
data = data.astype({'Expensive Client': int})


In [None]:
# Show only the first rows to confirm the encoded columns, instead of emitting
# the whole frame as cell output.
data.head()

# Checking distribution of the numerical values

As the values are in plausible ranges, no normalization is applied to the dataset itself.

In [None]:
# prompt: # Plotting histograms for bmi, age, and children and expensive client

# (column, panel title) pairs, drawn left to right in a single row.
panels = [
    ('bmi', 'Distribution of BMI'),
    ('age', 'Distribution of Age'),
    ('children', 'Distribution of Children'),
    ('Expensive Client', 'Distribution of Expensive Client'),
]

fig, axes = plt.subplots(1, 4, figsize=(15, 5))
for ax, (column, title) in zip(axes, panels):
    # Same settings for every panel: 30 bins plus a KDE overlay.
    sns.histplot(data[column], bins=30, kde=True, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()


In [None]:
# Plain matplotlib histograms (20 bins each) for bmi, age, and children.
# Each entry is (column in `data`, label used for the title and x-axis).
hist_columns = [('bmi', 'BMI'), ('age', 'Age'), ('children', 'Children')]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, label) in zip(axes, hist_columns):
    ax.hist(data[column], bins=20)
    ax.set_title(f'Distribution of {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


# Hyperparameter tuning

We'll train several classifiers to predict whether a client is expensive based on their features. The classifiers we'll use are:

* Logistic Regression
* K-Nearest Neighbors Classifier
* Random Forest Classifier
* Support Vector Classifier (SVC)
* Gradient Boosting Classifier

We'll use K-fold cross-validation and Grid Search to find the best hyperparameters for each model. Let's proceed step by step with explanations.

In [None]:
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Model selection
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


## Preparing the Data
We'll assume the data has been preprocessed as per the previous steps, and we have a DataFrame data with the following columns:

* age
* bmi
* children
* Expensive Client (target variable)
* sex_male
* smoker_yes
* region_southwest
* region_southeast
* region_northwest

### Splitting Features and Target Variable

In [None]:
# Separate the label from the predictors: `y` is the binary target column and
# `X` holds every remaining column of `data`.
target_column = 'Expensive Client'
y = data[target_column]
X = data.drop(columns=[target_column])


### Splitting Data into Training and Test Sets
We'll split the data into training and test sets to evaluate the final model performance after hyperparameter tuning.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. `stratify=y` keeps the class
# proportions of the target the same in the training and test partitions, and
# the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


### Defining K-Fold Cross-Validation Strategy
We'll use Stratified K-Fold cross-validation to maintain class distribution during cross-validation.

In [None]:
from sklearn.model_selection import StratifiedKFold

# Cross-validation splitter passed as `cv` to the hyperparameter searches:
# 5 shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


## 1. Logistic Regression
### Hyperparameter Grid
For Logistic Regression, we'll tune the following hyperparameters:

* C: Inverse of regularization strength
* solver: Algorithm to use in the optimization problem

In [None]:
# Search space: five regularization strengths crossed with two solvers.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
)

# Base estimator, created with max_iter raised to 1000.
lr_estimator = LogisticRegression(max_iter=1000)

# Exhaustive grid search scored by accuracy on the shared stratified folds,
# using all available cores.
grid_search_lr = GridSearchCV(
    lr_estimator,
    param_grid_lr,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
)

grid_search_lr.fit(X_train, y_train)


In [None]:
# Report the winning parameter combination found by the grid search.
print("Best Hyperparameters for Logistic Regression:", grid_search_lr.best_params_, sep="\n")

### Evaluation

In [None]:
# Score the tuned logistic regression on the held-out test set.
best_lr = grid_search_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)

report_lr = classification_report(y_test, y_pred_lr)
print("\nClassification Report for Logistic Regression:", report_lr, sep="\n")


## 2. K-Nearest Neighbors Classifier

###  Hyperparameter Grid
We'll tune the following hyperparameters:

* n_neighbors: Number of neighbors to use
* weights: Weight function used in prediction
* metric: Distance metric

In [None]:
# Imported in this cell so it stays runnable on its own.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# KNN classifies by distance between rows, so features measured on larger
# numeric scales dominate the 0/1 indicator columns unless everything is
# standardized first. The scaler lives inside the pipeline, so during
# cross-validation it is fitted on each training fold only and the raw `data`
# frame stays untouched.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Grid keys carry the `knn__` prefix because the classifier is now the pipeline
# step named 'knn'.
param_grid_knn = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}


# Exhaustive grid search scored by accuracy on the shared stratified folds.
grid_search_knn = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid_knn,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_knn.fit(X_train, y_train)

print("Best Hyperparameters for KNN:")
print(grid_search_knn.best_params_)


### Evaluating on Test Set


In [None]:
# Score the tuned KNN model on the held-out test set.
best_knn = grid_search_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)

report_knn = classification_report(y_test, y_pred_knn)
print("\nClassification Report for KNN:", report_knn, sep="\n")


## 3. Random Forest Classifier

### Hyperparameter Grid
We'll tune the following hyperparameters:

* n_estimators: Number of trees
* max_depth: Maximum depth of the tree
* min_samples_split: Minimum number of samples required to split
* min_samples_leaf: Minimum number of samples required at a leaf node
* max_features: Number of features to consider when looking for the best split

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyperparameter.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Seeded base estimator so each candidate forest is built reproducibly.
rf_estimator = RandomForestClassifier(random_state=42)

# Randomized search: 50 sampled parameter settings, scored by accuracy on the
# shared stratified folds. The search itself is seeded as well.
random_search_rf = RandomizedSearchCV(
    rf_estimator,
    param_grid_rf,
    n_iter=50,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    random_state=42,
)

random_search_rf.fit(X_train, y_train)


In [None]:
# Report the winning parameter combination found by the randomized search.
print("Best Hyperparameters for Random Forest:", random_search_rf.best_params_, sep="\n")


### Evaluating on Test Set


In [None]:
# Score the tuned random forest on the held-out test set.
best_rf = random_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

report_rf = classification_report(y_test, y_pred_rf)
print("\nClassification Report for Random Forest:", report_rf, sep="\n")


##  4. Support Vector Classifier (SVC)

### Hyperparameter Grid
We'll tune the following hyperparameters:

* C: Regularization parameter
* kernel: Specifies the kernel type



In [None]:
# Search space for the SVC: three regularization strengths crossed with two
# kernels.
param_grid_svc = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'sigmoid'],
)


In [None]:
# Imported in this cell so it stays runnable on its own.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The RBF and sigmoid kernels are computed from distances / dot products
# between rows, so features on larger numeric scales dominate unless the
# inputs are standardized. The scaler lives inside the pipeline, so during
# cross-validation it is fitted on each training fold only.
svc_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

grid_search_svc = GridSearchCV(
    estimator=svc_pipeline,
    # Re-key the existing grid ('C', 'kernel') with the `svc__` prefix so the
    # values are routed to the pipeline step named 'svc'.
    param_grid={f'svc__{name}': values for name, values in param_grid_svc.items()},
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1
)


grid_search_svc.fit(X_train, y_train)


In [None]:
# Report the winning parameter combination found by the grid search.
print("Best Hyperparameters for SVC:", grid_search_svc.best_params_, sep="\n")


### Evaluating on Test Set


In [None]:
# Score the tuned SVC on the held-out test set.
best_svc = grid_search_svc.best_estimator_
y_pred_svc = best_svc.predict(X_test)

report_svc = classification_report(y_test, y_pred_svc)
print("\nClassification Report for SVC:", report_svc, sep="\n")


## 5. Gradient Boosting Classifier
### Hyperparameter Grid
We'll tune the following hyperparameters:

* n_estimators: Number of boosting stages
* learning_rate: Learning rate shrinks the contribution of each tree
* max_depth: Maximum depth of the individual regression estimators
* min_samples_split: Minimum number of samples required to split
* min_samples_leaf: Minimum number of samples required at a leaf node

In [None]:
# Candidate values for each gradient-boosting hyperparameter.
param_grid_gb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Seeded base estimator so each candidate model is built reproducibly.
gb_estimator = GradientBoostingClassifier(random_state=42)

# Randomized search: 50 sampled parameter settings, scored by accuracy on the
# shared stratified folds. The search itself is seeded as well.
# `RandomizedSearchCV` must already be imported by an earlier cell.
random_search_gb = RandomizedSearchCV(
    gb_estimator,
    param_grid_gb,
    n_iter=50,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
    random_state=42,
)

random_search_gb.fit(X_train, y_train)


In [None]:
# Report the winning parameter combination found by the randomized search.
print("Best Hyperparameters for Gradient Boosting:", random_search_gb.best_params_, sep="\n")


### Evaluating on Test Set

In [None]:
# Score the tuned gradient boosting model on the held-out test set.
best_gb = random_search_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)

report_gb = classification_report(y_test, y_pred_gb)
print("\nClassification Report for Gradient Boosting:", report_gb, sep="\n")


# Comparing Model Performances
Let's compile the accuracy scores of all models for comparison.

In [None]:
# Test-set predictions of each tuned model, keyed by its display name.
predictions = {
    'Logistic Regression': y_pred_lr,
    'K-Nearest Neighbors': y_pred_knn,
    'Random Forest': y_pred_rf,
    'Support Vector Classifier': y_pred_svc,
    'Gradient Boosting': y_pred_gb,
}

# Accuracy of every model against the same held-out labels.
models = {
    name: accuracy_score(y_test, y_pred)
    for name, y_pred in predictions.items()
}

# One line per model, accuracy rounded to four decimals.
print("\nAccuracy Scores:")
print("\n".join(
    f"{model_name}: {accuracy:.4f}" for model_name, accuracy in models.items()
))
