In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print("TF Version: " + tf.__version__)

TF Version: 2.16.1


### Data Preperation
Here we are selecting a small percentage of the original dataset, this is to increase the speed at which the models train.
This is done using the line dataset = dataset.sample

We then drop the region column from the dataset as we will not be using it in the training of the models
e:

In [None]:
#Read data from file

raw_dataset = pd.read_csv("insurance_dataset.csv", sep=',',decimal='.')


In [None]:
dataset = raw_dataset.copy()
dataset = dataset.sample(frac=0.005, random_state=42)#0.5% of 1 million = 5,000, likely need to increase
dataset.drop(columns=['region'], inplace=True) #Drop the region as we will not be using it
print("Number of rows in the dataset after sampling:", dataset.shape[0])
dataset.reset_index(drop=True, inplace=True)
dataset.tail(9)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Create label encoder
label_encoder = LabelEncoder()
# Define ordinal mappings for ordinal variables
ordinal_mappings = {
    "gender": {"male": 0, "female": 1},
    "smoker": {"no": 0, "yes": 1},
    "medical_history": {np.NAN: 0, "Heart disease": 1, "High blood pressure": 2, "Diabetes": 3},
    "family_medical_history": {np.NAN: 0, "Heart disease": 1, "High blood pressure": 2, "Diabetes": 3},
    "exercise_frequency": {"Never": 0, "Rarely": 1, "Occasionally": 2, "Frequently": 3},
    "occupation": {"Unemployed": 0, "Student": 1, "Blue collar": 2, "White collar": 3},
    "coverage_level": {"Basic": 0, "Standard": 1, "Premium": 2}
}
def encode_values(new_dataset):
    # Apply ordinal encoding to ordinal variables
    new_dataset.replace(ordinal_mappings, inplace=True)
    return new_dataset

# Apply the function to your dataset
dataset = encode_values(dataset)


dataset.tail(9)

In [None]:
'''from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Assuming 'coverage_level' is a column in your dataset
# Convert 'coverage_level' to categorical data type (optional but recommended)
dataset['coverage_level'] = dataset['coverage_level'].astype('category')

# Drop the 'coverage_level' column temporarily
coverage_level = dataset.pop('coverage_level')

# Initialize SimpleImputer to handle missing values
imputer = SimpleImputer(strategy='most_frequent')
dataset_imputed = pd.DataFrame(imputer.fit_transform(dataset), columns=dataset.columns)

# Initialize OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the coverage_level column
coverage_encoded = encoder.fit_transform(coverage_level.values.reshape(-1, 1))

# Convert the encoded sparse matrix to a dense array and create a DataFrame
coverage_encoded_df = pd.DataFrame(coverage_encoded.toarray(), columns=encoder.get_feature_names_out(['coverage_level']))

# Concatenate the imputed dataset with the one-hot encoded coverage level
dataset_encoded = pd.concat([dataset_imputed, coverage_encoded_df], axis=1)

dataset = dataset_encoded

# Print the head of the encoded dataset
dataset.tail()'''

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Split the encoded dataset into features (X) and target variables (y_charges, y_coverage)
X = dataset.drop(columns=['charges', 'coverage_level'])  # Features
y_charges = dataset['charges']  # Target variable for charges
y_coverage = dataset['coverage_level']  # Target variable for coverage

# Split the data into training and testing sets for charges and coverage separately
X_train, X_test, y_charges_train, y_charges_test = train_test_split(
    X, y_charges, test_size=0.2, random_state=42
)
X_train, X_test, y_coverage_train, y_coverage_test = train_test_split(
    X, y_coverage, test_size=0.2, random_state=42
)

# Concatenate the target variables
y_train = pd.concat([y_charges_train, y_coverage_train], axis=1)
y_test = pd.concat([y_charges_test, y_coverage_test], axis=1)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit scaler on training data and transform both training and testing data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Multi Output Random Forest Regressor

This random forest regressor is wrapped with a multi output regressor to make it so that we can predict both 

In [None]:
# Define parameter grid for hyperparameter tuning
##param_grid = {
#    'n_estimators': [100, 200, 300],  # Number of trees in the forest
#    'max_depth': [None, 10, 20],  # Maximum depth of the tree
#    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
#    'max_features': ['auto', 'sqrt']  # Number of features to consider when looking for the best split
#}

param_grid_rf = {
    'estimator__max_depth': [10, 20, 30],
    'estimator__n_estimators': [100, 200, 300]
}


# Instantiate Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Wrap the Random Forest regressor with MultiOutputRegressor
multioutput_regressor = MultiOutputRegressor(rf_regressor)

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=multioutput_regressor, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_rf_model = grid_search.best_estimator_

# Make predictions on the testing data
y_pred_rf = best_rf_model.predict(X_test)

# Split the predictions into separate arrays for charges and coverage
y_charges_pred_rf = y_pred_rf[:, 0]  # Assuming charges is the first output variable
y_coverage_pred_rf = y_pred_rf[:, 1]  # Assuming coverage is the second output variable

# Evaluate the model
mse_charges_rf = mean_squared_error(y_charges_test, y_charges_pred)
mse_coverage_rf = mean_squared_error(y_coverage_test, y_coverage_pred)
print("Mean Squared Error (Charges):", mse_charges_rf)
print("Mean Squared Error (Coverage Level):", mse_coverage_rf)

### Multi output SVM Regressor

Here we use the SVR Model from sklearn to create a SVM regression model that will predict the charges and the coverage type


In [None]:
from sklearn.svm import SVR

# 1. Instantiate the SVM Regressor
svm_regressor = SVR()

# 2. Hyperparameter Tuning
param_grid = {
    'estimator__kernel': ['linear', 'rbf'],
    'estimator__C': [0.1, 1, 10],
    'estimator__epsilon': [0.1, 0.2, 0.5]
}
##Wrap the svm regressor in a multi output regressor
grid_search = GridSearchCV(estimator=MultiOutputRegressor(svm_regressor), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# 3. Train the Model
grid_search.fit(X_train, y_train)

# 4. Evaluate the Model
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test)
y_charges_pred = y_pred[:, 0]  #Charges is the first output variable
y_coverage_pred = y_pred[:, 1]  #Coverage is the second output variable


mse_charges = mean_squared_error(y_charges_test, y_charges_pred)
mse_coverage = mean_squared_error(y_coverage_test, y_coverage_pred)
print("Mean Squared Error (Charges):", mse_charges)
print("Mean Squared Error (Coverage Level):", mse_coverage)



### Multi-output Neural Network 

Here we use the NN Model from tensorflow to create a NN regression model that will predict the charges and the coverage type

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error

# Define the neural network model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(9,)),
    Dense(32, activation='relu'),
    Dense(2)  # Two outputs for charges and coverage
])

# Compile the model
nn_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
trained_data = nn_model.fit(X_train, y_train, epochs=4, batch_size=8, validation_split=0.2)

# Make predictions on the test data
y_pred = nn_model.predict(X_test)
# Split the predictions into charges and coverage level
y_charges_pred = y_pred[:, 0]  # Charges is the first output variable
y_coverage_pred = y_pred[:, 1]  # Coverage is the second output variable

# Calculate Mean Squared Error for each output separately
mse_charges = mean_squared_error(y_charges_test, y_charges_pred)
mse_coverage = mean_squared_error(y_coverage_test, y_coverage_pred)

print("Mean Squared Error (Charges):", mse_charges)
print("Mean Squared Error (Coverage Level):", mse_coverage)
#y_pred_df = pd.DataFrame(y_pred, columns=['Predicted Charges', 'Predicted Coverage Level'])
#y_test_df = pd.DataFrame(y_test, columns=['Actual Charges', 'Actual Coverage Level'])
#combined_df = pd.concat([y_test_df, y_pred_df], axis=1)
                          
#combined_df.tail(9)
#y_charges_test.tail()

### Predict data from test user

Using sample data the model then predicts an output

In [None]:
# Sample input data
input_data = {
    'age': [35],
    'gender': ['female'],
    'bmi': [25.5],
    'children': [2],
    'smoker': ['no'],
    'medical_history': [np.NAN],
    'family_medical_history': [np.NAN],
    'exercise_frequency': ['Occasionally'],
    'occupation': ['White collar']
}

# Create a DataFrame
input_df = pd.DataFrame(input_data)

input_df.tail(9)

# Convert categorical variables to numerical using label encoding
input_df_encoded = encode_values(input_df)

input_df_encoded.tail(9)

#### Random Forest Prediction

In [None]:
# 3. Make prediction for the input data
prediction = best_rf_model.predict(input_df_encoded)

# 4. Output the prediction
print("Predicted Charges:", prediction[0][0])  # Assuming charges is the first output variable
print("Predicted Coverage Level:", prediction[0][1])  # Assuming coverage is the second output variable

#### SVM Prediction

In [None]:
predictions = best_svm_model.predict(input_df)
# Display the predictions
print("Predicted Charges:", predictions[:, 0])
print("Predicted Coverage Level:", predictions[:, 1])

#### Neural Network Prediction

In [None]:
# Make predictions using the trained model
predictions = nn_model.predict(input_df_encoded)

# Display the predictions
print("Predicted Charges:", predictions[0][0])
print("Predicted Coverage Level:", predictions[0][1])


As you can see the random forest model has the most accurate predictions of all the models, this is because the SVM model predicted a coverage of 50,000, which is much higher than the largest charge in the dataset at 32561.56037 , and the Neural Network model predicted a coverage level of 14, which is not possible as the only option are Basic(0), Standard(1) and Premium(2).



### Cross Validation

In [None]:
from sklearn.model_selection import cross_validate

# Define cross-validation strategy (e.g., 5-fold cross-validation)
cv_results = cross_validate(grid_search, X_train, y_train, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# Extract train and test scores from cross-validation results
train_scores = -cv_results['train_score']  # Use negative scores for MSE
test_scores = -cv_results['test_score']

# Plot boxplot of training and testing MSE
plt.figure(figsize=(10, 6))
plt.boxplot([train_scores, test_scores], labels=['Training MSE', 'Testing MSE'])
plt.title('Training and Testing Mean Squared Error (MSE) Across Folds')
plt.ylabel('Mean Squared Error (MSE)')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(y_charges_test, y_charges_pred_rf, color='blue', label='Predicted Cost')
plt.plot([y_charges_test.min(), y_charges_test.max()], [y_charges_test.min(), y_charges_test.max()], 'k--', lw=2)
plt.xlabel('Real Cost of Insurance')
plt.ylabel('Predicted Cost of Insurance')
plt.title('Real vs Predicted Cost of Insurance')
plt.legend()
plt.grid(True)
plt.show()

### Part 2 - Optimisation

In [1]:
class Surgeon:
    def __init__(self, surgery_type, name, surgeries_per_day, anesthesiologist_required):
        self.surgery_type = surgery_type
        self.name = name
        self.surgeries_per_day = surgeries_per_day
        self.anesthesiologist_required = anesthesiologist_required

    def __str__(self):
        return f"Surgeon: {self.name}, Surgery Type: {self.surgery_type}, " \
               f"Surgeries per Day: {self.surgeries_per_day}, " \
               f"Anesthesiologist Required: {self.anesthesiologist_required}"
    
        

In [2]:
# Open the text file for reading
surgeons = [0 for _ in range(5)]
with open("Surgery.txt", "r") as file:
    i = 0
    
    for line in file:
        # Split the line into its components based on '|'
        components = line.strip().split('|')
        new_surgeon = Surgeon(components[0].strip(), components[1].strip(), int(components[2].strip()), components[3].strip() == "Yes")
        # Extract the components
        surgeons[i] = new_surgeon
        i+=1
        

print(surgeons[0].surgery_type)
print(surgeons[0].name)
print(surgeons[0].surgeries_per_day)
print(surgeons[0].anesthesiologist_required)

Cholecystectomy
Meredith Gery
4
True


In [5]:
timetable = [[0 for _ in range(9)] for _ in range(3)]

def timetable_dataframe():
    return pd.DataFrame(timetable)

timetable_df = timetable_dataframe()
timetable_df.tail()
    

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0


In [15]:
import random

#30 Attempts to just place a random surgery in a random timeslot, 
# do not worry about conflicts as this will be improved upon by the hillclimb algorithm
def generate_random_timetable():
    for j in range(3):
        for k in range(9):
            timetable[j][k] = "Empty"
    
    for i in range (20):
        random.seed()
        x = random.randint(0, 2)#Get a random slot in the timetable
        y = random.randint(0, 8)
        random_surgeon = random.randint(0, 4)#Get a random surgeon
        timetable[x][y] = surgeons[random_surgeon]
        



generate_random_timetable()



timetable_df = timetable_dataframe()
timetable_df.tail()



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Empty,"Surgeon: Beverly Crusher, Surgery Type: Dupuyt...","Surgeon: Leonard McCoy, Surgery Type: Broken B...",Empty,"Surgeon: Meredith Gery, Surgery Type: Cholecys...",Empty,"Surgeon: Meredith Gery, Surgery Type: Cholecys...","Surgeon: Beverly Crusher, Surgery Type: Dupuyt...","Surgeon: Leonard McCoy, Surgery Type: Broken B..."
1,"Surgeon: Meredith Gery, Surgery Type: Cholecys...",Empty,Empty,Empty,Empty,"Surgeon: Leonard McCoy, Surgery Type: Broken B...",Empty,"Surgeon: Leonard McCoy, Surgery Type: Broken B...","Surgeon: Preston Burke, Surgery Type: Heart By..."
2,"Surgeon: Beverly Crusher, Surgery Type: Dupuyt...","Surgeon: Leonard McCoy, Surgery Type: Broken B...",Empty,Empty,Empty,"Surgeon: Cristina Yang, Surgery Type: Carpal T...","Surgeon: Cristina Yang, Surgery Type: Carpal T...","Surgeon: Meredith Gery, Surgery Type: Cholecys...",Empty


In [16]:



anaesthetists_count = 2 #Start with 2 available

# Step 2: Implement the fitness function
def calculate_fitness(timetable):
    # Initialize constraint violation counters
    concurrence_violations = 0
    precedence_violations = 0


    # Iterate through each time slot in the timetable
    for x in range(3):#Loop for each timeslot
        for y in range(9):#Loop for the 9 timeslots of the day
            #Check for concurrence contraints
            try:
                if(timetable[x-1][y] == timetable[x][y]):
                    #Concurrence constraint has happened, add one to the counter
                    concurrence_violations += 1
                    print("Conc", x-1," ",y)
                if(timetable[x-2][y] == timetable[x][y]):
                    #Concurrence constraint has happened, add one to the counter
                    concurrence_violations += 1
                    print("Conc", x-2," ",y)
            except:
                print()#Just pass error
                
            #Check for precedence violations 
            try:
                if(timetable[x][y-1] == timetable[x][y]):
                    precedence_violations += 1
                    print("Pres", x," ",y-1)
                if(timetable[x-1][y-1] == timetable[x][y]):
                    precedence_violations += 1
                    print("Pres", x-1," ",y-1)
                if(timetable[x-2][y-1] == timetable[x][y]):
                    precedence_violations += 1
                    print("Pres", x-2," ",y-1)
            except:
                print()#Pass error again

            
    
    # Calculate fitness score
    print(concurrence_violations)
    print(precedence_violations)
    fitness_score = concurrence_violations * precedence_violations
    return fitness_score







In [18]:
# Generate a random timetable

generate_random_timetable()



# Evaluate the fitness of the random timetable
fitness = calculate_fitness(timetable)
print("Fitness of Random Timetable:", fitness)

timetable_df = timetable_dataframe()
timetable_df.tail()

Conc -2   0
Pres -1   -1
Pres -1   1
Pres -2   1
Conc -1   3
Conc -2   4
Conc -1   5
Conc -2   5
Pres -1   4
Pres 0   7
Conc 0   0
Pres -1   -1
Conc -1   1
Pres 1   0
Pres 0   0
Conc -1   2
Pres 0   1
Pres 0   2
Conc 0   4
Conc 0   5
Conc -1   5
Pres -1   4
Pres -1   6
Conc 1   1
Pres 1   0
Pres 0   0
Conc 1   2
Pres 0   1
Conc 0   3
Pres 1   3
Conc 1   5
Conc 0   5
Pres 2   4
Pres 2   5
Pres 1   5
Pres 0   5
Pres 0   6
Pres 1   7
16
22
Fitness of Random Timetable: 352


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Empty,"Surgeon: Preston Burke, Surgery Type: Heart By...",Empty,"Surgeon: Beverly Crusher, Surgery Type: Dupuyt...","Surgeon: Preston Burke, Surgery Type: Heart By...",Empty,"Surgeon: Preston Burke, Surgery Type: Heart By...","Surgeon: Leonard McCoy, Surgery Type: Broken B...","Surgeon: Leonard McCoy, Surgery Type: Broken B..."
1,Empty,Empty,"Surgeon: Preston Burke, Surgery Type: Heart By...",Empty,"Surgeon: Preston Burke, Surgery Type: Heart By...",Empty,"Surgeon: Cristina Yang, Surgery Type: Carpal T...",Empty,"Surgeon: Meredith Gery, Surgery Type: Cholecys..."
2,"Surgeon: Cristina Yang, Surgery Type: Carpal T...",Empty,"Surgeon: Preston Burke, Surgery Type: Heart By...","Surgeon: Beverly Crusher, Surgery Type: Dupuyt...",Empty,Empty,Empty,"Surgeon: Preston Burke, Surgery Type: Heart By...",Empty
