In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print("TF Version: " + tf.__version__)

TF Version: 2.16.1


### Data Preperation
Here we are selecting a small percentage of the original dataset, this is to increase the speed at which the models train.
This is done using the line dataset = dataset.sample

We then drop the region column from the dataset as we will not be using it in the training of the models
e:

In [3]:
#Read data from file

raw_dataset = pd.read_csv("insurance_dataset.csv", sep=',',decimal='.')


In [9]:
dataset = raw_dataset.copy()
dataset = dataset.sample(frac=0.005, random_state=42)#3% of 1 million = 30,000, likely need to increase
dataset.drop(columns=['region'], inplace=True) #Drop the region as we will not be using it
print("Number of rows in the dataset after sampling:", dataset.shape[0])


Number of rows in the dataset after sampling: 5000


In [10]:
dataset.tail()

Unnamed: 0,age,gender,bmi,children,smoker,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
512623,46,female,43.61,3,yes,High blood pressure,Heart disease,Rarely,Student,Standard,19765.11189
949760,28,female,34.46,3,no,,Diabetes,Frequently,Unemployed,Basic,9406.823145
355691,65,male,33.47,0,no,Heart disease,High blood pressure,Occasionally,Blue collar,Basic,15161.96877
826650,56,male,39.0,3,no,,Heart disease,Frequently,Student,Standard,16416.19964
112171,65,male,18.34,1,yes,,High blood pressure,Frequently,White collar,Premium,20258.38419


In [11]:
from sklearn.preprocessing import LabelEncoder



# Define ordinal mappings for ordinal variables
ordinal_mappings = {
    "occupation": {"Unemployed": 0, "Student": 1, "Blue collar": 2, "White collar": 3},
    "coverage_level": {"Basic": 0, "Standard": 1, "Premium": 2}
}

# Apply ordinal encoding to ordinal variables
dataset.replace(ordinal_mappings, inplace=True)



# For non-ordinal variables, you can use regular label encoding
label_encoder = LabelEncoder()
non_ordinal_columns = ['gender', 'smoker',  'medical_history', 'family_medical_history','exercise_frequency']
for column in non_ordinal_columns:
    dataset[column] = label_encoder.fit_transform(dataset[column])


dataset.tail()

Unnamed: 0,age,gender,bmi,children,smoker,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
512623,46,0,43.61,3,1,2,1,3,1,1,19765.11189
949760,28,0,34.46,3,0,3,0,0,0,0,9406.823145
355691,65,1,33.47,0,0,1,2,2,2,0,15161.96877
826650,56,1,39.0,3,0,3,1,0,1,1,16416.19964
112171,65,1,18.34,1,1,3,2,0,3,2,20258.38419


In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error







# Define features (X) and target variables (y_charges, y_coverage)
#Define features and target variables
X = dataset[['age', 'gender', 'bmi', 'children', 'smoker', 'medical_history', 
             'family_medical_history', 'exercise_frequency', 'occupation']]
y_charges = dataset['charges'].values.reshape(-1, 1)  # Reshape to a 2D array
y_coverage = dataset['coverage_level'].values.reshape(-1, 1)  # Reshape to a 2D array




In [13]:
# Split the data into training and testing sets
#X_train, X_test, y_charges_train, y_charges_test, y_coverage_train, y_coverage_test = train_test_split(X, y_charges, y_coverage, test_size=0.2, random_state=42)
X_train, X_test, y_charges_train, y_charges_test = train_test_split(
    X, y_charges, test_size=0.2, random_state=42)
X_train, X_test, y_coverage_train, y_coverage_test = train_test_split(
    X, y_coverage, test_size=0.2, random_state=42)

y_train = np.concatenate((y_charges_train, y_coverage_train), axis=1)
y_test = np.concatenate((y_charges_test, y_coverage_test), axis=1)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (4000, 9)
Shape of y_train: (4000, 2)
Shape of X_test: (1000, 9)
Shape of y_test: (1000, 2)


In [14]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit scaler on training data and transform both training and testing data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Multi Output Random Forest Regressor

This random forest regressor is wrapped with a multi output regressor to make it so that we can predict both 

In [16]:
# Define parameter grid for hyperparameter tuning
##param_grid = {
#    'n_estimators': [100, 200, 300],  # Number of trees in the forest
#    'max_depth': [None, 10, 20],  # Maximum depth of the tree
#    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
#    'max_features': ['auto', 'sqrt']  # Number of features to consider when looking for the best split
#}

param_grid_rf = {
    'estimator__max_depth': [10, 20, 30],
    'estimator__n_estimators': [100, 200, 300]
}


# Instantiate Random Forest regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Wrap the Random Forest regressor with MultiOutputRegressor
multioutput_regressor = MultiOutputRegressor(rf_regressor)

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=multioutput_regressor, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_rf_model = grid_search.best_estimator_

# Make predictions on the testing data
y_pred = best_rf_model.predict(X_test)

# Split the predictions into separate arrays for charges and coverage
y_charges_pred = y_pred[:, 0]  # Assuming charges is the first output variable
y_coverage_pred = y_pred[:, 1]  # Assuming coverage is the second output variable

# Evaluate the model
mse_charges = mean_squared_error(y_charges_test, y_charges_pred)
mse_coverage = mean_squared_error(y_coverage_test, y_coverage_pred)
print("Mean Squared Error (Charges):", mse_charges)
print("Mean Squared Error (Coverage Level):", mse_coverage)

Mean Squared Error (Charges): 5148155.407757504
Mean Squared Error (Coverage Level): 0.6472388267817346


### Multi output SVM Regressor

Here we use the SVR Model from sklearn to create a SVM regression model that will predict the charges and the coverage type


In [15]:
from sklearn.svm import SVR

# 1. Instantiate the SVM Regressor
svm_regressor = SVR()

# 2. Hyperparameter Tuning
param_grid = {
    'estimator__kernel': ['linear', 'rbf'],
    'estimator__C': [0.1, 1, 10],
    'estimator__epsilon': [0.1, 0.2, 0.5]
}
##Wrap the svm regressor in a multi output regressor
grid_search = GridSearchCV(estimator=MultiOutputRegressor(svm_regressor), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# 3. Train the Model
grid_search.fit(X_train, y_train)

# 4. Evaluate the Model
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test)
y_charges_pred = y_pred[:, 0]  #Charges is the first output variable
y_coverage_pred = y_pred[:, 1]  #Coverage is the second output variable


mse_charges = mean_squared_error(y_charges_test, y_charges_pred)
mse_coverage = mean_squared_error(y_coverage_test, y_coverage_pred)
print("Mean Squared Error (Charges):", mse_charges)
print("Mean Squared Error (Coverage Level):", mse_coverage)



Mean Squared Error (Charges): 10775805.128468255
Mean Squared Error (Coverage Level): 0.8777131514850689


### Multi-output Neural Network 

Here we use the NN Model from tensorflow to create a NN regression model that will predict the charges and the coverage typeb

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(64, activation='relu', input_shape=(9,)),
    Dense(32, activation='relu'),
    Dense(2)  # Two outputs for charges and coverage
])

model.compile(optimizer='adam', loss='mean_squared_error')

trained_data = model.fit(X_train, y_train, epochs=40, batch_size=8, validation_split=0.2)

mse = model.evaluate(X_test, y_test)
print("Mean Squared Error:", mse)

Epoch 1/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 151947728.0000 - val_loss: 115702520.0000
Epoch 2/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 85084840.0000 - val_loss: 18033396.0000
Epoch 3/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14725986.0000 - val_loss: 11440710.0000
Epoch 4/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 11519511.0000 - val_loss: 10757870.0000
Epoch 5/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 10876959.0000 - val_loss: 10137167.0000
Epoch 6/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 920us/step - loss: 10333354.0000 - val_loss: 9560100.0000
Epoch 7/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952us/step - loss: 9590385.0000 - val_loss: 9029530.0000
Epoch 8/40
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

### Part 2 - Optimisation

### 

In [None]:
# Step 1: Read the file and extract relevant information
def read_file(file_path):
    # Implement file parsing logic to extract surgeon details, surgery details, and constraints
    pass

# Step 2: Implement the fitness function
def calculate_fitness(timetable):
    # Initialize constraint violation counters
    concurrence_violations = 0
    precedence_violations = 0
    
    # Iterate through each time slot in the timetable
    for time_slot in timetable:
        # Check concurrence constraints
        # Update concurrence_violations if needed
        
        # Check precedence constraints
        # Update precedence_violations if needed
    
    # Calculate fitness score
    fitness_score = concurrence_violations * precedence_violations
    return fitness_score

# Step 3: Test with a random timetable
def generate_random_timetable():
    # Implement logic to generate a random timetable
    pass

# Read the file and extract relevant information
surgeons, surgeries, constraints = read_file("file_path")

# Generate a random timetable
random_timetable = generate_random_timetable()

# Evaluate the fitness of the random timetable
fitness = calculate_fitness(random_timetable)
print("Fitness of Random Timetable:", fitness)