In [1]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

# Load the workbook into a DataFrame; header=0 takes the column names from the
# first row of the sheet. Every later cell reads from and rebinds `data`.
data = pd.read_excel("./upd_data.xlsx", header=0)

2023-09-11 16:52:09.996025: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
# Gender: missing codes are treated as 0, the numeric codes are turned into
# labels, and each label then becomes its own indicator column.
gender_codes = data['Gender'].fillna(0)
data['Gender'] = gender_codes.map({0: 'Unknown', 1: 'Male', 2: 'Female'})
data = pd.get_dummies(data, columns=['Gender'], drop_first=False)

In [3]:
# State: treat a missing state code as 0, then one-hot encode the codes.
# The filled column is assigned back rather than filled with
# `data['State'].fillna(0, inplace=True)`: an in-place call on a column
# selection is chained assignment, which newer pandas warns about and which
# leaves `data` unchanged once Copy-on-Write is active.
data['State'] = data['State'].fillna(0)
data = pd.get_dummies(data, columns=['State'], drop_first=False)

In [4]:
# Intervention Required: blank answers count as code 0 ("Unknown"); the coded
# answers are relabelled and then expanded into indicator columns.
intervention_labels = {0: 'Unknown', 1: 'Yes', 2: 'No'}
data['Intervention Required'] = data['Intervention Required'].fillna(0).map(intervention_labels)
data = pd.get_dummies(data, columns=['Intervention Required'], drop_first=False)

In [5]:
# Dates
# Whole seconds between the accident and the claim being finalised, computed on
# the full columns at once. This avoids a Python-level loop and the label-based
# `data[col][i]` lookups, which only work while the index is the default 0..n-1.
# astype("int64") truncates toward zero, matching int() on the float difference.
# pd.to_datetime is a no-op for columns already parsed as datetimes and guards
# against the columns having been read as generic objects.
elapsed = pd.to_datetime(data["Claim_Finalised_Date"]) - pd.to_datetime(data["Date_of_Accident"])
data["Accident_to_Claim_Time"] = elapsed.dt.total_seconds().astype("int64")

In [6]:
# "How do you find your X" survey questions: both are encoded the same way,
# so handle them in one loop. Blank answers become code 0 ("Unknown"), the
# codes are relabelled as strings, and each label gets an indicator column.
for survey_column in ('How Do You Find Your Doctor?', 'How Do You Find Your Case Manager?'):
    answers = data[survey_column].fillna(0)
    data[survey_column] = answers.map({0: 'Unknown', 1: '1', 2: '2', 3: '3'})
    data = pd.get_dummies(data, columns=[survey_column], drop_first=False)

In [7]:
# Fill in the rare blanks
# Column -> value substituted for missing entries. Each column is filled and
# assigned back explicitly; the previous `data[col].fillna(v, inplace=True)`
# form is chained assignment, which newer pandas warns about and which does
# not modify `data` once Copy-on-Write is active.
blank_fill_values = {
    'Med_Cert_Capacity': 0,
    'Med_cert_unfit_restricted_weekdays': 0,

    'Payment_early_intervention_rehab': 0,
    'Payment_medicolegal': 0,
    'Payment_Rehab': 0,
    'Payment_travel_accomodation': 0,
    'Payment_weekly_compensation': 0,
    'Work Status at Referral': 0,
    'Other_Paid': 0,
    "How are you going financially?": 0,

    "Unable to control the important things?": 0,
    "You felt that things were going your way": 0,
    "First Orebro Score": 0,
    # NOTE(review): this column alone is filled with 25 rather than 0; the
    # rationale is not recorded in the notebook - confirm.
    "Orebro Musculoskeletal Pain Total": 25,
}

for column_name, fill_value in blank_fill_values.items():
    # Indexing by name keeps the KeyError if a column is missing from the sheet.
    data[column_name] = data[column_name].fillna(fill_value)

In [8]:
# Dropped
# Rows missing any of these three values are discarded.
data = data.dropna(
    subset=[
        "Fitness_week2",
        "Total_Paid",
        "Felt difficulties were piling up high?",
    ]
)

# Columns removed from the feature set. The two raw date columns go too, since
# "Accident_to_Claim_Time" was derived from them in an earlier cell.
data = data.drop(
    columns=[
        "Are you seeing a Therapist?",
        "Is therapy helpful for you?",
        "Claim_Risk_Assessment",
        "Date_of_Accident",
        "Claim_Finalised_Date",
    ]
)

In [9]:
# Sanity check: total number of missing cells left in the frame after cleaning.
remaining_missing = data.isna().sum().sum()
print(remaining_missing)

0


In [10]:
def errors_continuous(df, continuous_column, random_state=None):
    """Tune a random-forest regressor for one column and return its test MSE.

    Every other column of ``df`` is used as a feature. The rows are split
    80/20 into train and test sets, a 5-fold grid search (scored by negative
    MSE) picks the forest hyper-parameters on the training part, and the
    refitted best model is scored on the held-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    continuous_column : str
        Name of the column to predict.
    random_state : int or None, optional
        Seed passed to the train/test split and to the forest. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make
        the reported error reproducible between runs.

    Returns
    -------
    float
        Mean squared error of the best model on the held-out 20%.
    """
    # Import the metric explicitly: a bare `import sklearn` does not promise
    # that the `sklearn.metrics` submodule is loaded as an attribute.
    from sklearn.metrics import mean_squared_error

    X = df.drop(columns=[continuous_column])
    y = df[continuous_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    rf = RandomForestRegressor(random_state=random_state)

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refitted on all of X_train.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    return mean_squared_error(y_test, y_pred)

In [11]:
def predict_week(df, fitness_week_column, num_classes=3, random_state=None):
    """Train a small dense classifier for one fitness column; return test accuracy.

    The target column is clipped into ``[0, num_classes - 1]`` and treated as
    an integer class label. Every other column of ``df`` is a feature. The rows
    are split 80/20, features are standardised with statistics fitted on the
    training part only, and a two-hidden-layer softmax network is trained for
    10 epochs.

    Note that ``df`` is modified in place (the clipped labels are written back
    to it), so pass a copy if the caller's frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    fitness_week_column : str
        Name of the column to predict.
    num_classes : int, optional
        Number of label classes; sets both the clipping bound and the width of
        the softmax layer. Defaults to 3, the value previously hard-coded.
    random_state : int or None, optional
        When given, seeds the train/test split and the Keras/NumPy/Python
        random generators so a run can be reproduced. ``None`` (the default)
        leaves everything unseeded, as before.

    Returns
    -------
    float
        Accuracy of the trained network on the held-out 20%.
    """
    if random_state is not None:
        # Seeds Python's `random`, NumPy and TensorFlow in one call.
        keras.utils.set_random_seed(random_state)

    # Clip the labels of the frame that was passed in. This previously read
    # from the notebook-global `data`, so the function silently ignored its
    # own argument's target values.
    df[fitness_week_column] = np.clip(df[fitness_week_column], 0, num_classes - 1)

    X = df.drop(fitness_week_column, axis=1)
    y = df[fitness_week_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training rows only so no test statistics leak in.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = keras.Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax')
    ])

    # Sparse categorical cross-entropy takes the integer class labels directly.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=10, batch_size=32)

    loss, accuracy = model.evaluate(X_test, y_test)
    return accuracy

In [12]:
def predict_on_all(input):
    """Evaluate every target column and collect the scores in one dict.

    Each continuous target is scored with ``errors_continuous`` (test MSE) and
    each fitness target with ``predict_week`` (test accuracy). Every call gets
    its own copy of ``input`` so no model run can alter the frame seen by the
    next one.

    Returns a dict mapping "<target>_MSE" / "<target>_Acc" to the score, in
    the order the models were run.
    """
    # (result key, column name) pairs, in evaluation order.
    regression_targets = (
        ("Net_total_incurred_MSE", "Net_total_incurred"),
        ("Total_Paid_MSE", "Total_Paid"),
        ("Other_Paid_MSE", "Other_Paid"),
        ("Payment_medical_MSE", "Payment_medical"),
        ("Other_paid_risk_MSE", "Other_paid_risk"),
    )
    classification_targets = (
        ("Fitness_week2_Acc", "Fitness_week2"),
        ("Fitness_week6_Acc", "Fitness_week6"),
        ("Fitness_week12_Acc", "Fitness_week12"),
        # NOTE(review): "Fitness_wekk26" is presumably how the column is spelled
        # in the spreadsheet itself - confirm before "correcting" it.
        ("Fitness_week26_Acc", "Fitness_wekk26"),
        ("Fitness_week52_Acc", "Fitness_week52"),
    )

    scores = {}
    for result_key, column in regression_targets:
        scores[result_key] = errors_continuous(input.copy(), column)
    for result_key, column in classification_targets:
        scores[result_key] = predict_week(input.copy(), column)
    return scores

In [13]:
# Run every model on the cleaned frame: five grid-searched random forests and
# five Keras classifiers, so this is the expensive cell of the notebook.
prediction_data = predict_on_all(data)

Epoch 1/10


2023-09-11 16:55:27.861417: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-11 16:55:27.877936: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-11 16:55:27.878080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Show the collected scores: "*_MSE" entries are test mean squared errors,
# "*_Acc" entries are test accuracies.
print(prediction_data)

{'Net_total_incurred_MSE': 8695190.412833737, 'Total_Paid_MSE': 46649930.16917009, 'Other_Paid_MSE': 103497513.64712854, 'Payment_medical_MSE': 4978232.169598891, 'Other_paid_risk_MSE': 3936302.361803603, 'Fitness_week2_Acc': 0.7570093274116516, 'Fitness_week6_Acc': 0.8598130941390991, 'Fitness_week12_Acc': 0.9065420627593994, 'Fitness_week26_Acc': 0.9626168012619019, 'Fitness_week52_Acc': 0.9345794320106506}
