# Predictive Modeling

## Decision Tree Classifier using Industry and Highest Process

Attempting to build a model using just industry was not successful, as shown by the accuracy score of 0.5813

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

final_output_file = 'WC_Claims_2020-2022.csv'
df = pd.read_csv(final_output_file)

#'age': df['Age at Injury'],
#        'gender': df['Gender'],
    
data = {'industry': df['Industry Code Description'],
        'highest_process': df['Highest Process']}
df = pd.DataFrame(data)

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=['industry'], drop_first=True)

# Split the data into independent and dependent variables
X = df.drop('highest_process', axis=1)
y = df['highest_process']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy}")

Accuracy: 0.5813634048939991


## Decision Tree Classifier using Age, Represented, Claim Injury Type, and Highest Process

After creating the decision tree model, the accuracy, precision, recall, and F1 scores were printed in order to evaluate the model. Due to the low number of records with the highest process of 3 (CONCILIATION - MEETING), precision, recall, and F1 scores were not available for that group. 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

final_output_file = 'WC_Claims_2020-2022.csv'
df = pd.read_csv(final_output_file)

data = {'age': df['Age at Injury'],
        'represented': df['Represented'],
        'claim_injury_type': df['Claim Injury Type'],
        'highest_process': df['Highest Process']}
                              
df = pd.DataFrame(data)

# One-hot encode the 'claim_injury_type' column
df = pd.get_dummies(df, columns=['claim_injury_type'], drop_first=True)

# Split the data into independent and dependent variables
X = df.drop('highest_process', axis=1)
y = df['highest_process']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
precision = precision_score(y_test, y_pred, average=None, zero_division=0)
print(f"Precision: {precision}")
recall = recall_score(y_test, y_pred, average=None, zero_division=0)
print(f"Recall: {recall}")
f1_score = f1_score(y_test, y_pred, average=None, zero_division=0)
print(f"F1 score: {f1_score}")
classification_report_result = classification_report(y_test, y_pred, zero_division=0)
print("Classification Report:\n", classification_report_result)


Accuracy: 0.8033086757256573
Precision: [0.826981   0.74755205 0.         0.76111642]
Recall: [0.95252694 0.49952357 0.         0.73743017]
F1 score: [0.88532528 0.59887285 0.         0.7490861 ]
Classification Report:
               precision    recall  f1-score   support

           1       0.83      0.95      0.89     86765
           2       0.75      0.50      0.60     31484
           3       0.00      0.00      0.00      1818
           4       0.76      0.74      0.75     29177

    accuracy                           0.80    149244
   macro avg       0.58      0.55      0.56    149244
weighted avg       0.79      0.80      0.79    149244



## Logistic Regression, Random Forest, and Support Vector Machine

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

final_output_file = 'WC_Claims_2020-2022.csv'
df = pd.read_csv(final_output_file)

data = {'represented': df['Represented'],
        'claim_injury_type': df['Claim Injury Type'],
        'body_part': df['Part of Body'],
        'highest_process': df['Highest Process']}

df = pd.DataFrame(data)

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=['claim_injury_type',], drop_first=True)

# Split the data into independent and dependent variables
X = df.drop('highest_process', axis=1)
y = df['highest_process']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, logreg_pred, zero_division=0))

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("\nRandom Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred, zero_division=0))

# Support Vector Machine (SVM)
svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)
print("\nSupport Vector Machine Accuracy:", accuracy_score(y_test, svm_pred))
print("Support Vector Machine Classification Report:\n", classification_report(y_test, svm_pred))


Logistic Regression Accuracy: 0.8015062582080351
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           1       0.83      0.95      0.89     86765
           2       0.73      0.50      0.60     31484
           3       0.00      0.00      0.00      1818
           4       0.77      0.72      0.74     29177

    accuracy                           0.80    149244
   macro avg       0.58      0.54      0.56    149244
weighted avg       0.78      0.80      0.79    149244


Random Forest Accuracy: 0.8039318163544263
Random Forest Classification Report:
               precision    recall  f1-score   support

           1       0.83      0.95      0.89     86765
           2       0.74      0.50      0.60     31484
           3       0.00      0.00      0.00      1818
           4       0.76      0.74      0.75     29177

    accuracy                           0.80    149244
   macro avg       0.58      0.55      0.56    149244
weighted a

The support vector is having trouble running, making it apparent that it is not the model that should be utilized.