# Table of Contents

<a class="anchor" id="top"></a>

# 1. Importing Libraries & Data

In [1]:
import pandas as pd

# Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
    AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, \
    QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier  
from lightgbm import LGBMClassifier 
#from catboost import CatBoostClassifier

# metrics
import metrics as m

import warnings
warnings.filterwarnings("ignore")

**Import Data**

In [2]:
# Load the train/validation feature and target tables, each indexed by claim id.
X_train, X_val, y_train, y_val = (
    pd.read_csv(csv_path, index_col='Claim Identifier')
    for csv_path in (
        './project_data/x_train.csv',
        './project_data/x_val.csv',
        './project_data/y_train.csv',
        './project_data/y_val.csv',
    )
)

In [18]:
# Pre-processed test set used for the final submission, indexed by claim id.
test = pd.read_csv(
    './project_data/test_treated.csv',
    index_col='Claim Identifier',
)

# 2. Modelling

<a href="#top">Top &#129033;</a>

In [3]:
# Show which features go into the models, followed by how many there are.
print(X_train.columns.to_list(), '\n')
print(X_train.shape[1])

['Age at Injury', 'Average Weekly Wage', 'Assembly Year', 'C-2 Month', 'C-2 Year', 'First Hearing Year', 'IME-4 Count Log', 'Attorney/Representative', 'Carrier Name', 'Carrier Name Log', 'Carrier Type', 'County of Injury', 'District Name', 'Gender', 'Industry Code', 'Medical Fee Region', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'C-3 Date Binary'] 

20


**Baseline Model**

In [4]:
# Baseline: logistic regression with default hyperparameters.
model = LogisticRegression()
# y_train is read from CSV as a single-column DataFrame; scikit-learn expects a
# 1-D target, so flatten it explicitly instead of relying on the implicit
# conversion (whose warning is silenced in this notebook).
model.fit(X_train, y_train.values.ravel())

In [5]:
# Baseline predictions on the training and validation features.
train_pred, val_pred = map(model.predict, (X_train, X_val))

In [6]:
# The helper module is imported as `m` (`import metrics as m`), so the report
# function must be reached through that alias; the bare name `metrics` is not
# defined on a fresh kernel.
m.metrics(y_train, train_pred, y_val, val_pred)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      9981
           2       0.72      0.95      0.82    232862
           3       0.21      0.01      0.02     55125
           4       0.53      0.67      0.59    118806
           5       0.22      0.00      0.01     38624
           6       0.00      0.00      0.00      3369
           7       0.00      0.00      0.00        77
           8       0.00      0.00      0.00       376

    accuracy                           0.66    459220
   macro avg       0.21      0.20      0.18    459220
weighted avg       0.55      0.66      0.57    459220

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      2495
           2       0.72      0.95      0.82     58216
           3       0.21      0.01      0.02     13781
           4       0.53      0.66      0.59     29701
           5       0.17      0.00      0.01      9656
           6       0.00 

**SGD Classifier**

In [7]:
# Linear model trained with stochastic gradient descent. The seed is fixed so
# that re-running the notebook reproduces the reported scores.
sgd = SGDClassifier(random_state=42)
# Flatten the single-column target frame to the 1-D array scikit-learn expects.
sgd.fit(X_train, y_train.values.ravel())

In [8]:
# SGD predictions for both splits.
train_pred_sgd, val_pred_sgd = map(sgd.predict, (X_train, X_val))

In [14]:
# Train vs. validation report for the SGD classifier.
m.metrics(
    y_train, train_pred_sgd,
    y_val, val_pred_sgd,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.00      0.00      0.00      9981
           2       0.73      0.99      0.84    232862
           3       0.12      0.00      0.00     55125
           4       0.59      0.73      0.65    118806
           5       0.22      0.00      0.00     38624
           6       0.00      0.00      0.00      3369
           7       0.00      0.00      0.00        77
           8       0.00      0.00      0.00       376

    accuracy                           0.69    459220
   macro avg       0.21      0.21      0.19    459220
weighted avg       0.56      0.69      0.60    459220

______________________________________________________________________
                                VALIDATION                       

**Decision Tree Classifier**

In [15]:
# Unconstrained decision tree. The seed is fixed because split selection
# involves random tie-breaking / feature permutation, so results are repeatable.
dt = DecisionTreeClassifier(random_state=42)
# Flatten the single-column target frame to the 1-D array scikit-learn expects.
dt.fit(X_train, y_train.values.ravel())

In [16]:
# Decision-tree predictions for both splits.
train_pred_dt, val_pred_dt = map(dt.predict, (X_train, X_val))

In [17]:
# Train vs. validation report for the decision tree.
m.metrics(
    y_train, train_pred_dt,
    y_val, val_pred_dt,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.99      1.00      1.00      9981
           2       1.00      1.00      1.00    232862
           3       1.00      1.00      1.00     55125
           4       1.00      1.00      1.00    118806
           5       1.00      1.00      1.00     38624
           6       1.00      1.00      1.00      3369
           7       1.00      1.00      1.00        77
           8       1.00      1.00      1.00       376

    accuracy                           1.00    459220
   macro avg       1.00      1.00      1.00    459220
weighted avg       1.00      1.00      1.00    459220

______________________________________________________________________
                                VALIDATION                       

**Random Forest Classifier**

In [18]:
# Random forest with default hyperparameters. Bootstrapping and feature
# sub-sampling are random, so pin the seed to make the scores reproducible.
rf = RandomForestClassifier(random_state=42)
# Flatten the single-column target frame to the 1-D array scikit-learn expects.
rf.fit(X_train, y_train.values.ravel())

In [19]:
# Random-forest predictions for both splits.
train_pred_rf, val_pred_rf = map(rf.predict, (X_train, X_val))

In [20]:
# Train vs. validation report for the random forest.
m.metrics(
    y_train, train_pred_rf,
    y_val, val_pred_rf,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      0.99      1.00      9981
           2       1.00      1.00      1.00    232862
           3       1.00      1.00      1.00     55125
           4       1.00      1.00      1.00    118806
           5       1.00      1.00      1.00     38624
           6       1.00      1.00      1.00      3369
           7       1.00      1.00      1.00        77
           8       1.00      1.00      1.00       376

    accuracy                           1.00    459220
   macro avg       1.00      1.00      1.00    459220
weighted avg       1.00      1.00      1.00    459220

______________________________________________________________________
                                VALIDATION                       

**Gradient Boosting Classifier**

In [21]:
# Gradient boosting with default hyperparameters; seeded so that repeated runs
# give the same trees and therefore the same scores.
gb = GradientBoostingClassifier(random_state=42)
# Flatten the single-column target frame to the 1-D array scikit-learn expects.
gb.fit(X_train, y_train.values.ravel())

In [22]:
# Gradient-boosting predictions for both splits.
train_pred_gb, val_pred_gb = map(gb.predict, (X_train, X_val))

In [23]:
# Train vs. validation report for gradient boosting.
m.metrics(
    y_train, train_pred_gb,
    y_val, val_pred_gb,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.72      0.47      0.57      9981
           2       0.85      0.96      0.90    232862
           3       0.50      0.06      0.11     55125
           4       0.69      0.88      0.77    118806
           5       0.70      0.56      0.62     38624
           6       0.67      0.00      0.01      3369
           7       0.77      0.22      0.34        77
           8       0.23      0.09      0.13       376

    accuracy                           0.78    459220
   macro avg       0.64      0.40      0.43    459220
weighted avg       0.75      0.78      0.74    459220

______________________________________________________________________
                                VALIDATION                       

**AdaBoost Classifier**

In [25]:
# AdaBoost with default hyperparameters; seeded for reproducible results.
ab = AdaBoostClassifier(random_state=42)
# Flatten the single-column target frame to the 1-D array scikit-learn expects.
ab.fit(X_train, y_train.values.ravel())

In [26]:
# AdaBoost predictions for both splits.
train_pred_ab, val_pred_ab = map(ab.predict, (X_train, X_val))

In [27]:
# Train vs. validation report for AdaBoost.
m.metrics(
    y_train, train_pred_ab,
    y_val, val_pred_ab,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.19      0.57      0.29      9981
           2       0.84      0.88      0.86    232862
           3       0.50      0.02      0.04     55125
           4       0.64      0.86      0.74    118806
           5       0.56      0.35      0.43     38624
           6       0.00      0.00      0.00      3369
           7       0.00      0.00      0.00        77
           8       0.12      0.63      0.21       376

    accuracy                           0.71    459220
   macro avg       0.36      0.41      0.32    459220
weighted avg       0.71      0.71      0.67    459220

______________________________________________________________________
                                VALIDATION                       

**SVC**

In [None]:
# Kernel SVM with default hyperparameters.
# NOTE(review): this cell has no execution count, so it presumably never
# finished; kernel SVC training cost grows much faster than linearly with the
# number of rows and the training split has ~459k of them. Consider a
# subsample before running it.
svc = SVC()
# Flatten the single-column target frame to the 1-D array scikit-learn expects.
svc.fit(X_train, y_train.values.ravel())

In [None]:
# SVC predictions for both splits.
train_pred_svc, val_pred_svc = map(svc.predict, (X_train, X_val))

In [None]:
# Train vs. validation report for the SVC.
m.metrics(
    y_train, train_pred_svc,
    y_val, val_pred_svc,
)

**Gaussian Process Classifier**

In [None]:
## kills the kernel

In [None]:
# gau_p = GaussianProcessClassifier()
# gau_p.fit(X_train, y_train)

In [None]:
# train_pred_gau_p = gau_p.predict(X_train)
# val_pred_gau_p = gau_p.predict(X_val)

In [None]:
# m.metrics(y_train, train_pred_gau_p , y_val, val_pred_gau_p)

**Linear Discriminant Analysis**

In [4]:
# Linear discriminant analysis with default settings.
lda = LinearDiscriminantAnalysis()
# y_train is a single-column DataFrame; pass the flattened 1-D array that
# scikit-learn expects rather than relying on the silenced implicit conversion.
lda.fit(X_train, y_train.values.ravel())

In [5]:
# LDA predictions for both splits.
train_pred_lda, val_pred_lda = map(lda.predict, (X_train, X_val))

In [7]:
# Train vs. validation report for LDA.
m.metrics(
    y_train, train_pred_lda,
    y_val, val_pred_lda,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.41      0.39      0.40      9981
           2       0.69      0.93      0.79    232862
           3       0.29      0.02      0.03     55125
           4       0.55      0.44      0.49    118806
           5       0.48      0.38      0.43     38624
           6       0.08      0.15      0.11      3369
           7       0.00      0.00      0.00        77
           8       0.09      0.59      0.15       376

    accuracy                           0.63    459220
   macro avg       0.32      0.36      0.30    459220
weighted avg       0.58      0.63      0.58    459220

______________________________________________________________________
                                VALIDATION                       

**Quadratic Discriminant Analysis**

In [8]:
# Quadratic discriminant analysis with default settings.
qda = QuadraticDiscriminantAnalysis()
# y_train is a single-column DataFrame; pass the flattened 1-D array that
# scikit-learn expects rather than relying on the silenced implicit conversion.
qda.fit(X_train, y_train.values.ravel())

In [9]:
# QDA predictions for both splits.
train_pred_qda, val_pred_qda = map(qda.predict, (X_train, X_val))

In [10]:
# Train vs. validation report for QDA.
m.metrics(
    y_train, train_pred_qda,
    y_val, val_pred_qda,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       0.21      0.61      0.31      9981
           2       0.73      0.88      0.80    232862
           3       0.23      0.08      0.12     55125
           4       0.56      0.05      0.10    118806
           5       0.53      0.35      0.42     38624
           6       0.04      0.40      0.07      3369
           7       0.00      0.92      0.00        77
           8       0.02      0.94      0.03       376

    accuracy                           0.52    459220
   macro avg       0.29      0.53      0.23    459220
weighted avg       0.59      0.52      0.49    459220

______________________________________________________________________
                                VALIDATION                       

**XGB Classifier**

In [13]:
# xgb = XGBClassifier()
# xgb.fit(X_train, y_train)

In [None]:
# train_pred_xgb = xgb.predict(X_train)
# val_pred_xgb = xgb.predict(X_val)

In [None]:
# m.metrics(y_train, train_pred_xgb , y_val, val_pred_xgb)

**LGBM Classifier**

In [None]:
## kills the kernel

In [None]:
# lgbm = LGBMClassifier()
# lgbm.fit(X_train, y_train)

In [None]:
# train_pred_lgbm = lgbm.predict(X_train)
# val_pred_lgbm = lgbm.predict(X_val)

In [None]:
# m.metrics(y_train, train_pred_lgbm , y_val, val_pred_lgbm)

## 2.1 Hyperparameter Tuning

<a href="#top">Top &#129033;</a>

## 2.2 Combining Models

<a href="#top">Top &#129033;</a>

In [6]:
from sklearn.ensemble import StackingClassifier, VotingClassifier

from sklearn.calibration import CalibratedClassifierCV

In [27]:
# Base learners shared by the voting and stacking ensembles.
# Every stochastic estimator gets a fixed seed so that re-running the notebook
# reproduces the ensemble scores.

# SGDClassifier's default (hinge) loss exposes no predict_proba, so it is
# wrapped in a calibrator -- presumably to supply the class probabilities that
# soft voting requires.
calibrated_sgd = CalibratedClassifierCV(SGDClassifier(random_state=42))

estimators = [
    ('sgd', calibrated_sgd),
    ('rf', RandomForestClassifier(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('ab', AdaBoostClassifier(random_state=42)),
]


**Voting Classifier**

In [10]:

# Soft voting: the predicted class is the argmax of the summed class
# probabilities from the base estimators.
voting_clf = VotingClassifier(
    estimators=estimators,
    voting='soft',
)

In [11]:
# Fit every base estimator of the voting ensemble. The target is flattened
# because y_train is a single-column DataFrame and scikit-learn expects 1-D.
voting_clf.fit(X_train, y_train.values.ravel())

In [12]:
# Voting-ensemble predictions for both splits.
train_pred_voting, val_pred_voting = map(voting_clf.predict, (X_train, X_val))

In [13]:
# Train vs. validation report for the voting ensemble.
m.metrics(
    y_train, train_pred_voting,
    y_val, val_pred_voting,
)

______________________________________________________________________
                                TRAIN                                 
----------------------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      0.88      0.93      9981
           2       0.99      1.00      0.99    232862
           3       1.00      0.98      0.99     55125
           4       1.00      1.00      1.00    118806
           5       1.00      1.00      1.00     38624
           6       1.00      1.00      1.00      3369
           7       1.00      1.00      1.00        77
           8       1.00      0.99      1.00       376

    accuracy                           0.99    459220
   macro avg       1.00      0.98      0.99    459220
weighted avg       0.99      0.99      0.99    459220

______________________________________________________________________
                                VALIDATION                       

**Stacking Classifier**

In [28]:
# Stacked ensemble: the base estimators feed a logistic-regression meta-model
# (a simpler model is the usual choice for the final estimator).
meta_model = LogisticRegression()
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=meta_model)

In [None]:
# Fit the stacked ensemble. The target is flattened because y_train is a
# single-column DataFrame and scikit-learn expects a 1-D array.
stacking_clf.fit(X_train, y_train.values.ravel())

In [None]:
# Stacking-ensemble predictions for both splits.
train_pred_stack, val_pred_stack = map(stacking_clf.predict, (X_train, X_val))

In [None]:
# Train vs. validation report for the stacking ensemble.
m.metrics(
    y_train, train_pred_stack,
    y_val, val_pred_stack,
)

In [None]:
# Project-local helper; presumably plays a sound to signal that the preceding
# long-running cells have finished -- TODO confirm what `play_` does.
# NOTE(review): requires `play_song.py` and 'audio.mp3' next to the notebook;
# a fresh "Run All" fails here if either is missing.
import play_song as song
song.play_('audio.mp3')

# 3. Final Predictions

<a href="#top">Top &#129033;</a>

In [19]:
# Keep only the model's features, in training column order. `.copy()` yields an
# independent frame, so the prediction column added afterwards is not written
# into a slice of the original (the resulting SettingWithCopyWarning would be
# hidden by this notebook's warnings filter).
test = test[X_train.columns].copy()

In [20]:
# Predict from the feature columns only. Passing `test` itself breaks on a
# second run of this cell, because by then it already contains the
# 'Claim Injury Type' column, which the model was not trained on.
test['Claim Injury Type'] = voting_clf.predict(test[X_train.columns])

Map Predictions to Original Values

In [21]:
# Encoded class id (1..8) -> original 'Claim Injury Type' label.
label_mapping = dict(enumerate(
    [
        "1. CANCELLED",
        "2. NON-COMP",
        "3. MED ONLY",
        "4. TEMPORARY",
        "5. PPD SCH LOSS",
        "6. PPD NSL",
        "7. PTD",
        "8. DEATH",
    ],
    start=1,
))

# Swap the numeric predictions for their original string labels.
decoded_labels = test['Claim Injury Type'].replace(label_mapping)
test['Claim Injury Type'] = decoded_labels

Check each category inside the target

In [22]:
# Distribution of the predicted classes over the test set.
predicted_labels = test['Claim Injury Type']
predicted_labels.value_counts()

Claim Injury Type
2. NON-COMP        316454
4. TEMPORARY        44563
3. MED ONLY         16405
1. CANCELLED         5340
5. PPD SCH LOSS      5150
6. PPD NSL             52
8. DEATH               11
Name: count, dtype: int64

# 4. Export

<a href="#top">Top &#129033;</a>

**Select Columns for predictions**

In [23]:
# The submission holds only the predicted label, keyed by the frame's index.
predictions = test.loc[:, 'Claim Injury Type']

**Export**

In [24]:
# File stem of the submission CSV; it is interpolated into
# './predictions/{name}.csv' by the export cell.
# TODO: set a descriptive value before exporting -- None is only a placeholder.
name = None

In [25]:
# Refuse to export while `name` is still the placeholder: f'{None}' would
# silently write './predictions/None.csv' and overwrite any earlier export
# made the same way.
if not name:
    raise ValueError("Set `name` to the submission file name before exporting.")
predictions.to_csv(f'./predictions/{name}.csv')

__*<center>Models*__ 
    
| Model | Feature Selection | Parameters | Kaggle Score |
| ----- | ----------------- | ---------- | -------------|
| Voting (sgd_rf_dt_gb_ab)  | 2 | - | 0.37300 |
| ----- | ----------------- | ---------- | -------------|
| ----- | ----------------- | ---------- | -------------|
| ----- | ----------------- | ---------- | -------------|
| ----- | ----------------- | ---------- | -------------|
    
<br><br>
    
    
__*<center>Models K-Fold*__ 

| Model | Feature Selection | Log | Parameters | Kaggle Score | Fold |
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| LogReg | - | - | -  | 0.21122 | 5 |
| RF | 1 | X | - | 0.29078 | 5 |
| XGB | 1 | X | - | 0.20642 | 10 |
| RF | - | - | - | 0.26616 | 5 |
    
<br><br>
    
__*<center>Models w/ Stratified K-Fold*__   
    
| Model | Feature Selection | Log | Parameters | Kaggle Score | Fold | 
| ----- | ------------------ | --- | ---------- | ------------ | ---- |
| RF | - | - | - | 0.26912 | 10 |
| DT | - | - | - | 0.14236 | 10 |
| DT | - | X | - | 0.15589 | 10 |

<br><br>
    
**Features for Feature Selection 1**

['C-2 Day', 'Accident Year', 'Birth Year', 'Assembly Month',
            'C-2 Month', 'Average Weekly Wage', 'Age at Injury', 
            'C-2 Year', 'Number of Dependents', 'Accident Day', 
            'Assembly Year', 'First Hearing Year', 'IME-4 Count', 
            'Assembly Day', 'Accident Month', 
            'WCIO Cause of Injury Code', 'Gender', 
            'COVID-19 Indicator', 'WCIO Part Of Body Code', 
            'County of Injury', 'Attorney/Representative', 
            'Carrier Type', 'District Name', 'Medical Fee Region', 
            'Zip Code', 'Carrier Name', 'C-3 Date Binary', 
            'Alternative Dispute Resolution', 
            'WCIO Nature of Injury Code', 'Industry Code']
    
    
**Features for Feature Selection 2**    
    
['Age at Injury',
 'Average Weekly Wage',
 'Assembly Year',
 'C-2 Month',
 'C-2 Year',
 'First Hearing Year',
 'IME-4 Count Log',
 'Attorney/Representative',
 'Carrier Name',
 'Carrier Name Log',
 'Carrier Type',
 'County of Injury',
 'District Name',
 'Gender',
 'Industry Code',
 'Medical Fee Region',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'C-3 Date Binary']
