In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier  
from lightgbm import LGBMClassifier 
from catboost import CatBoostClassifier 

# Metrics
from sklearn.metrics import classification_report


# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Warnings
# NOTE(review): this suppresses ALL warnings for the rest of the session,
# e.g. any convergence or deprecation warnings would be hidden as well.
import warnings
warnings.filterwarnings("ignore")

import NA_outliers as n
import play_song as song

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Load the data used for cross-validation below, indexed by 'Claim Identifier'
df = pd.read_csv('./project_data/out_eda1.csv', index_col='Claim Identifier')

In [None]:
# Load the test data (used for the final predictions below),
# indexed by 'Claim Identifier'
test = pd.read_csv(
    './project_data/test_treated.csv',
    index_col='Claim Identifier',
)

## 1. K Fold

<a href="#top">Top &#129033;</a>

In [None]:
from sklearn.metrics import classification_report
#from collections import Counter
import time
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
#from functools import partial
from sklearn.preprocessing import RobustScaler
import joblib


In [None]:
def k_fold(df, features, target, k = 5, model = None):
    """Run K-fold cross-validation and print per-fold classification reports.

    For each fold a preprocessing pipeline (custom imputation, log transform,
    robust scaling) is fitted on the training split and applied to the
    validation split, the model is fitted on the transformed training split,
    and classification reports are printed for both splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the feature columns and the target column.
    features : list of str
        Names of the columns used as predictors.
    target : str
        Name of the target column.
    k : int, default 5
        Number of folds passed to ``KFold`` (shuffled, ``random_state=1``).
    model : estimator, optional
        Object exposing ``fit`` and ``predict``. It is fitted in place and
        refitted on every fold. When omitted, a fresh
        ``LogisticRegression()`` is created for this call.

    Returns
    -------
    list of dict
        One dict per fold with the keys ``'Train Predictions'`` and
        ``'Validation Predictions'``.

    Side effects
    ------------
    Prints two classification reports per fold and the elapsed time in
    minutes, and writes ``'<ModelClassName>.joblib'`` to the working
    directory. The saved estimator is the one fitted during the LAST fold,
    i.e. it was trained on that fold's training split, not on all of ``df``.
    """
    # An estimator instance in the signature would be created once, at
    # definition time, and then shared (and refitted) by every call that
    # relies on the default. Build it per call instead.
    if model is None:
        model = LogisticRegression()

    start_time = time.time()

    X = df[features]
    y = df[target]

    kf = KFold(n_splits=k, shuffle=True, random_state=1)
    predictions = []

    for train_idx, val_idx in kf.split(X):

        ### SPLIT
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        ### PIPELINE
        # Rebuilt on every fold so the scaler is fitted on this fold's
        # training rows only.
        pipeline = Pipeline([
            ('imputer', FunctionTransformer(n.custom_impute, validate=False)),
            ('log_transform', FunctionTransformer(n.log_transform, validate=False)),
            ('scaler', RobustScaler()),
        ])

        train_values = pipeline.fit_transform(X_train, y_train)
        val_values = pipeline.transform(X_val)

        # Re-wrap the pipeline output as DataFrames, restoring the column
        # names and the original row index so rows stay aligned with y.
        X_train = pd.DataFrame(train_values, columns=X_train.columns, index=X_train.index)
        X_val = pd.DataFrame(val_values, columns=X_val.columns, index=X_val.index)

        # fit model
        model.fit(X_train, y_train)

        # make predictions
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

        # compute metrics
        print(classification_report(y_train, train_pred))
        print(classification_report(y_val, val_pred))

        # save this fold's predictions
        predictions.append({'Train Predictions': train_pred, 'Validation Predictions': val_pred})

    # Persist the estimator as fitted on the last fold, named after its class
    model_name = type(model).__name__
    joblib.dump(model, f'{model_name}.joblib')

    end_time = time.time()
    elapsed_time = (end_time - start_time) / 60  # minutes
    print(elapsed_time)

    return predictions


In [None]:
# Predictor columns passed to k_fold. The order is kept exactly as listed,
# since it determines the column order of the training matrix.
features = [
    'C-2 Day',
    'Accident Year',
    'Birth Year',
    'Assembly Month',
    'C-2 Month',
    'Average Weekly Wage',
    'Age at Injury',
    'C-2 Year',
    'Number of Dependents',
    'Accident Day',
    'Assembly Year',
    'First Hearing Year',
    'IME-4 Count',
    'Assembly Day',
    'Accident Month',
    'WCIO Cause of Injury Code',
    'Gender',
    'COVID-19 Indicator',
    'WCIO Part Of Body Code',
    'County of Injury',
    'Attorney/Representative',
    'Carrier Type',
    'District Name',
    'Medical Fee Region',
    'Zip Code',
    'Carrier Name',
    'C-3 Date Binary',
    'Alternative Dispute Resolution',
    'WCIO Nature of Injury Code',
    'Industry Code',
]



In [None]:
# Keep the per-fold predictions in a variable instead of letting the cell
# echo the raw list of prediction arrays as its output; the classification
# reports are still printed by k_fold itself.
cv_predictions = k_fold(
    df,
    features = features,
    target = 'Claim Injury Type',
    k = 2,
    model = LogisticRegression(),
)

In [None]:
# Play an audio file through the local play_song helper — presumably an
# audible signal that the preceding (long-running) cell finished; TODO confirm.
song.play_('audio.mp3')

## test

In [None]:
# Load a previously saved model from disk.
# NOTE(review): nothing in this notebook writes 'trained_model.joblib' —
# k_fold saves to '<ModelClassName>.joblib' (e.g. 'LogisticRegression.joblib').
# Confirm this file exists and was trained on the same features/preprocessing.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
model = joblib.load('trained_model.joblib')

In [None]:
# custom_impute takes the whole DataFrame (k_fold's pipeline also applies it
# to a full frame), so one call is enough. The previous loop never used its
# loop variable and re-imputed the entire frame once per column.
test = n.custom_impute(test)

In [None]:
# Count the missing values left in each column after imputation
test.isna().sum()

In [None]:
# Predict from the training feature columns only. Passing the whole frame
# made this cell non-idempotent: after the first run `test` also contains the
# 'Claim Injury Type' column, which would then be fed back into predict.
# NOTE(review): k_fold fits its model on log-transformed, robust-scaled data,
# while `test` has only been imputed here — confirm the loaded model does not
# need those preprocessing steps applied first.
test['Claim Injury Type'] = model.predict(test[features])

In [None]:
# Distribution of the predicted classes
test['Claim Injury Type'].value_counts() 

In [None]:
# Pull the predicted labels out as a Series and display it
predictions = test.loc[:, 'Claim Injury Type']
predictions

In [None]:
# Write the predicted labels to CSV; the Series index is written as the first
# column (presumably 'Claim Identifier', as set when `test` was loaded).
# NOTE: the './predictions' directory must already exist — to_csv does not
# create missing directories.
predictions.to_csv('./predictions/pred1.csv')