In [5]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score
import warnings
warnings.filterwarnings('ignore')


In [6]:
# Read the workers' compensation claims CSV into a DataFrame and preview it.
# The path is a raw string: it contains backslashes followed by letters
# (\d, \c, \A), which a normal string literal treats as (invalid) escape
# sequences. The resulting path value is unchanged.
data = pd.read_csv(r"A:\dataset\compworker\Assembled_Workers__Compensation_Claims___Beginning_2000.csv")
data.head(5)


Unnamed: 0,Claim Identifier,Claim Type,District Name,Average Weekly Wage,Current Claim Status,Claim Injury Type,Age at Injury,Assembly Date,Accident Date,ANCR Date,...,Hearing Count,Closed Count,Attorney/Representative,Carrier Name,Carrier Type,IME-4 Count,Interval Assembled to ANCR,Accident,Occupational Disease,County of Injury
0,53,Workers Compensation,NYC,$0.00,NO FURTHER ACTION,NON-COMP,0,01/05/2000,12/01/1999,,...,0,1,N,NYC TRANSIT AUTHORITY,3A. SELF PUBLIC,,,Y,N,KINGS
1,44,Workers Compensation,NYC,$0.00,NO FURTHER ACTION,NON-COMP,53,01/04/2000,11/09/1999,,...,4,1,Y,*** CARRIER UNDETERMINED ***,UNKNOWN,,,Y,N,BRONX
2,56,Workers Compensation,NYC,$0.00,NO FURTHER ACTION,NON-COMP,24,01/05/2000,12/09/1999,,...,0,1,N,NYC HOUSING AUTHORITY,3A. SELF PUBLIC,,,Y,N,KINGS
3,4036260,Workers Compensation,BUFFALO,$0.00,ADMINISTRATIVELY CLOSED,UNKNOWN,0,07/02/2015,02/12/2015,,...,0,1,N,SELECTIVE WAY INS CO,1A. PRIVATE,,,Y,N,ERIE
4,4035874,Workers Compensation,SYRACUSE,$0.00,NO FURTHER ACTION,UNKNOWN,0,07/14/2015,07/02/2015,,...,0,1,N,STATE INSURANCE FUND,2A. SIF,,,Y,N,ONEIDA


In [3]:
# Inspect the shape of the data.
# We have 3257907 rows and 51 columns (instances and features, respectively).
data.shape

(3257907, 51)

In [4]:
# Percentage of missing values per column: null count divided by the total
# number of rows, scaled to 0-100.
# To plot only the attributes with missing values, assign this result to a
# name, keep the entries that are > 0, and call .plot(kind='bar') on them.
import seaborn as sns
(
    data.isnull()
    .sum()
    .div(data.shape[0])
    .mul(100)
)



Claim Identifier                       0.000000
Claim Type                             0.000000
District Name                          0.000000
Average Weekly Wage                    0.000000
Current Claim Status                   0.000000
Claim Injury Type                      0.000000
Age at Injury                          0.000000
Assembly Date                          0.000000
Accident Date                          0.507289
ANCR Date                             39.504995
Controverted Date                     90.670882
Section 32 Date                       95.400084
PPD Scheduled Loss Date               87.357036
PPD Non-Scheduled Loss Date           97.360054
PTD Date                              99.870899
First Appeal Date                     95.728914
WCIO Part Of Body Code                 0.000000
WCIO Part Of Body Description          0.000000
WCIO Nature of Injury Code             0.000000
WCIO Nature of Injury Description      0.000000
WCIO Cause of Injury Code              0

We see that many attributes have almost 100% of their data missing.

In [7]:
# Share of rows per 'Accident' value, relative to the total number of rows.
# About 98% of the rows are 'Y', so this column is a candidate for dropping.
data['Accident'].value_counts().div(len(data['Accident']))

Y    0.981451
N    0.018549
Name: Accident, dtype: float64

In [8]:
# Now we inspect data and decide what columns to drop
#data[(data['Accident'] != 'Y') & (data['Attorney/Representative'] == 'Y')]
#data = data.drop(['Claim Identifier', 'Assembly Date','ANCR Date','Carrier Name','IME-4 Count','Interval Assembled to ANCR'])

In [9]:
# Keep only the attributes considered as model inputs, plus the target
# ('Attorney/Representative'), and preview the first rows.
# Author's note: these columns have no empty values; every row holds some
# value (placeholders such as 'UK' / 'UNKNOWN' do appear in the preview).
data_input = data[[
    'Claim Type',
    'District Name',
    'Claim Injury Type',
    'WCIO Part Of Body Code',
    'WCIO Part Of Body Description',
    'WCIO Nature of Injury Code',
    'WCIO Cause of Injury Code',
    'Medical Fee Region',
    'Hearing Count',
    'Carrier Type',
    'Occupational Disease',
    'Attorney/Representative',
]]
data_input.head(10)

Unnamed: 0,Claim Type,District Name,Claim Injury Type,WCIO Part Of Body Code,WCIO Part Of Body Description,WCIO Nature of Injury Code,WCIO Cause of Injury Code,Medical Fee Region,Hearing Count,Carrier Type,Occupational Disease,Attorney/Representative
0,Workers Compensation,NYC,NON-COMP,UK,UNKNOWN,UK,UK,IV,0,3A. SELF PUBLIC,N,N
1,Workers Compensation,NYC,NON-COMP,UK,UNKNOWN,UK,UK,IV,4,UNKNOWN,N,Y
2,Workers Compensation,NYC,NON-COMP,UK,UNKNOWN,UK,UK,IV,0,3A. SELF PUBLIC,N,N
3,Workers Compensation,BUFFALO,UNKNOWN,37,THUMB,40,16,I,0,1A. PRIVATE,N,N
4,Workers Compensation,SYRACUSE,UNKNOWN,30,MULTIPLE UPPER EXTREMITIES,52,98,II,0,2A. SIF,N,N
5,Workers Compensation,SYRACUSE,UNKNOWN,19,FACIAL BONES,40,09,I,0,2A. SIF,N,N
6,Workers Compensation,SYRACUSE,MED ONLY,16,TEETH,59,76,I,0,3A. SELF PUBLIC,N,N
7,Workers Compensation,SYRACUSE,MED ONLY,55,ANKLE,10,75,I,0,4A. SELF PRIVATE,N,N
8,Workers Compensation,SYRACUSE,MED ONLY,36,FINGER(S),40,16,I,0,4A. SELF PRIVATE,N,N
9,Workers Compensation,NYC,NON-COMP,UK,UNKNOWN,UK,UK,IV,1,3A. SELF PUBLIC,N,N


In [10]:
#for column in data_input.columns:
 #   print(data_input[column].value_counts())
    

In [8]:
# Count claims without (N) and with (Y) an attorney/representative.
# The counts are looked up by label: value_counts() orders its result by
# frequency, so positional unpacking would silently swap N and Y if 'Y'
# were ever the more frequent value.
attorney_counts = data['Attorney/Representative'].value_counts()
N, Y = attorney_counts['N'], attorney_counts['Y']
# Fraction of claims without an attorney: about 67.7% do not hire one and
# the remaining 32.3% do.
N / data.shape[0]

0.67716942196324204

In [9]:
# Convert the selected attributes into integer category codes.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `data` (a chained-assignment hazard whose warning is
# hidden by the notebook-wide warnings filter).
category_col = data[['Claim Type', 'Claim Injury Type','WCIO Part Of Body Code', 'WCIO Part Of Body Description','WCIO Nature of Injury Code','WCIO Cause of Injury Code','Medical Fee Region','Hearing Count','Carrier Type','Occupational Disease','Attorney/Representative']].copy()
for col in category_col.columns:
    # np.unique returns the sorted distinct values; the inverse index maps
    # every row to the position of its value in that sorted array.
    uniques, codes = np.unique(data[col], return_inverse=True)
    category_col[col] = codes
# Preview a few rows instead of rendering the whole frame.
category_col.head(10)

Unnamed: 0,Claim Type,Claim Injury Type,WCIO Part Of Body Code,WCIO Part Of Body Description,WCIO Nature of Injury Code,WCIO Cause of Injury Code,Medical Fee Region,Hearing Count,Carrier Type,Occupational Disease,Attorney/Representative
0,3,3,56,46,54,76,3,0,3,0,0
1,3,3,56,46,54,76,3,4,9,0,1
2,3,3,56,46,54,76,3,0,3,0,0
3,3,8,24,43,18,15,0,0,0,0,0
4,3,8,17,33,25,74,1,0,1,0,0
5,3,8,9,11,18,8,0,0,1,0,0
6,3,2,6,42,30,54,0,0,3,0,0
7,3,2,42,1,5,53,0,0,4,0,0
8,3,2,23,12,18,15,0,0,4,0,0
9,3,3,56,46,54,76,3,1,3,0,0


In [10]:
# Features are every column except the last one; the last column
# ('Attorney/Representative' in the encoded frame) is the target.
X, y = category_col.iloc[:, :-1], category_col.iloc[:, -1]



In [11]:
# Split into training and test sets (70% / 30%).
# A fixed random_state makes the shuffle, and every metric computed from this
# split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Train a random forest (150 trees, Gini criterion, at least 20 samples to
# split a node) on the training set and predict the test set.
# random_state is fixed so the fitted forest and the reported scores are
# reproducible from run to run.
clf = RandomForestClassifier(n_estimators=150, min_samples_split=20, criterion='gini', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report macro-averaged F1, precision and recall on the test set.
print("F1 for Random Forest :" + str(f1_score(y_test, y_pred, average="macro")))
print("Precison for Random Forest :" + str(precision_score(y_test, y_pred, average="macro")))
print("Recall for Random Forest :" + str(recall_score(y_test, y_pred, average="macro"))  )




F1 for Random Forest :0.870393580051
Precison for Random Forest :0.88222112152
Recall for Random Forest :0.861168365909


In [14]:
# Confusion matrix of the test-set labels against the random-forest predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[622764,  38656],
       [ 69263, 246690]])