###### Notes
If enough time, make visualizations for the time variables, map visualizations for location variables and try to predict "Agreement Reached"

# Importing libraries

In [380]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [381]:
low_memory = False

# Importing the datasets

In [383]:
data = pd.read_csv("C:\\Users\\gonca\\Downloads\\project_data\\train_data.csv")
data = data.copy()
data_test = pd.read_csv("C:\\Users\\gonca\\Downloads\\project_data\\test_data.csv")

  data = pd.read_csv("C:\\Users\\gonca\\Downloads\\project_data\\train_data.csv")


# Exploratory analysis

In [None]:
#Visualize our data and all columns
pd.set_option("display.max_columns", None)
data

In [None]:
#Checking the columns data types
data.info()

In [None]:
#Converting all date variables to datatype64 so we can work with them
data["Accident Date"] = pd.to_datetime(data["Accident Date"], 
 format = "%Y-%m-%d", 
 errors = "coerce")

data["Assembly Date"] = pd.to_datetime(data["Assembly Date"], 
 format = "%Y-%m-%d", 
 errors = "coerce")

data["C-2 Date"] = pd.to_datetime(data["C-2 Date"], 
 format = "%Y-%m-%d", 
 errors = "coerce")

data["C-3 Date"] = pd.to_datetime(data["C-3 Date"], 
 format = "%Y-%m-%d", 
 errors = "coerce")

data["First Hearing Date"] = pd.to_datetime(data["First Hearing Date"], 
 format = "%Y-%m-%d", 
 errors = "coerce")

In [None]:
#Checking for incoherences in the dates
print((data["Accident Date"] > data["Assembly Date"]).sum()) 
print((data["Accident Date"] > data["C-2 Date"]).sum())
print((data["Accident Date"] > data["C-3 Date"]).sum())
print((data["Accident Date"] > data["First Hearing Date"]).sum())

There are several inconsitencies that we will have to remove in the next step

In [None]:
#Checking statistics for our numerical features (except the codes but its still usefull to see their count)
data.describe().round(2).T

In [None]:
#Checking statistics for our categorical features
data.describe(include=["O"]).T

We can conclude that: <br>
There are a lot of missing values in _C-3 Date_, _First Hearing Date_, _IME-4 Count_ and _OIICS Nature of Injury Description_. <br>
There are no values for _OIICS Nature of Injury Description_. <br>
_Birth Year_ has a minimum value of 0 which is obviously a missing value since no one as we know of lived for more than 122.5 years <br>
There is also another that shouldn't have 0's as their minimum such as _Average Weekly Wage_  but we will be able to detect more of these anomalies in the visualization <br>
We only have three binary variables, _Attorney/Representative_ , _COVID-19 Indicator_ and _Agreement Reached_ <br>
We have a univarite variable that is _WCB Decision_ that only has the value Not Work Related


In [None]:
#Checking the percentage of NA per variable in descending order

print((data.isna().sum() / data.shape[0] *100).sort_values(ascending=False))

There are 4 variables with more than half of their values missing (as we already expected), and only 2 that don't have missing values <br>
The rest of our data has around 5% missing values which we will have to deal with in the next step
Also we have a lot of variables that have the exact same number of missing values which means they probably have missing values in the same rows but we will have to check

In [None]:
#Checking if the following variables have all their missing values in the same rows:
data[data["Gender"].isna()][["Gender","Age at Injury","District Name","COVID-19 Indicator","Medical Fee Region","County of Injury","Claim Injury Type","Carrier Type","Carrier Name","Attorney/Representative","Alternative Dispute Resolution","Agreement Reached","WCB Decision","Number of Dependents"]].info()
data[data["WCIO Part Of Body Code"].isna()][["WCIO Part Of Body Code","WCIO Part Of Body Description"]].info()
data[data["WCIO Nature of Injury Code"].isna()][["WCIO Nature of Injury Code","WCIO Nature of Injury Description"]].info()
data[data["WCIO Cause of Injury Code"].isna()][["WCIO Cause of Injury Code","WCIO Cause of Injury Description"]].info()
data[data["Industry Code"].isna()][["Industry Code","Industry Code Description"]].info()

All the missing values that are in variable codes are in the respective variable descriptions too, showing no inconsistencies <br>
What is interesting is that there are 14 variables that show missing values in all the same rows and they aren't dependent of each other

### Visualizing the data

###### Making visualization functions for different plots

In [None]:
def plot_bar(col, degrees = 0):
    
    column = data[col].value_counts()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=column.index.astype(str), y=column.values)
    plt.xlabel(f"{col}")
    plt.ylabel("Frequency")
    plt.xticks(rotation=degrees)  
    plt.show()

In [None]:
def plot_hist(col, rotate = False):
    column = data[col].value_counts()
    plt.figure(figsize=(10, 6))
    sns.histplot(data[col], bins=30, kde = True)
    plt.xlabel(f"{col}")
    plt.ylabel("Frequency")
    if rotate:
        plt.xticks(rotation=45)  
    plt.show()

In [None]:
def plot_box(col):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=data[col])
    plt.xlabel(f"{col}")
    plt.show()  

In [None]:
def plot_pie(col):
    value_counts = data[col].value_counts()
    total = value_counts.sum()
    percentages = (value_counts / total) * 100

    above_threshold = percentages[percentages >= 2]
    below_threshold = percentages[percentages < 2]

    if len(below_threshold) > 0:
        other_percentage = below_threshold.sum()
        above_threshold['Others'] = other_percentage

    plt.figure(figsize=(8, 8))
    above_threshold.plot.pie(
        autopct='%1.1f%%', 
        startangle=90, 
        labels=above_threshold.index
    )
    plt.ylabel("")
    plt.xlabel(f"{col}")
    plt.legend(title="Legend", loc='upper left')  
    plt.show()


In [None]:
def plot_heatmap(numerical):
    plt.figure(figsize=(12,10))
    sns.heatmap(data = data[numerical].corr(method = 'spearman'), annot = True, cmap = "coolwarm", fmt='.1')
    plt.show()

###### Saving the columns with numerical features, categorical features and the target

In [None]:
num_features = ['Accident Date', 'Age at Injury', 'Assembly Date',
       'Average Weekly Wage', 'Birth Year', 'C-2 Date', 'C-3 Date',
        'First Hearing Date', 'IME-4 Count', 'Number of Dependents']

In [None]:
cat_features = ['Alternative Dispute Resolution', 'Attorney/Representative',
       'Carrier Name', 'Carrier Type','County of Injury',
       'COVID-19 Indicator', 'District Name', 'Gender','Industry Code',
       'Industry Code Description', 'Medical Fee Region','WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code', 
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code', 'Agreement Reached',
       'WCIO Part Of Body Description', 'Zip Code', 'WCB Decision']

In [None]:
target = ["Claim Injury Type"]

###### Finally seeing plots of our data minus date variables and categorical ones with lots of variables

In [None]:
plot_hist("Age at Injury")
plot_bar("Alternative Dispute Resolution")
plot_pie("Attorney/Representative")
plot_box("Average Weekly Wage")
plot_hist("Birth Year")
plot_pie("Carrier Type")
plot_bar("Claim Injury Type", 45)
plot_pie("COVID-19 Indicator")
plot_bar("District Name")
plot_bar("Gender")
plot_bar("IME-4 Count", 45)
plot_box("IME-4 Count")
plot_pie("Medical Fee Region")
plot_pie("Agreement Reached")
plot_pie("Number of Dependents")
plot_heatmap(num_features)


### Conclusions
We can see in Age at _Injury_ there are a lot of 0 that are impossible but there are also a few values below 18 and above 80 that should also be impossible. <br>
Almost all values in  _Alternative Dispute Resolution_ are N and there are practically no U's (5 observations) <br>
_Average Weekly Wage_ has some very extreme outliers <br>
_Birth Year_ suffers from the same problem as Age as expected and has a lot of 0's <br>
Half of the possible values of _Carrier Type_ have very few observation and are either unkown or special funds <br>
_Gender_ has very rare categories such as U and X that only has 46 observations <br>
_IME-4_ seems to have a Half Normal Distribution and has an outlier at 73.0 <br>
_Number of Dependents_ weirdly has around the same number of variables for each amount between 0 and 6 which doesn't mimic the population <br>
The numerical variables that are highly correlated with each other are Age at Injury and Birth Year as expected, Assembly Date and Aciddent Date, Accident Date and C-2 Date, C-2 Date and Assembly date. <br>

# Data Pre-processment

In [None]:
#Defining Claim identifier as the index
data.set_index('Claim Identifier', inplace = True)

In [None]:
# Dropping variables that are definitely useless from our dataset
# The first variable has 100% missing values and the second only contains one type of value,
# meaning it doesn't provide any useful information
data = data.drop(["OIICS Nature of Injury Description", "WCB Decision"], axis=1)
cat_features.remove('WCB Decision')

In [None]:
# Dropping "Agreement Reached" because it is not on the validation dataset
#Later we can try to predict this column and then predict the target but for now lets drop ot
data = data.drop(['Agreement Reached'], axis=1)
cat_features.remove("Agreement Reached")

In [None]:
#Remove rows where the target variable is NaN
data.dropna(axis = 0 , subset=["Claim Injury Type"],inplace = True)

In [None]:
#Removing duplicates
data.drop_duplicates(inplace = True)

### Spliting the data

In [None]:
X = data.drop(["Claim Injury Type"], axis = 1)

In [None]:
y = data["Claim Injury Type"]

In [None]:
X_train, X_validation,y_train, y_validation = train_test_split(X,y,
                                                               train_size = 0.75, 
                                                               shuffle = True, 
                                                               stratify = y)

### Removing inconsistencies

In [None]:
#Remove the rows where the age of the injury is greater than 75 or between 1 and 17
X_train = X_train[(data['Age at Injury'] > 75) | ((data["Age at Injury"] <18) & (data["Age at Injury"] > 0))]

In [None]:
#Remove Changed Dates 
def is_date_order_correct(row):
    accident_date = row["Accident Date"]
    assembly_date = row["Assembly Date"]
    first_hearing_date = row["First Hearing Date"]
    
    if pd.notna(accident_date) and pd.notna(assembly_date):
        if accident_date > assembly_date:
            return False
    
    if pd.notna(assembly_date) and pd.notna(first_hearing_date):
        if assembly_date > first_hearing_date:
            return False

    return True

X_train = X_train[X_train.apply(is_date_order_correct, axis=1)]


Check for inconsistencies between code columns and their description

In [None]:
# For check  any inconsistencies between codes and descriptions
def inconsistent_pairs(data, code, descripion):
    duplos = data.groupby(code)[descripion].nunique()

    inconsistent_codes = duplos[duplos > 1].index
    inconsistent_rows = data[data[code].isin(inconsistent_codes)]

    return inconsistent_rows

In [None]:
# Chamar a função para cada par de colunas
print(inconsistent_pairs(data, 'Industry Code', 'Industry Code Description'))
print(inconsistent_pairs(data, 'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description'))
print(inconsistent_pairs(data, 'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description'))
print(inconsistent_pairs(data, 'WCIO Part Of Body Code', 'WCIO Part Of Body Description'))

### Missing Values

Categorical Variables

In [None]:
# Replace the missing values in the categorical variables

#We fill missing values in categorical data with missing because if it is not missing at random, it might hold predictive power
existing_cat_features = [col for col in cat_features if col in data.columns]
X_train[existing_cat_features] = X_train[existing_cat_features].fillna("Missing")
X_validation[existing_cat_features] = X_validation[existing_cat_features].fillna("Missing")

In [None]:
num_features = ['Accident Date', 'Age at Injury', 'Assembly Date','Average Weekly Wage', 'Birth Year','C-2 Date', 'C-3 Date', 'First Hearing Date', 'IME-4 Count', 'Number of Dependents']

Numeric Variables

In [None]:
# Decision Tree to input numerical missing values 
def impute_with_decision_tree(data, target_column):
    # Separate our data in the missing and non missing values
    available_data = X_train[X_train[target_column].notna()]
    missing_data = X_train[X_train[target_column].isna()]

    # Make sure we have data to train or input
    if len(available_data) == 0 or len(missing_data) == 0:
        return
    
    # making sure we are using only numerical data
    available_data_numeric = available_data.select_dtypes(include=['float64', 'int64', 'datetime64[ns]']).copy()
    missing_data_numeric = missing_data.select_dtypes(include=['float64', 'int64', 'datetime64[ns]']).copy()

    # Converting date variables into integers
    available_data_numeric = available_data_numeric.apply(lambda x: x.astype('int64') // 10**9 if x.dtype == 'datetime64[ns]' else x)
    missing_data_numeric = missing_data_numeric.apply(lambda x: x.astype('int64') // 10**9 if x.dtype == 'datetime64[ns]' else x)

    # Making sure again if we still have data after filtering our previous data
    if available_data_numeric.empty or missing_data_numeric.empty:
        return

    # Separating the target column from the training ones
    X_available = available_data_numeric.drop(columns=target_column)
    y_available = available_data_numeric[target_column]

    # Training and fiting the model
    model = DecisionTreeRegressor()
    model.fit(X_available, y_available)

    # Predicting the missing variables
    X_missing = missing_data_numeric.drop(columns=target_column)
    predicted_values = model.predict(X_missing)

    # If the column we are feeling is a date variable we convert it back to its type
    if pd.api.types.is_datetime64_ns_dtype(data[target_column]):
        predicted_values = pd.to_datetime(predicted_values, unit='s')  

    # Changing themissing values to the predicted ones
    data.loc[data[target_column].isna(), target_column] = predicted_values

# Filling the missing values
for column in num_features:
    impute_with_decision_tree(X_train, column)
    impute_with_decision_tree(X_validation, column)


