# Feel the Rhythm

[Challenge Link](https://unearthed.solutions/u/competitions/110/description)


## Install Libraries 

In [None]:
!pip install -q xgboost tensorflow_decision_forests

In [1]:
from collections import Counter
import os
import pickle
import random
import string 
import time

import numpy as np
import pandas as pd
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import GroupShuffleSplit

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

## Download Dataset

In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
!cp /gdrive/MyDrive/public.csv.zip .

In [None]:
!unzip -q public.csv.zip

## PREPROCESSING

In [3]:
wa_hols = [
    "2009-01-01",
    "2009-01-26",
    "2009-03-02",
    "2009-04-10",
    "2009-04-11",
    "2009-04-12",
    "2009-04-13",
    "2009-04-25",
    "2009-04-27",
    "2009-06-01",
    "2009-09-28",
    "2009-12-25",
    "2009-12-26",
    "2009-12-28",
    "2010-01-01",
    "2010-01-26",
    "2010-03-01",
    "2010-04-02",
    "2010-04-05",
    "2010-04-26",
    "2010-06-07",
    "2010-09-27",
    "2010-12-25",
    "2010-12-26",
    "2010-12-27",
    "2010-12-28",
    "2011-01-01",
    "2011-01-26",
    "2011-03-07",
    "2011-04-22",
    "2011-04-25",
    "2011-04-26",
    "2011-06-06",
    "2011-10-28",
    "2011-12-25",
    "2011-12-26",
    "2011-12-27",
    "2012-01-01",
    "2012-01-02",
    "2012-01-26",
    "2012-03-05",
    "2012-04-06",
    "2012-04-09",
    "2012-04-25",
    "2012-06-04",
    "2012-10-01",
    "2012-12-25",
    "2012-12-26",
    "2013-01-01",
    "2013-01-26",
    "2013-03-04",
    "2013-03-29",
    "2013-04-01",
    "2013-04-25",
    "2013-06-03",
    "2013-09-30",
    "2013-12-25",
    "2013-12-26",
    "2014-01-01",
    "2014-01-27",
    "2014-03-03",
    "2014-04-18",
    "2014-04-19",
    "2014-04-21",
    "2014-04-25",
    "2014-06-02",
    "2014-09-29",
    "2014-12-25",
    "2014-12-26",
    "2015-01-01",
    "2015-01-26",
    "2015-03-02",
    "2015-04-03",
    "2015-04-04",
    "2015-04-06",
    "2015-04-25",
    "2015-04-27",
    "2015-06-01",
    "2015-09-28",
    "2015-12-25",
    "2016-01-01",
    "2016-01-26",
    "2016-03-07",
    "2016-03-25",
    "2016-03-28",
    "2016-04-25",
    "2016-06-06",
    "2016-09-26",
    "2016-12-25",
    "2016-12-26",
    "2016-12-27",
    "2017-01-01",
    "2017-01-02",
    "2017-01-26",
    "2017-03-06",
    "2017-04-14",
    "2017-04-17",
    "2017-04-25",
    "2017-06-05",
    "2017-09-25",
    "2017-12-25",
    "2017-12-26",
    "2018-01-01",
    "2018-01-26",
    "2018-03-05",
    "2018-03-30",
    "2018-04-02",
    "2018-04-25",
    "2018-06-04",
    "2018-09-24",
    "2018-12-25",
    "2018-12-26",
    "2019-01-01",
    "2019-01-28",
    "2019-03-04",
    "2019-04-19",
    "2019-04-22",
    "2019-04-25",
    "2019-06-03",
    "2019-09-30",
    "2019-12-25",
    "2019-12-26",
    "2020-01-01",
    "2020-01-27",
    "2020-03-02",
    "2020-04-10",
    "2020-04-13",
    "2020-04-25",
    "2020-04-27",
    "2020-06-01",
    "2020-09-28",
    "2020-12-25",
    "2020-12-26",
    "2020-12-28",
    "2021-01-01",
    "2021-01-26",
    "2021-03-01",
    "2021-04-02",
    "2021-04-05",
    "2021-04-25",
    "2021-04-26",
    "2021-06-07",
    "2021-09-27",
    "2021-12-25",
    "2021-12-26",
    "2021-12-27",
    "2021-12-28",
]

work_desc = {"TRAINING": "TRAINING", 
             "ADMIN": "ADMIN", 
             "TCS POLE DISTRIBUTION": "TCS POLE DISTRIBUTION",
             "TCS POLE DISTRIBUTION TFIXED": "TCS POLE DISTRIBUTION", 
             "NC DISTRIBUTION STANDARD JOBS": "NC DISTRIBUTION STANDARD JOBS", 
             "TCS CONDUCTOR": "TCS CONDUCTOR", 
             "SAFETY": "SAFETY", 
             "HOUSEKEEPING": "HOUSEKEEPING", 
             "TCS NP NO POWER": "TCS NP NO POWER", 
             "DATA MAINTENENCE": "DATA MAINTENENCE", 
             "NC TRANSMISSION STANDARD JOBS": "NC TRANSMISSION STANDARD JOBS", 
             "VISUAL MANAGEMENT": "VISUAL MANAGEMENT", 
             "DOWNTIME": "DOWNTIME", 
             "DATA MANAGEMENT": "DATA MANAGEMENT", 
             "TCS PB POLE BROKENDAMAGED": "TCS PB POLE BROKENDAMAGED", 
             "TEMPORARY DISCONNECTION ONLY": "TEMPORARY DISCONNECTION ONLY", 
             "NCIMP CONTINUOUS IMPROVEMENT": "NCIMP CONTINUOUS IMPROVEMENT", 
             "TEN HOUR BREAK": "TEN HOUR BREAK", # can probably set target to False
             "TCS LV CABLE UNDERGROUND": "TCS LV CABLE UNDERGROUND", 
             "GENERAL TRAINING TRAVEL": "GENERAL TRAINING TRAVEL", 
             "PROJECT SCOPING": "PROJECT SCOPING", 
             "TCS DISTRIBUTION TRANSFORMER": "TCS DISTRIBUTION TRANSFORMER", 
             "TCS RT RECLOSER TRIP": "TCS RT RECLOSER TRIP", 
             "HIGH LOAD SURVEY METRO FUTURE GRID": "HIGH LOAD SURVEY METRO FUTURE GRID", 
             "TCS HV CABLE UNDERGROUND": "TCS HV CABLE UNDERGROUND", 
             "FAULT READY": "FAULT READY", 
             "TCS ES ELECTRIC SHOCK": "TCS ES ELECTRIC SHOCK", 
             "NEW CONNECTIONS INSTALL METER ENERGISE": "NEW CONNECTIONS INSTALL METER ENERGISE", 
             "TCS PP PART POWER": "TCS PP PART POWER", 
            #  "LEVEL 3 EVENT WAROONA BFIRES JAN 2016": "LEVEL 3 EVENT WAROONA BFIRES JAN 2016", 
             "TCS DOFT DROP OUT FUSE TRIP": "TCS DOFT DROP OUT FUSE TRIP", 
             "METER RECONFIGURATION": "RECONFIGURATION",
             "PROTECTION COMMISSIONING": "PROTECTION COMMISSIONING", 
             "XA21 SUPPORT 201314 LABOUR": "LABOUR SUPPORT", 
             "MISCELLANEOUS HAZARD": "HAZARD", 
             "MISCELLANEOUS NON HAZARD": "NON HAZARD", 
             "POLICYSTRATEGY": "POLICY STRATEGY", 
             "TCS MINI PILLAR": "TCS MINI PILLAR", 
             "TCS DROP OUT FUSE": "TCS DROP OUT FUSE", 
             "TCS RC RECONNECTION": "TCS RC RECONNECTION", 
             "METER REPLACEMENT INTERNAL": "METER REPLACEMENT INTERNAL", 
             "ASSET INFORMATION SYSTEMS": "ASSET INFORMATION SYSTEMS",
             "XA21 SUPPORT 201314 LABOUR": "XA21 SUPPORT 201314 LABOUR", 
             "DX VEGETATION RESPONSE MET": "DX VEGETATION RESPONSE MET", 
             "TCS LV CROSS ARM": "TCS LV CROSS ARM", 
             # SELF ADDED
             "DESIGN CONNECTION ASSETS": "DESIGN CONNECTION ASSETS",
             "NETWORK PLANNING": "NETWORK PLANNING",
             "DESIGN INTERNAL": "DESIGN INTERNAL",
             "REPLACE WOOD POLE PWOD/PINT": "REPLACE WOOD POLE PWOD/PINT",
             "DESIGN SYSTEM ASSETS": "DESIGN SYSTEM ASSETS",
             "REPAIR EARTH PWOD/PINT": "REPAIR EARTH PWOD/PINT",
             "PROJECT PLANNING": "PROJECT PLANNING",
             "NETWORK FULL INSPECTION": "NETWORK FULL INSPECTION",
             "REINFORCE WOOD POLE": "REINFORCE WOOD POLE",
             "REPLACE OVERHEAD SERVICE": "REPLACE OVERHEAD SERVICE",
             "DESIGN WORK ORDER": "DESIGN WORK ORDER", 
             "TRAINING": "TRAINING", 
             "GENERAL": "GENERAL", 
             "PROJECT MANAGEMENT": "PROJECT MANAGEMENT", 
             "PROJECT PLANNING": "PROJECT PLANNING",
             "PROJECT ESTIMATION": "PROJECT ESTIMATION", 
             "ADMIN WORK ORDER": "ADMIN WORK ORDER", 
             "TCS POLE DISTRIBUTION": "TCS POLE DISTRIBUTION", 
             "OFFSITE LABOUR COST": "OFFSITE LABOUR COST", 
             "CIVIL CONSTRUCTION": "CIVIL CONSTRUCTION", 
             "LINE CONSTRUCTION": "LINE CONSTRUCTION", 
             "ELECTRICAL CONSTRUCTION": "ELECTRICAL CONSTRUCTION", 
             "CONSTRUCTION MANAGER": "CONSTRUCTION MANAGER", 
             "CONSTRUCTION": "CONSTRUCTION", 
             "NETWORK PLANNING": "NETWORK PLANNING", 
             "DATA MAINTAINENCE": "DATA MAINTAINENCE", 
             "DATA WAREHOUSE": "DATA WAREHOUSE", 
             "DATA ANALYSIS": "DATA ANALYSIS", 
             "ELECTRICAL WORK": "ELECTRICAL WORK", 
             "UNDERGROUND": "UNDERGROUND", 
             "DESIGN INTERNAL": "DESIGN INTERNAL", 
             "INTERNAL DESIGN": "DESIGN INTERNAL", 
             "NETWORK SUPPORT": "NETWORK SUPPORT", 
             "STATION NETWORK": "STATION NETWORK", 
             "OFFSITE": "OFFSITE", 
             "MEETING": "MEETING", "CONTRACT": "CONTRACT", 
             "RETAILER": "RETAILER REQUESTED", "OVERHEAD": "OVERHEAD",
             "TRANSMISSION": "TRANSMISSION", 
             "SWITCHING": "ANNUAL SWITCHING", 
             "HAZARD": "REMOVE HAZARD FROM CONDUCTOR", 
             "ELECTRICAL": "ELECTRICAL WORK",  
             "DELIVERY": "PROJECT DELIVERY", 
             "ADMINISTRATION": "ADMININSTRATION", 
             "EMERGENCY": "EMERGENCY RESCUE", 
             "INCIDENT REPORTING": "INCIDENT REPORTING",
             "INCIDENT INVESTIGATION": "INCIDENT REPORTING", 
             "OPERATIONS": "OPERATIONS", 
             "FLEET": "FLEET WORK", "CIVIL": "CIVIL WORK", 
             "SUPPLY ABOLISHMENT": "SUPPLY ABOLISHMENT", 
             "RISK ANALYSIS": "RISK ANALYSIS", 
             "RISK MANAGEMENT": "RISK ANALYSIS", 
             "RISK MODELLING": "RISK ANALYSIS",
             "COMMUNITY": "COMMUNITY WORK", "FITTING": "FITTING WORK", 
             "CONTRACTORS": "CONTRACTORS TIMESHEET", 
             "VOLTAGE LOW" : "VOLTAGE LOW & FLUCTUATION", 
             "VOLTAGE FLUCTUATION": "VOLTAGE LOW & FLUCTUATION",
             "OVERTIME": "OVERTIME", "RAILWAY": "RAILWAY WORK", 
             "OIL SPILL RESPONSECLEANUP": "CLEAN", 
             "LIVE INSULATOR CLEAN": "LIVE INSULATOR CLEAN", 
             "SECURITY": "COMPUTER SECURITY", "INSPECTION": "INSPECTION", 
             "STREETLIGHT": "REPLACE STREETLIGHT", 
             "BOND HARDWARE": "BOND HARDWARE", "DECOMISSION": "DECOMISSION", 
             "METER READING": "METER READING", "PAYROLL": "PAYROLL", 
             "AUTOMATION": "AUTOMATION PROJECT", "BUSINESS": "BUSINESS TEAM", 
             "COMMUNICATIONS": "METER COMMUNICATIONS", 
             "INSTALLATION": "INSTALLATION"
             }

In [4]:
def add_time_feat(data):
    data["hour"] = data["Work_DateTime"].dt.hour
    data["year"] = data["Work_DateTime"].dt.year
    data["month"] = data["Work_DateTime"].dt.month
    data["date"] = data["Work_DateTime"].dt.date
    data["day_of_week"] = data["Work_DateTime"].dt.dayofweek
    return data

In [5]:
def get_shift_hour(group):
    group["shift_hour"] = group["Work_DateTime"] - group.iloc[0]["Work_DateTime"]
    return group

In [6]:
def get_date_group(group):
    return group.groupby("date").apply(get_shift_hour)

In [7]:
def preprocess(data_file):
    print(f"LOADING DATA")
    df = pd.read_csv(data_file)

    print("\nADD TIME FEATURES")
    df["Work_DateTime"] = pd.to_datetime(df["Work_DateTime"], errors="coerce")
    df = add_time_feat(df)

    print('\nENCODING "TIME_TYPE" & "FUNC_CAT"')
    df["TIME_TYPE"] = df["TIME_TYPE"].replace({"Normal Time": 0, "Overtime": 1})
    df["FUNC_CAT"] = df["FUNC_CAT"].replace({"Operational": 1, "Network or Asset": 2, "Support": 3})

    print("\nWESTERN AUS PUBLIC HOLIDAY")
    df["holiday"] = df["Work_DateTime"].dt.round("1D").isin(wa_hols)

    print("\nPERIOD OF THE DAY")
    bins = [0, 4, 8, 12, 16, 20, 24]
    labels = [1, 2, 3, 4, 5, 6]
    df["period"] = pd.cut(df["Work_DateTime"].dt.hour, bins=bins, labels=labels, include_lowest=True)
    df["period"] = pd.to_numeric(df["period"], errors="coerce")

    print("\nSEASON OF THE YEAR")
    df.loc[df["month"].isin([12, 1, 2]), "season"] = 1
    df.loc[df["month"].isin([3, 4, 5]), "season"] = 2
    df.loc[df["month"].isin([6, 7, 8]), "season"] = 3
    df.loc[df["month"].isin([9, 10, 11]), "season"] = 4

    print("\nGAP BETWEEN WORKING DAYS")
    df["gap"] = df.groupby("EmpNo_Anon")["date"].diff().dt.days

    print('\n"Work Description" - Mapping & Encoding"')
    start = time.time()
    df["WORK_DESC"] = df["WORK_DESC"].str.strip().str.upper()
    for key, value in work_desc.items():
        df.loc[df["WORK_DESC"].str.contains(key), "WORK_DESC"] = value
    df.loc[~df["WORK_DESC"].isin(list(work_desc.keys())), "WORK_DESC"] = "OTHERS"
    mapping = dict(zip(list(work_desc.values()), list(range(len(work_desc)))))
    df["WORK_DESC"] = df["WORK_DESC"].replace(mapping)
    df["WORK_DESC"] = df["WORK_DESC"].replace({"OTHERS": 999})
    print(f"Runtime - {time.time() - start}s")

    print("\nSHIFT HOUR CALCULATION")
    start = time.time()
    df = df.groupby("EmpNo_Anon").apply(get_date_group)
    print(f"Runtime - {time.time() - start}s")

    return df[input_cols + ["incident"]]

In [None]:
%%time
df = preprocess("../../incident-insights/data/public/public.csv")
df.to_csv("../data/processed_data.csv", index=False)

LOADING DATA


  """Entry point for launching an IPython kernel.



ADD TIME FEATURES

ENCODING "TIME_TYPE" & "FUNC_CAT"

WESTERN AUS PUBLIC HOLIDAY

PERIOD OF THE DAY

SEASON OF THE YEAR

GAP BETWEEN WORKING DAYS

"Work Description" - Mapping & Encoding"
Runtime - 525.3639886379242s

SHIFT HOUR CALCULATION


In [None]:
!cp processed_data.csv /gdrive/MyDrive/

In [None]:
input_cols = ["WORK_DESC", "TIME_TYPE", "FUNC_CAT", 
              "TOT_BRK_TM", "hour", "day_of_week", 
              "month", "year", "holiday", "period", 
              "season", "gap", "shift_hour"]

target_cols = ["incident"]

## Train Test Split

Stratification & Group Split together

In [None]:
true_employees = df.loc[df["Incident"]==True]["Emp_No"].tolist()
false_employees = df.loc[df["Incident"]==False]["Emp_No"].tolist()

Randomly pick 20% of employees from both splits & consider them as Validation data. Rest as train data

In [None]:
true_count = int(0.2*len(true_employees))
false_count = int(0.2*len(false_employees))

In [None]:
valid_true_emp = random.sample(true_employees, k=true_count)
valid_false_emp = random.sample(true_employees, k=false_count)

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
df = pd.read_csv("processed_data.csv")

In [None]:
y_train = df[target_columns]
X_train = df.drop(columns=target_columns)

### Training

In [None]:
model = xgb.XGBClassifier(tree_method="hist")
model.fit(X_train, y_train)

In [None]:
with open("xgboost.pkl", "wb") as file:
    pickle.dump(model, file)

### Evaluation

In [None]:
with open("xgboost.pkl", "rb") as file:
    model = pickle.load(file)

In [None]:
y_pred = model.predict(processed_data[input_cols])
y_pred_proba = model.predict_proba(processed_data[input_cols])

In [None]:
y_pred.shape, y_pred_proba.shape

((12354494,), (12354494, 2))

In [None]:
class0 = pred_proba[:, 0]
class1 = pred_proba[:, 1]
final = pd.DataFrame.from_dict({"Class 0": class0, "Class 1": class1})

In [None]:
final.describe()

Unnamed: 0,Class 0,Class 1
count,12354490.0,12354490.0
mean,0.9999999,5.912832e-05
std,6.168339e-05,1.836057e-05
min,0.9996749,3.126473e-05
25%,0.9999241,4.071251e-05
50%,0.9999316,6.843024e-05
75%,0.9999593,7.593719e-05
max,0.9999687,0.0003251192


In [None]:
roc_auc_score(processed_data["incident"], y_pred)

0.5

In [None]:
roc_auc_score(processed_data["incident"], class1)

0.675003964838659

## Imbalanced XGBoost


In [None]:
# !pip install -q imbalance-xgboost

In [None]:
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

In [None]:
y_train = processed_data[target_columns].values
X_train = processed_data.drop(columns=target_columns).values

In [None]:
model = imb_xgb(special_objective="focal", focal_gamma=2.)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict_determine(processed_data[input_cols]) 
y_pred_proba = model.predict(processed_data[input_cols])

In [None]:
pd.Series(y_pred_proba).describe()

In [None]:
roc_auc_score(processed_data["incident"], y_pred)

In [None]:
roc_auc_score(processed_data["incident"], y_pred_proba)

## HistGradientBoosting

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
y_train = processed_data[target_columns].values
X_train = processed_data.drop(columns=target_columns).values

In [None]:
model = HistGradientBoostingClassifier()

In [None]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


HistGradientBoostingClassifier(l2_regularization=0.0, learning_rate=0.1,
                               loss='auto', max_bins=255, max_depth=None,
                               max_iter=100, max_leaf_nodes=31,
                               min_samples_leaf=20, n_iter_no_change=None,
                               random_state=None, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False)

In [None]:
y_pred = model.predict(processed_data[input_cols])
y_pred_proba = model.predict_proba(processed_data[input_cols])

In [None]:
y_pred.shape, y_pred_proba.shape

((12354494,), (12354494, 2))

In [None]:
class0 = pred_proba[:, 0]
class1 = pred_proba[:, 1]
final = pd.DataFrame.from_dict({"Class 0": class0, "Class 1": class1})

In [None]:
final.describe()

Unnamed: 0,Class 0,Class 1
count,12354490.0,12354490.0
mean,0.9999999,5.912832e-05
std,6.168339e-05,1.836057e-05
min,0.9996749,3.126473e-05
25%,0.9999241,4.071251e-05
50%,0.9999316,6.843024e-05
75%,0.9999593,7.593719e-05
max,0.9999687,0.0003251192


In [None]:
roc_auc_score(processed_data["incident"], y_pred)

0.5

In [None]:
roc_auc_score(processed_data["incident"], class1)

0.675003964838659

## LightGBM

## Tensorflow Decision Forests (Gradient Boosting)

In [None]:
import tensorflow_decision_forests as tfdf

In [None]:
processed_data["holiday"] = processed_data["holiday"].replace({False: 0, True: 1})

In [None]:
dataset = tfdf.keras.pd_dataframe_to_tf_dataset(processed_data[input_cols+target_columns], 
                                                label="incident")

In [None]:
model = tfdf.keras.GradientBoostedTreesModel()
model.fit(dataset)

In [None]:
y_pred = model.predict(processed_data[input_cols])