### Adnan Altukleh, Abdulkarim Dawalibi
### Course DV2627

### Importing libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Masking
from tensorflow.keras.regularizers import l1_l2
from sklearn.metrics import classification_report, roc_auc_score
from kerastuner.tuners import RandomSearch
from kerastuner import HyperModel

### Reading Train data & Test data

In [4]:
train_oper=pd.read_csv("train_operational_readouts.csv")
train_tte=pd.read_csv("train_tte.csv")

In [5]:
test_label=pd.read_csv("validation_labels.csv")
test_label["class_label"].value_counts()

class_label
0    4910
4      76
3      30
1      16
2      14
Name: count, dtype: int64

In [6]:
training_data=pd.merge(train_tte,train_oper,on='vehicle_id')

In [7]:
test_data=pd.read_csv("validation_operational_readouts.csv")
test_data=pd.merge(test_data,test_label,on='vehicle_id')
test_data

Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,...,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35,class_label
0,10,3.0,46590.0,3696.0,2038959.0,1450.0,0.0,273826.0,339584.0,99834.0,...,8026.0,855.0,15.0,495.0,91665.0,169125.0,51900.0,9362.0,0.0,0
1,10,7.4,127110.0,16716.0,6501456.0,4660.0,0.0,635642.0,609742.0,288036.0,...,44312.0,7995.0,75.0,2265.0,414180.0,688891.0,258810.0,63167.0,1005.0,0
2,10,8.0,144015.0,19596.0,7327918.0,5230.0,0.0,654780.0,665756.0,300174.0,...,52203.0,9421.0,75.0,2415.0,487080.0,812071.0,300735.0,77477.0,1500.0,0
3,10,12.0,187560.0,24264.0,9286082.0,7420.0,2647.0,831628.0,794332.0,360066.0,...,61399.0,10727.0,75.0,2610.0,655620.0,1118116.0,403516.0,99587.0,1591.0,0
4,10,12.2,187575.0,24264.0,9286082.0,7420.0,,,,,...,61400.0,10727.0,75.0,2611.0,655620.0,1118116.0,403517.0,99588.0,1592.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196222,33625,67.8,1392945.0,2450.0,60748886.0,0.0,0.0,3772792.0,10954147.0,6446205.0,...,23783.0,1735.0,75.0,17827.0,219069.0,609726.0,41630.0,2284.0,60.0,0
196223,33625,73.8,1519110.0,2654.0,66374974.0,0.0,0.0,4062662.0,11906819.0,7121565.0,...,26033.0,1915.0,75.0,20227.0,247374.0,658866.0,44435.0,2480.0,60.0,0
196224,33625,77.4,1590900.0,2690.0,69656097.0,0.0,0.0,4256241.0,12550791.0,7621393.0,...,27594.0,2081.0,75.0,21307.0,263755.0,690606.0,46416.0,2585.0,60.0,0
196225,33625,83.4,1657335.0,2690.0,72538912.0,0.0,0.0,4488515.0,13016809.0,7995291.0,...,29380.0,2201.0,75.0,22462.0,285700.0,716767.0,48952.0,2810.0,60.0,0


In [8]:
test_data.loc[test_data['class_label'] !=0, 'class_label'] = 1

In [9]:
test_data["class_label"].value_counts()

class_label
0    191150
1      5077
Name: count, dtype: int64

### Handling imbalance in the data

In [11]:
data=training_data

In [12]:
test=test_data

In [13]:
#train
size_of_rows=data.groupby("vehicle_id").size().sort_values()
size_of_rows=size_of_rows.to_frame().reset_index().rename(columns={0:"size"})
size_of_rows["in_study_repair"]=size_of_rows["vehicle_id"].apply(lambda x: data[data["vehicle_id"]==x]["in_study_repair"].values[0])

In [14]:
#test
size_of_rows_test=test.groupby("vehicle_id").size().sort_values()
size_of_rows_test=size_of_rows_test.to_frame().reset_index().rename(columns={0:"size"})
size_of_rows_test["class_label"]=size_of_rows_test["vehicle_id"].apply(lambda x: test[test["vehicle_id"]==x]["class_label"].values[0])

In [15]:
#train
size=(size_of_rows.loc[size_of_rows["in_study_repair"]==1]).groupby("size").size()
size=size.to_frame().reset_index().rename(columns={0:"number"})
size=size.sort_values(by="number")

In [16]:
#test
size_test=(size_of_rows_test.loc[size_of_rows_test["class_label"]==1]).groupby("size").size()
size_test=size_test.to_frame().reset_index().rename(columns={0:"number"})
size_test=size_test.sort_values(by="number")

In [17]:
#train
vehicles_train=[]
for i, n in zip(size["size"], size["number"]):
    df = size_of_rows.loc[(size_of_rows["in_study_repair"] == 0) & (size_of_rows["size"] == i)]
    vehicles_train.extend(df["vehicle_id"].head(n).tolist())

In [18]:
#test
vehicles_test=[]
for i, n in zip(size_test["size"], size_test["number"]):
    df_test = size_of_rows_test.loc[(size_of_rows_test["class_label"] == 0) & (size_of_rows_test["size"] == i)]
    vehicles_test.extend(df_test["vehicle_id"].head(n).tolist())

#### Vehicles_train and vehicles_test contain not repair vehicle which contain the same distribution of read out rows and number of vehicles as the repair vehicles (for both train and test data)

In [19]:
he=data.loc[data["vehicle_id"].isin(vehicles_train)]
unhe=data.loc[data["in_study_repair"]==1]
data=pd.concat([he,unhe])

In [20]:
data

Unnamed: 0,vehicle_id,length_of_study_time_step,in_study_repair,time_step,171_0,666_0,427_0,837_0,167_0,167_1,...,397_26,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35
172,2,281.8,0,1.4,65520.0,2226.0,2891746.0,424.0,0.0,811605.0,...,22516.0,1524.0,232.0,4.0,337.0,12089.0,10388.0,2413.0,1081.0,88.0
173,2,281.8,0,1.6,73215.0,2401.0,3221614.0,488.0,0.0,824430.0,...,24450.0,1661.0,256.0,4.0,377.0,13125.0,11100.0,2609.0,1110.0,92.0
174,2,281.8,0,6.0,156150.0,5656.0,6727292.0,944.0,2776.0,1051575.0,...,62722.0,3778.0,648.0,16.0,873.0,33321.0,29052.0,6805.0,2379.0,96.0
175,2,281.8,0,50.8,1091040.0,73703.0,46242922.0,18216.0,5912.0,1935420.0,...,470495.0,33330.0,6527.0,128.0,4262.0,232677.0,345040.0,85935.0,22203.0,449.0
176,2,281.8,0,63.4,1310445.0,92428.0,55111815.0,20616.0,5912.0,2314425.0,...,558799.0,39695.0,7932.0,156.0,4879.0,274641.0,432085.0,109379.0,28767.0,593.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1119567,33528,299.8,1,277.4,2433825.0,76099.0,144554584.0,37567.0,0.0,7809379.0,...,1112188.0,184449.0,25492.0,1370.0,50044.0,128376.0,301247.0,114714.0,21818.0,1626.0
1119568,33528,299.8,1,283.8,2491995.0,77562.0,148095419.0,38287.0,0.0,8034935.0,...,1129428.0,186389.0,25633.0,1466.0,51504.0,130156.0,303408.0,115198.0,21867.0,1759.0
1119569,33528,299.8,1,286.6,2523405.0,78633.0,149727115.0,39287.0,0.0,8139321.0,...,1151270.0,189446.0,25993.0,1467.0,51944.0,131848.0,308692.0,117095.0,22088.0,1759.0
1119570,33528,299.8,1,293.0,2570520.0,79578.0,152970388.0,39895.0,0.0,8371851.0,...,1163558.0,190804.0,26129.0,1515.0,53501.0,133309.0,310380.0,117559.0,22184.0,1819.0


In [21]:
data["in_study_repair"].value_counts()

in_study_repair
1    105852
0    105456
Name: count, dtype: int64

In [22]:
het=test.loc[test["vehicle_id"].isin(vehicles_test)]
unhet=test.loc[test["class_label"]==1]
test=pd.concat([het,unhet])

In [23]:
test

Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,...,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35,class_label
2752,303,3.8,4710.0,0.0,268983.0,0.0,13888.0,234948.0,184115.0,52094.0,...,60.0,75.0,0.0,30.0,211.0,780.0,120.0,1605.0,30.0,0
2753,303,5.2,80535.0,2318.0,3201017.0,270.0,13888.0,380688.0,376055.0,138180.0,...,3782.0,165.0,0.0,437.0,21872.0,69585.0,14386.0,2371.0,30.0,0
2754,303,6.6,163365.0,7898.0,6459337.0,1351.0,13888.0,481628.0,505597.0,199458.0,...,13173.0,1680.0,0.0,722.0,81722.0,354375.0,85501.0,9136.0,30.0,0
2755,303,8.0,210975.0,10094.0,8353047.0,1551.0,13888.0,541479.0,631555.0,280504.0,...,16311.0,2025.0,0.0,842.0,97878.0,426780.0,103966.0,11251.0,30.0,0
2756,303,9.4,255015.0,12519.0,10146735.0,2191.0,13888.0,656671.0,780263.0,341838.0,...,21742.0,2551.0,0.0,1202.0,134823.0,562366.0,142832.0,16996.0,30.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193498,33116,107.0,2377995.0,274252.0,108644374.0,93284.0,1390.0,4152613.0,4076304.0,4312496.0,...,348972.0,77101.0,566.0,40026.0,1445766.0,5177465.0,1688601.0,330485.0,1906.0,1
193499,33116,107.6,2389980.0,274888.0,109111074.0,93634.0,1390.0,4182083.0,4102666.0,4333651.0,...,350502.0,77267.0,567.0,40117.0,1449426.0,5196185.0,1693401.0,330725.0,1906.0,1
193500,33116,113.6,2554935.0,287920.0,116447585.0,102244.0,1390.0,4418053.0,4384402.0,4649869.0,...,382167.0,83807.0,597.0,43238.0,1563172.0,5554355.0,1793916.0,347195.0,1981.0,1
193501,33116,119.6,2717145.0,301600.0,123592502.0,110124.0,1391.0,4669143.0,4646244.0,4925935.0,...,416130.0,90167.0,642.0,46419.0,1678132.0,5984046.0,1914742.0,368736.0,2011.0,1


In [24]:
test["class_label"].value_counts()

class_label
0    5077
1    5077
Name: count, dtype: int64

### Window labeling time until fail

In [25]:
data['Window'] = data.apply(lambda row: 0 if row['in_study_repair'] == 0 else row['length_of_study_time_step'] - row['time_step'], axis=1)

In [26]:
w1 = 48
w2 = 24
w3 = 12
w4 = 6

train_df = data.copy()
train_df['label1'] = np.where((train_df['Window'] <= w1)&(train_df['Window'] > 0), 1, 0 )
train_df['label2'] = train_df['label1']
train_df.loc[(train_df['Window'] <= w2)&(train_df['Window'] > 0), 'label2'] = 2
train_df['label3'] = train_df['label2']
train_df.loc[(train_df['Window'] <= w3)&(train_df['Window'] > 0), 'label3'] = 3
train_df['label4'] = train_df['label3']
train_df.loc[(train_df['Window'] <= w4)&(train_df['Window'] > 0), 'label4'] = 4

train_df

Unnamed: 0,vehicle_id,length_of_study_time_step,in_study_repair,time_step,171_0,666_0,427_0,837_0,167_0,167_1,...,397_31,397_32,397_33,397_34,397_35,RUL,label1,label2,label3,label4
172,2,281.8,0,1.4,65520.0,2226.0,2891746.0,424.0,0.0,811605.0,...,12089.0,10388.0,2413.0,1081.0,88.0,0.0,0,0,0,0
173,2,281.8,0,1.6,73215.0,2401.0,3221614.0,488.0,0.0,824430.0,...,13125.0,11100.0,2609.0,1110.0,92.0,0.0,0,0,0,0
174,2,281.8,0,6.0,156150.0,5656.0,6727292.0,944.0,2776.0,1051575.0,...,33321.0,29052.0,6805.0,2379.0,96.0,0.0,0,0,0,0
175,2,281.8,0,50.8,1091040.0,73703.0,46242922.0,18216.0,5912.0,1935420.0,...,232677.0,345040.0,85935.0,22203.0,449.0,0.0,0,0,0,0
176,2,281.8,0,63.4,1310445.0,92428.0,55111815.0,20616.0,5912.0,2314425.0,...,274641.0,432085.0,109379.0,28767.0,593.0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1119567,33528,299.8,1,277.4,2433825.0,76099.0,144554584.0,37567.0,0.0,7809379.0,...,128376.0,301247.0,114714.0,21818.0,1626.0,22.4,1,2,2,2
1119568,33528,299.8,1,283.8,2491995.0,77562.0,148095419.0,38287.0,0.0,8034935.0,...,130156.0,303408.0,115198.0,21867.0,1759.0,16.0,1,2,2,2
1119569,33528,299.8,1,286.6,2523405.0,78633.0,149727115.0,39287.0,0.0,8139321.0,...,131848.0,308692.0,117095.0,22088.0,1759.0,13.2,1,2,2,2
1119570,33528,299.8,1,293.0,2570520.0,79578.0,152970388.0,39895.0,0.0,8371851.0,...,133309.0,310380.0,117559.0,22184.0,1819.0,6.8,1,2,3,3


In [27]:
train_df.drop(columns=["length_of_study_time_step","in_study_repair"],inplace=True)

### Feature Extraction & converting time step to time unit with window of 10 time unit

In [44]:
# Feature Extraction on train data
def calculate_metrics_for_vehicle(df):
    sensor_columns = df.columns.difference(['vehicle_id', 'time_step', 'Window', 'label1', 'label2', 'label3', 'label4'])
    time_unit_window = 10

    df['time_unit'] = ((df['time_step'] - df['time_step'].min()) / time_unit_window).astype(int)
    
    df = df.sort_values('time_step')

    grouped = df.groupby(['vehicle_id', 'time_unit'])

    agg_functions = ['mean', 'std', 'min', 'max', 'sum', 'var', 'median', 'skew']
    agg_dict = {col: agg_functions for col in sensor_columns}

    metrics = grouped.agg(agg_dict)

    for col in sensor_columns:
        metrics[(col, 'sem')] = grouped[col].apply(lambda x: x.sem(ddof=0))
        metrics[(col, 'q25')] = grouped[col].quantile(0.25)
        metrics[(col, 'q50')] = grouped[col].quantile(0.50)
        metrics[(col, 'q75')] = grouped[col].quantile(0.75)
    
    metrics.columns = ['_'.join(col).strip() for col in metrics.columns.ravel()]

    last_values = grouped[['Window', 'label1', 'label2', 'label3', 'label4']].last().reset_index()
    final_df = pd.merge(metrics.reset_index(), last_values, on=['vehicle_id', 'time_unit'], how='left')

    return final_df

In [45]:
grouped = train_df.groupby('vehicle_id')
df_feature_engineering = grouped.apply(calculate_metrics_for_vehicle).reset_index(drop=True)
df_feature_engineering

Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,837_0_q75,time_unit_sem,time_unit_q25,time_unit_q50,time_unit_q75,RUL,label1,label2,label3,label4
0,2,0,9.584373e+05,407108.011153,691450.0,1427006.0,2875312.0,1.657369e+11,756856.0,1.681891,...,716.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,2,4,8.485926e+06,,8485926.0,8485926.0,8485926.0,,8485926.0,,...,18216.0,0.0,4.0,4.0,4.0,0.0,0,0,0,0
2,2,6,1.020155e+07,,10201551.0,10201551.0,10201551.0,,10201551.0,,...,20616.0,0.0,6.0,6.0,6.0,0.0,0,0,0,0
3,2,7,1.208676e+07,,12086761.0,12086761.0,12086761.0,,12086761.0,,...,21712.0,0.0,7.0,7.0,7.0,0.0,0,0,0,0
4,2,9,1.416866e+07,,14168661.0,14168661.0,14168661.0,,14168661.0,,...,22952.0,0.0,9.0,9.0,9.0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90835,33631,10,6.912780e+06,,6912780.0,6912780.0,6912780.0,,6912780.0,,...,17009.0,0.0,10.0,10.0,10.0,0.0,0,0,0,0
90836,33631,11,7.480573e+06,187615.748562,7265785.0,7612445.0,22441720.0,3.519967e+10,7563490.0,-1.600327,...,21913.0,0.0,11.0,11.0,11.0,0.0,0,0,0,0
90837,33631,12,7.772865e+06,,7772865.0,7772865.0,7772865.0,,7772865.0,,...,22985.0,0.0,12.0,12.0,12.0,0.0,0,0,0,0
90838,33631,13,8.246218e+06,199054.094438,8105465.0,8386970.0,16492435.0,3.962253e+10,8246217.5,,...,25153.0,0.0,13.0,13.0,13.0,0.0,0,0,0,0


In [46]:
#train
size_of=df_feature_engineering.groupby("vehicle_id").size().sort_values()
size_of

vehicle_id
19317     3
6792      3
18190     3
11775     3
16054     3
         ..
228      46
721      46
703      46
1640     46
269      47
Length: 4542, dtype: int64

In [62]:
df_feature_engineering=df_feature_engineering.drop(columns=["time_unit_sem","time_unit_q25","time_unit_q50","time_unit_q75","time_unit_mean","time_unit_std", "time_unit_min", "time_unit_max", "time_unit_sum", "time_unit_var", "time_unit_median", "time_unit_skew"])

In [47]:
# feature extraction on test data
def calculate_metrics_for_vehicle_test(df):
    sensor_columns = df.columns.difference(['vehicle_id', 'time_step', 'class_label'])
    time_unit_window = 10

    df['time_unit'] = ((df['time_step'] - df['time_step'].min()) / time_unit_window).astype(int)
    
    df = df.sort_values('time_step')

    grouped = df.groupby(['vehicle_id', 'time_unit'])

    agg_functions = ['mean', 'std', 'min', 'max', 'sum', 'var', 'median', 'skew']
    agg_dict = {col: agg_functions for col in sensor_columns}

    metrics = grouped.agg(agg_dict)

    for col in sensor_columns:
        metrics[(col, 'sem')] = grouped[col].apply(lambda x: x.sem(ddof=0))
        metrics[(col, 'q25')] = grouped[col].quantile(0.25)
        metrics[(col, 'q50')] = grouped[col].quantile(0.50)
        metrics[(col, 'q75')] = grouped[col].quantile(0.75)
    
    metrics.columns = ['_'.join(col).strip() for col in metrics.columns.ravel()]

    last_values = grouped[['class_label']].last().reset_index()
    final_df = pd.merge(metrics.reset_index(), last_values, on=['vehicle_id', 'time_unit'], how='left')

    return final_df

In [48]:
grouped_test = test.groupby('vehicle_id')
df_feature_engineering_test = grouped_test.apply(calculate_metrics_for_vehicle_test).reset_index(drop=True)
df_feature_engineering_test

Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,666_0_q75,835_0_sem,835_0_q25,835_0_q50,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,class_label
0,189,0,7.065735e+05,301519.442430,239089.0,1208502.0,7772309.0,9.091397e+10,686130.0,0.010509,...,11131.00,1.026734e+06,5889434.50,8164588.0,10925082.00,86.830915,391.0,461.0,721.00,1
1,189,1,1.851272e+06,427128.292921,1349062.0,2512328.0,14810176.0,1.824386e+11,1832342.0,0.340698,...,26866.00,1.486658e+06,17183053.50,20606974.0,24148649.50,115.072455,1124.5,1363.0,1775.50,1
2,189,2,3.182079e+06,311504.338892,2730240.0,3643898.0,25456635.0,9.703495e+10,3188085.5,0.024616,...,39680.00,1.162791e+06,32539907.00,34709316.5,36929884.25,148.561376,2304.0,2434.0,2714.00,1
3,189,3,4.294647e+06,292275.030294,3817842.0,4733030.0,42946472.0,8.542469e+10,4362922.0,-0.221985,...,51761.00,9.893731e+05,45426563.00,48774348.0,49692181.50,65.718338,3864.0,4024.0,4154.00,1
4,189,4,5.258569e+06,356523.439103,4837470.0,5806304.0,36809986.0,1.271090e+11,5197674.0,0.431749,...,67887.00,1.475159e+06,55954457.50,58857578.0,61866177.00,188.524499,4844.0,5384.0,5619.50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4270,33612,0,1.046185e+06,255727.712455,750904.0,1195660.0,3138556.0,6.539666e+10,1191992.0,-1.731650,...,7956.00,8.364258e+05,9472838.00,11006986.0,11011903.00,73.621152,1045.0,1180.0,1180.50,0
4271,33612,2,2.691316e+06,1301.076477,2690396.0,2692236.0,5382632.0,1.692800e+06,2691316.0,,...,15949.00,1.652862e+03,28500542.75,28501711.5,28502880.25,0.000000,1691.0,1691.0,1691.00,0
4272,33612,5,3.966932e+06,,3966932.0,3966932.0,3966932.0,,3966932.0,,...,32053.00,0.000000e+00,46700579.00,46700579.0,46700579.00,0.000000,5791.0,5791.0,5791.00,0
4273,33612,6,4.294548e+06,850.018823,4293273.0,4294981.0,17178192.0,7.225320e+05,4294969.0,-1.999734,...,33961.25,9.480814e+02,49579537.50,49580632.5,49580633.25,0.216506,5851.0,5851.0,5851.25,0


In [49]:
size_of=df_feature_engineering_test.groupby("vehicle_id").size().sort_values()
size_of

vehicle_id
24202     4
18424     4
23371     5
22959     5
22398     5
         ..
3513     35
339      36
486      38
1285     38
715      41
Length: 272, dtype: int64

### Normalize the data

In [63]:
non_normalize_cols = ['vehicle_id', 'Window', 'label1', 'label2', 'label3', 'label4']
cols_normalize = df_feature_engineering.columns.difference(non_normalize_cols)

norm_train_df = pd.DataFrame(index=df_feature_engineering.index, columns=cols_normalize)

for vehicle, group_df in df_feature_engineering.groupby('vehicle_id'):
    scaler = MinMaxScaler()
    norm_train_df.loc[group_df.index, cols_normalize] = scaler.fit_transform(group_df[cols_normalize])

join_df = df_feature_engineering[non_normalize_cols].join(norm_train_df)

train_df = join_df.reindex(columns = df_feature_engineering.columns)

In [64]:
train_df

Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,RUL,label1,label2,label3,label4
0,2,0.0,0.0,0.470213,0.0,0.0,0.0,0.230331,0.0,1.0,...,0.0,0.111725,0.0,0.0,0.0,0.0,0,0,0,0
1,2,0.148148,0.221119,,0.22718,0.210249,0.062912,,0.225704,,...,0.189546,0.0,0.260178,0.259831,0.257349,0.0,0,0,0,0
2,2,0.222222,0.271516,,0.277185,0.261349,0.082149,,0.275804,,...,0.231199,0.0,0.295337,0.295007,0.292643,0.0,0,0,0,0
3,2,0.259259,0.326894,,0.332132,0.3175,0.103288,,0.330856,,...,0.280097,0.0,0.311393,0.31107,0.30876,0.0,0,0,0,0
4,2,0.333333,0.388049,,0.392811,0.379509,0.126632,,0.391652,,...,0.335501,0.0,0.329559,0.329244,0.326995,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90835,33631,0.714286,0.784196,,0.792252,0.772221,0.269618,,0.787133,,...,0.748411,0.0,0.647725,0.646363,0.645045,0.0,0,0,0,0
90836,33631,0.785714,0.852495,0.397501,0.833129,0.861052,1.0,0.188629,0.86434,0.0,...,0.848343,0.824988,0.775783,0.824883,0.83541,0.0,0,0,0,0
90837,33631,0.857143,0.887654,,0.891847,0.881419,0.310071,,0.889183,,...,0.865731,0.0,0.877952,0.87748,0.877023,0.0,0,0,0,0
90838,33631,0.928571,0.944592,0.426204,0.930362,0.959388,0.720184,0.212919,0.945346,,...,0.949168,0.635294,0.928189,0.944618,0.961182,0.0,0,0,0,0


In [52]:
non_normalize_cols_test = ['vehicle_id', 'class_label']
cols_normalize_test = df_feature_engineering_test.columns.difference(non_normalize_cols_test)

assert 'class_label' not in cols_normalize_test, "class_label should not be normalized"

norm_train_df_test = pd.DataFrame(index=df_feature_engineering_test.index, columns=cols_normalize_test)

for vehicle_test, group_df_test in df_feature_engineering_test.groupby('vehicle_id'):
    scaler = MinMaxScaler()
    norm_train_df_test.loc[group_df_test.index, cols_normalize_test] = scaler.fit_transform(group_df_test[cols_normalize_test])

join_df_test = df_feature_engineering_test[non_normalize_cols_test].join(norm_train_df_test)

test_df = join_df_test.reindex(columns = df_feature_engineering_test.columns)


In [53]:
test_df

Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,666_0_q75,835_0_sem,835_0_q25,835_0_q50,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,class_label
0,189,0.0,0.0,0.116008,0.0,0.0,0.0,0.095547,0.0,0.747503,...,0.0,0.47599,0.0,0.0,0.0,0.19136,0.0,0.0,0.0,1
1,189,0.111111,0.116895,1.0,0.111614,0.136812,0.158719,1.0,0.115909,0.887848,...,0.117025,0.689209,0.096085,0.105041,0.113082,0.253599,0.065873,0.076799,0.089859,1
2,189,0.222222,0.252794,0.186278,0.250499,0.255549,0.39882,0.156035,0.253006,0.753499,...,0.212327,0.539065,0.22674,0.224096,0.222381,0.327402,0.171801,0.167986,0.169834,1
3,189,0.333333,0.366408,0.050949,0.359863,0.369833,0.793253,0.041301,0.37181,0.648683,...,0.302176,0.458669,0.336378,0.342836,0.331518,0.144831,0.311899,0.303363,0.292544,1
4,189,0.444444,0.464842,0.503107,0.462392,0.482453,0.654862,0.453228,0.456223,0.926549,...,0.42211,0.683878,0.425948,0.427961,0.435624,0.415474,0.39991,0.419157,0.417427,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4270,33612,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
4271,33612,0.285714,0.414513,0.00177,0.454842,0.391841,0.159839,0.000015,0.392183,,...,0.2774,0.001976,0.423811,0.403453,0.403412,0.0,0.132731,0.107988,0.107894,0
4272,33612,0.714286,0.735923,,0.754209,0.725588,0.059003,,0.725851,,...,0.836295,0.0,0.829188,0.823145,0.823125,0.0,0.975139,0.974429,0.974427,0
4273,33612,0.857143,0.81847,0.0,0.830742,0.811479,1.0,0.0,0.811657,0.0,...,0.902521,0.001133,0.893312,0.889563,0.88955,0.002941,0.987467,0.987109,0.987161,0


In [65]:
sensor_columns_train = train_df.columns.difference(['vehicle_id', 'time_unit', 'RUL', 'label1', 'label2', 'label3', 'label4'])

In [67]:
test_df.fillna(0, inplace=True)
train_df.fillna(0, inplace=True)

### Padding the train and test data to window of 47 readouts

In [68]:
padded_dfs = []

max_sequence_length = 47

for vehicle_id, group in train_df.groupby('vehicle_id'):
    if len(group) < max_sequence_length:
        padding_length = max_sequence_length - len(group)
        padding_df = pd.DataFrame(data=-1, 
                                  index=np.arange(len(group), len(group) + padding_length), 
                                  columns=train_df.columns)
        padding_df['vehicle_id'] = vehicle_id
        padded_group = pd.concat([group, padding_df])
        padded_dfs.append(padded_group)
    else:
        padded_dfs.append(group)

padded_train_df = pd.concat(padded_dfs)

padded_train_df['vehicle_id'] = padded_train_df['vehicle_id'].astype(int)

padded_train_df = padded_train_df.reset_index(drop=True)

padded_train_df

Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,RUL,label1,label2,label3,label4
0,2,0.000000,0.000000,0.470213,0.000000,0.000000,0.000000,0.230331,0.000000,1.0,...,0.000000,0.111725,0.000000,0.000000,0.000000,0.0,0,0,0,0
1,2,0.148148,0.221119,0.000000,0.227180,0.210249,0.062912,0.000000,0.225704,0.0,...,0.189546,0.000000,0.260178,0.259831,0.257349,0.0,0,0,0,0
2,2,0.222222,0.271516,0.000000,0.277185,0.261349,0.082149,0.000000,0.275804,0.0,...,0.231199,0.000000,0.295337,0.295007,0.292643,0.0,0,0,0,0
3,2,0.259259,0.326894,0.000000,0.332132,0.317500,0.103288,0.000000,0.330856,0.0,...,0.280097,0.000000,0.311393,0.311070,0.308760,0.0,0,0,0,0
4,2,0.333333,0.388049,0.000000,0.392811,0.379509,0.126632,0.000000,0.391652,0.0,...,0.335501,0.000000,0.329559,0.329244,0.326995,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213469,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213470,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213471,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213472,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1


In [69]:
padded_train_df.fillna(0, inplace=True)

In [70]:
size_of=padded_train_df.groupby("vehicle_id").size().sort_values()
size_of

vehicle_id
2        47
21630    47
21631    47
21636    47
21641    47
         ..
10267    47
10257    47
10245    47
10279    47
33631    47
Length: 4542, dtype: int64

In [71]:
padded_dfs_test = []

max_sequence_length = 47

for vehicle_id, group in test_df.groupby('vehicle_id'):
    if len(group) < max_sequence_length:
        padding_length = max_sequence_length - len(group)
        padding_df = pd.DataFrame(data=-1, 
                                  index=np.arange(len(group), len(group) + padding_length), 
                                  columns=test_df.columns)
        padding_df['vehicle_id'] = vehicle_id
        padded_group = pd.concat([group, padding_df])
        padded_dfs_test.append(padded_group)
    else:
        padded_dfs_test.append(group)

padded_test_df = pd.concat(padded_dfs_test)

padded_test_df['vehicle_id'] = padded_test_df['vehicle_id'].astype(int)

padded_test_df = padded_test_df.reset_index(drop=True)

padded_test_df


Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,666_0_q75,835_0_sem,835_0_q25,835_0_q50,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,class_label
0,189,0.000000,0.000000,0.116008,0.000000,0.000000,0.000000,0.095547,0.000000,0.747503,...,0.000000,0.475990,0.000000,0.000000,0.000000,0.191360,0.000000,0.000000,0.000000,1
1,189,0.111111,0.116895,1.000000,0.111614,0.136812,0.158719,1.000000,0.115909,0.887848,...,0.117025,0.689209,0.096085,0.105041,0.113082,0.253599,0.065873,0.076799,0.089859,1
2,189,0.222222,0.252794,0.186278,0.250499,0.255549,0.398820,0.156035,0.253006,0.753499,...,0.212327,0.539065,0.226740,0.224096,0.222381,0.327402,0.171801,0.167986,0.169834,1
3,189,0.333333,0.366408,0.050949,0.359863,0.369833,0.793253,0.041301,0.371810,0.648683,...,0.302176,0.458669,0.336378,0.342836,0.331518,0.144831,0.311899,0.303363,0.292544,1
4,189,0.444444,0.464842,0.503107,0.462392,0.482453,0.654862,0.453228,0.456223,0.926549,...,0.422110,0.683878,0.425948,0.427961,0.435624,0.415474,0.399910,0.419157,0.417427,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12779,33612,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1
12780,33612,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1
12781,33612,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1
12782,33612,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1


In [72]:
padded_test_df.fillna(0, inplace=True)

In [73]:
size_of=padded_test_df.groupby("vehicle_id").size().sort_values()
size_of

vehicle_id
189      47
22676    47
22726    47
22754    47
22959    47
         ..
10497    47
10369    47
10269    47
10714    47
33612    47
Length: 272, dtype: int64

### Reshape features into (samples, time steps, features) & selecting balanced number of vehicels for each class in valdation data (from train read out)

In [118]:
padded_train_df_in_study_repair=pd.merge(train_tte,padded_train_df,on='vehicle_id')
padded_train_df_in_study_repair

Unnamed: 0,vehicle_id,length_of_study_time_step,in_study_repair,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,...,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,RUL,label1,label2,label3,label4
0,2,281.8,0,0.000000,0.000000,0.470213,0.000000,0.000000,0.000000,0.230331,...,0.000000,0.111725,0.000000,0.000000,0.000000,0.0,0,0,0,0
1,2,281.8,0,0.148148,0.221119,0.000000,0.227180,0.210249,0.062912,0.000000,...,0.189546,0.000000,0.260178,0.259831,0.257349,0.0,0,0,0,0
2,2,281.8,0,0.222222,0.271516,0.000000,0.277185,0.261349,0.082149,0.000000,...,0.231199,0.000000,0.295337,0.295007,0.292643,0.0,0,0,0,0
3,2,281.8,0,0.259259,0.326894,0.000000,0.332132,0.317500,0.103288,0.000000,...,0.280097,0.000000,0.311393,0.311070,0.308760,0.0,0,0,0,0
4,2,281.8,0,0.333333,0.388049,0.000000,0.392811,0.379509,0.126632,0.000000,...,0.335501,0.000000,0.329559,0.329244,0.326995,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213469,33631,146.2,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213470,33631,146.2,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213471,33631,146.2,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213472,33631,146.2,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1


In [124]:
length_of_class_0=len(padded_train_df_in_study_repair.loc[padded_train_df_in_study_repair['in_study_repair'] ==0]["vehicle_id"].unique())
length_of_class_1=len(padded_train_df_in_study_repair.loc[padded_train_df_in_study_repair['in_study_repair'] ==1]["vehicle_id"].unique())

In [125]:
training_vehicles_c_0=padded_train_df_in_study_repair.loc[padded_train_df_in_study_repair['in_study_repair'] ==0]["vehicle_id"].unique()[:int(length_of_class_0/2)]
training_vehicles_c_1=padded_train_df_in_study_repair.loc[padded_train_df_in_study_repair['in_study_repair'] ==1]["vehicle_id"].unique()[:int(length_of_class_1/2)]
validation_vehicles_c_0=padded_train_df_in_study_repair.loc[padded_train_df_in_study_repair['in_study_repair'] ==0]["vehicle_id"].unique()[int(length_of_class_0/2):]
validation_vehicles_c_1=padded_train_df_in_study_repair.loc[padded_train_df_in_study_repair['in_study_repair'] ==1]["vehicle_id"].unique()[int(length_of_class_1/2):]

In [141]:
validation_vehicles_c_1

array([15357, 15365, 15369, ..., 33467, 33472, 33528], dtype=int64)

In [127]:
train_vehicles = padded_train_df.loc[(padded_train_df['vehicle_id'].isin(training_vehicles_c_0)) | (padded_train_df['vehicle_id'].isin(training_vehicles_c_1))]
validation_vehicles = padded_train_df.loc[(padded_train_df['vehicle_id'].isin(validation_vehicles_c_0)) | (padded_train_df['vehicle_id'].isin(validation_vehicles_c_1))]


In [129]:
validation_vehicles

Unnamed: 0,vehicle_id,time_unit,100_0_mean,100_0_std,100_0_min,100_0_max,100_0_sum,100_0_var,100_0_median,100_0_skew,...,835_0_q75,837_0_sem,837_0_q25,837_0_q50,837_0_q75,RUL,label1,label2,label3,label4
100392,15357,0.000000,0.000000,0.000000,0.000000,0.000000,0.000583,0.000000,0.000000,0.000000,...,0.000000,0.004312,0.000000,0.000000,0.000000,195.0,0,0,0,0
100393,15357,0.052632,0.104633,0.000000,0.106923,0.102360,0.000000,0.000000,0.104633,0.000000,...,0.089483,0.000000,0.047947,0.047254,0.046577,179.6,0,0,0,0
100394,15357,0.105263,0.176805,0.000000,0.179367,0.174263,0.014164,0.000000,0.176805,0.000000,...,0.166867,0.000000,0.062111,0.061244,0.060399,167.0,0,0,0,0
100395,15357,0.157895,0.210404,0.810608,0.200106,0.220624,0.083171,0.690409,0.210404,0.000000,...,0.223935,0.108331,0.080752,0.082394,0.083997,155.8,0,0,0,0
100396,15357,0.210526,0.259016,0.776653,0.241558,0.268137,0.174205,0.640843,0.267255,0.037482,...,0.281263,0.221112,0.106035,0.118787,0.117921,146.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213469,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213470,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213471,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1
213472,33631,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1,-1,-1,-1


In [151]:
train_vehicles.groupby("vehicle_id")["label1"].max().value_counts()

label1
0    1143
1    1128
Name: count, dtype: int64

##### Reshape for train data from train readout

In [152]:
num_samples = len(train_vehicles['vehicle_id'].unique())  
sequence_length = 47  
num_features = 1261 

X_train_ts_t = np.zeros((num_samples, sequence_length, num_features))

for i, (vehicle_id, group) in enumerate(train_vehicles.groupby('vehicle_id')):
    vehicle_data = group.drop(['vehicle_id',"RUL","label1","label2","label3","label4"], axis=1).values
   
    if vehicle_data.shape[0] > sequence_length:
        processed_data = vehicle_data[:sequence_length, :]
    else:
        processed_data = np.zeros((sequence_length, num_features))
        processed_data[:vehicle_data.shape[0], :] = vehicle_data
        
    X_train_ts_t[i] = processed_data

X_train_ts_t.shape

(2271, 47, 1261)

In [153]:
y_train_t = np.zeros((num_samples, 1))
for i, (vehicle_id, group) in enumerate(train_vehicles.groupby('vehicle_id')):
    y_train_t[i] = group["label1"].max()

y_train_t.shape

(2271, 1)

##### Reshape valdation data

In [154]:
num_samples = len(validation_vehicles['vehicle_id'].unique())  
sequence_length = 47  
num_features = 1261


X_train_ts_v = np.zeros((num_samples, sequence_length, num_features))


for i, (vehicle_id, group) in enumerate(validation_vehicles.groupby('vehicle_id')):
    vehicle_data = group.drop(['vehicle_id',"RUL","label1","label2","label3","label4"], axis=1).values
   
   
    if vehicle_data.shape[0] > sequence_length:
        processed_data = vehicle_data[:sequence_length, :]
   
    else:
        processed_data = np.zeros((sequence_length, num_features))
        processed_data[:vehicle_data.shape[0], :] = vehicle_data
        
    X_train_ts_v[i] = processed_data

X_train_ts_v.shape

(2271, 47, 1261)

In [155]:
y_train_v = np.zeros((num_samples, 1))
for i, (vehicle_id, group) in enumerate(validation_vehicles.groupby('vehicle_id')):
    y_train_v[i] = group["label1"].max()

y_train_v.shape

(2271, 1)

##### Reshape the test data to test the performace of the model

In [87]:
num_samples = len(padded_test_df['vehicle_id'].unique()) 
sequence_length = 47  
num_features = 1261   


X_test_ts = np.zeros((num_samples, sequence_length, num_features))


for i, (vehicle_id, group) in enumerate(padded_test_df.groupby('vehicle_id')):
    vehicle_data = group.drop(['vehicle_id',"class_label"], axis=1).values
   
    
    if vehicle_data.shape[0] > sequence_length:
        processed_data = vehicle_data[:sequence_length, :]
    
    else:
        processed_data = np.zeros((sequence_length, num_features))
        processed_data[:vehicle_data.shape[0], :] = vehicle_data
        
    X_test_ts[i] = processed_data

X_test_ts.shape

(272, 47, 1261)

In [88]:
y_test = np.zeros((num_samples, 1))
for i, (vehicle_id, group) in enumerate(padded_test_df.groupby('vehicle_id')):
    y_test[i] = group["class_label"].values[0]

y_test.shape

(272, 1)

### LSTM with masking, and L1,L2 layers

In [191]:
# Build the network
nb_features = X_train_ts_t.shape[2]
nb_out = y_train_t.shape[1]


# Define L1 and L2 regularization factors
l1_factor = 0.0000005
l2_factor = 0.0000005

model = Sequential()

# Add a Masking layer
model.add(Masking(mask_value=-1., input_shape=(47, nb_features)))

# Add first LSTM layer with regularization and increased dropout
model.add(LSTM(
    units=100,
    return_sequences=True,
    kernel_regularizer=l1_l2(l1=l1_factor, l2=l2_factor)))
model.add(Dropout(0.3))  

# Add second LSTM layer with regularization and increased dropout
model.add(LSTM(
    units=50,
    return_sequences=False,
    kernel_regularizer=l1_l2(l1=l1_factor, l2=l2_factor)))
model.add(Dropout(0.3)) 

# Add Dense output layer with regularization
model.add(Dense(
    units=nb_out,
    activation='sigmoid',
    kernel_regularizer=l1_l2(l1=l1_factor, l2=l2_factor)))

# Compile the model with regularizers
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [192]:
model.fit(X_train_ts_t, y_train_t, epochs=50, batch_size=20, 
          validation_data=(X_train_ts_v, y_train_v), verbose=1,
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=0, mode='auto')])


Epoch 1/50
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 109ms/step - accuracy: 0.5031 - loss: 0.7255 - val_accuracy: 0.5592 - val_loss: 0.6892
Epoch 2/50
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 103ms/step - accuracy: 0.5966 - loss: 0.6730 - val_accuracy: 0.5438 - val_loss: 0.7161
Epoch 3/50
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 105ms/step - accuracy: 0.6141 - loss: 0.6664 - val_accuracy: 0.5425 - val_loss: 0.7398
Epoch 4/50
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 104ms/step - accuracy: 0.6133 - loss: 0.6730 - val_accuracy: 0.5773 - val_loss: 0.6992
Epoch 5/50
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 103ms/step - accuracy: 0.6345 - loss: 0.6502 - val_accuracy: 0.5513 - val_loss: 0.7409
Epoch 6/50
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 103ms/step - accuracy: 0.6595 - loss: 0.6280 - val_accuracy: 0.5685 - val_loss: 0.6990
Epoch 7/50

<keras.src.callbacks.history.History at 0x292a9931fd0>

##### Evaluating the model using validation data

In [193]:
# Predict the probabilities for each class
y_pred_probs = model.predict(X_train_ts_v)

# Convert predicted probabilities to binary predictions using a threshold (e.g., 0.5)
y_pred = (y_pred_probs > 0.5).astype(int)

# Calculate the classification report for precision, recall, and F-score
report = classification_report(y_train_v, y_pred)
print(report)

# Calculate AUC
auc = roc_auc_score(y_train_v, y_pred_probs, multi_class='ovo')  
print('AUC:', auc)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step
              precision    recall  f1-score   support

         0.0       0.59      0.74      0.65      1137
         1.0       0.64      0.48      0.55      1134

    accuracy                           0.61      2271
   macro avg       0.61      0.61      0.60      2271
weighted avg       0.61      0.61      0.60      2271

AUC: 0.6503601016940213


##### Evaluating the model on test data

In [197]:
# Predict the probabilities for each class
y_pred_probs = model.predict(X_test_ts)

# Convert predicted probabilities to binary predictions using a threshold (e.g., 0.5)
y_pred = (y_pred_probs > 0.5).astype(int)

# Calculate the classification report for precision, recall, and F-score
report = classification_report(y_test, y_pred)
print(report)

# Calculate AUC
auc = roc_auc_score(y_test, y_pred_probs, multi_class='ovo')
print('AUC:', auc)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
              precision    recall  f1-score   support

         0.0       0.61      0.70      0.65       136
         1.0       0.65      0.56      0.60       136

    accuracy                           0.63       272
   macro avg       0.63      0.63      0.63       272
weighted avg       0.63      0.63      0.63       272

AUC: 0.6518706747404843


### Hyperparameter tuning

In [183]:
# Hyperparameter tuning class
class LSTMHyperModel(HyperModel):
    def __init__(self, input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes
    
    def build(self, hp):
        model = Sequential()
        model.add(Masking(mask_value=-1., input_shape=self.input_shape))
        model.add(LSTM(
            units=hp.Int('units', min_value=32, max_value=512, step=32),
            kernel_regularizer=l1_l2(l1=hp.Float('l1', 1e-7, 1e-2, sampling='log'), 
                                     l2=hp.Float('l2', 1e-7, 1e-2, sampling='log')),
            return_sequences=True))
        model.add(Dropout(hp.Float('dropout_1', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(LSTM(units=hp.Int('units', min_value=32, max_value=512, step=32), return_sequences=False))
        model.add(Dropout(hp.Float('dropout_2', min_value=0.0, max_value=0.5, step=0.1)))
        model.add(Dense(self.num_classes, activation='sigmoid'))
        
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'])
        
        return model

In [184]:
# applying hyperparameter tuning
input_shape = (47, nb_features)  
num_classes = y_train_t.shape[1] 

hypermodel = LSTMHyperModel(input_shape=input_shape, num_classes=num_classes)

tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=25,
    executions_per_trial=2,
    directory='my_dir',
    project_name='hptuning'
)

tuner.search_space_summary()

tuner.search(X_train_ts_t, y_train_t, epochs=50, validation_data=(X_train_ts_v,y_train_v), verbose=1, 
            callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')])

tuner.results_summary()

best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters()[0]


Trial 25 Complete [00h 02m 22s]
val_accuracy: 0.6263760626316071

Best val_accuracy So Far: 0.6263760626316071
Total elapsed time: 04h 50m 12s
Results summary
Results in my_dir\hptuning
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 24 summary
Hyperparameters:
units: 32
l1: 5.3457971270713735e-06
l2: 8.930192741299733e-05
dropout_1: 0.0
dropout_2: 0.1
Score: 0.6263760626316071

Trial 15 summary
Hyperparameters:
units: 160
l1: 1.5718616682004777e-06
l2: 2.0058231648390484e-07
dropout_1: 0.0
dropout_2: 0.30000000000000004
Score: 0.6166886985301971

Trial 11 summary
Hyperparameters:
units: 96
l1: 6.277494564840289e-05
l2: 4.269607092748141e-07
dropout_1: 0.0
dropout_2: 0.2
Score: 0.6164685189723969

Trial 21 summary
Hyperparameters:
units: 96
l1: 4.790651788961243e-06
l2: 7.130323041921185e-05
dropout_1: 0.1
dropout_2: 0.4
Score: 0.614927351474762

Trial 05 summary
Hyperparameters:
units: 128
l1: 1.503309067336845e-07
l2: 6.458042960536317e-06
dropout_1: 0.3

#### Testing best model based on hyperparameter tuning using test data

In [185]:
best_model = tuner.get_best_models(num_models=1)[0]


y_pred_probs = best_model.predict(X_test_ts)

threshold = 0.5  
y_pred = (y_pred_probs > threshold).astype(int)


report = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1'])
print(report)

auc_score = roc_auc_score(y_test, y_pred_probs)
print('AUC:', auc_score)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step
              precision    recall  f1-score   support

     Class 0       0.60      0.53      0.56       136
     Class 1       0.58      0.65      0.61       136

    accuracy                           0.59       272
   macro avg       0.59      0.59      0.59       272
weighted avg       0.59      0.59      0.59       272

AUC: 0.6023464532871973
