### Import libraries

In [100]:
# for load dataset
from google.colab import drive
import sys
from pathlib import Path
# for data processing
import numpy as np # linear algebra
import pandas as pd
# for train data split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
# Fix TensorFlow's global random seed so runs are more repeatable.
tf.random.set_seed(363)

### Load Dataset

In [101]:
# Mount Google Drive at /gdrive; force_remount=True re-mounts even if it is already mounted.
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


In [102]:
# Project folder on the mounted Drive.
base = Path('/gdrive/MyDrive/ML_Final/')
# Make the project folder importable — presumably for helper modules stored
# there; none are imported in the cells shown (TODO confirm it is still needed).
sys.path.append(str(base))

In [104]:
zip_path = base/'tabular.zip'
!cp "{zip_path}" .
!unzip -q tabular.zip
!rm tabular.zip

In [105]:
# Load the train and test tables from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Data Preprocessing

In [106]:
# Print column dtypes and non-null counts, then display the first rows.
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [107]:
# Display the first rows of the test table.
test_df.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,material_5,material_6,6,4,6,9,6,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,material_5,material_6,6,4,11,8,0,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,material_5,material_6,6,4,8,12,4,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,material_5,material_6,6,4,8,11,10,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,material_5,material_6,6,4,14,16,8,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [108]:
# Report (rows, columns) for both tables.
for label, frame in (("train: ", train_df), ("test: ", test_df)):
  print(label, frame.shape)

train:  (26570, 26)
test:  (20775, 25)


In [109]:
# Impute missing values: every NaN in a column is replaced by that column's mean.

# Columns of train_df that contain at least one missing value.
# isna().any() states this directly; it avoids the integer lookup on a
# boolean-labelled value_counts() result.
null_cols_number = [col for col in train_df.columns if train_df[col].isna().any()]

# Vectorised fill: the mean (which skips NaN by default) is computed once per
# column rather than once per missing row.
for col in null_cols_number:
  train_df[col] = train_df[col].fillna(train_df[col].mean())

In [110]:
# Replace category columns with integer values in dataset

# product_code: integer codes assigned in order of first appearance.
train_df['product_code'] = train_df['product_code'].factorize()[0]

# attribute_0 / attribute_1 hold values such as 'material_7'; keep only the number.
# The prefix is removed with an anchored regex because str.lstrip('material_')
# strips any leading run of those *characters*, not the literal prefix. The
# result is the same for the values shown, but lstrip would silently eat
# extra characters if a suffix ever started with one of them.
for col in ('attribute_0', 'attribute_1'):
  train_df[col] = train_df[col].str.replace(r'^material_', '', regex=True).astype('int')


In [111]:
# Re-inspect the table after the preprocessing cells above: dtypes and
# non-null counts, then the first rows.
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  int64  
 2   loading         26570 non-null  float64
 3   attribute_0     26570 non-null  int64  
 4   attribute_1     26570 non-null  int64  
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26570 non-null  float64
 11  measurement_4   26570 non-null  float64
 12  measurement_5   26570 non-null  float64
 13  measurement_6   26570 non-null  float64
 14  measurement_7   26570 non-null  float64
 15  measurement_8   26570 non-null  float64
 16  measurement_9   26570 non-null  float64
 17  measurement_10  26570 non-null 

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,0,80.1,7,8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,16.048444,13.034,14.684,764.1,0
1,1,0,84.89,7,8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,0,82.43,7,8,9,5,12,1,5,...,12.715,15.607,19.172085,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,0,101.07,7,8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,0,188.06,7,8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [112]:
# Start from every column, then discard the identifier, the product code and
# attribute_0..2 so that they are excluded from the column list used downstream.
total_cols = list(train_df.columns)
for dropped in ("id", "product_code", "attribute_0", "attribute_1", "attribute_2"):
  total_cols.remove(dropped)

In [113]:
# Display the remaining column names.
total_cols

['loading',
 'attribute_3',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17',
 'failure']

In [114]:
# One-hot encoding of the selected columns (all of total_cols except its last entry).
# NOTE(review): the columns listed by info() are numeric, in which case
# get_dummies leaves them unchanged — confirm this step is still needed.
train_dum = pd.get_dummies(data=train_df.loc[:, total_cols[:-1]], drop_first=True)

In [115]:
# Train / hold-out split.
# 80% of the labelled rows are kept for training and 20% are held out.
# shuffle=False preserves the original row order, so the hold-out part is the
# last 20% of rows rather than a random sample.
x_train, x_test, y_train, y_test = train_test_split(train_dum, train_df["failure"], test_size=0.2, shuffle=False)

### Model Training

In [116]:
# NN
# Fully connected binary classifier: three 50-unit ReLU layers, each followed
# by a Dropout(0.2) layer, and a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(50, activation="relu"))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(50, activation="relu"))
model.add(tf.keras.layers.Dropout(0.2))
# Only this layer carries a regulariser, and it is applied to the bias vector.
model.add(tf.keras.layers.Dense(50, activation="relu", bias_regularizer='l2'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [117]:
# Model compilation
# Binary cross-entropy pairs with the single sigmoid output; Adam is used with
# its default settings and accuracy is tracked as the reporting metric.
model.compile(loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(),
        metrics=["accuracy"])

# Fit on the 80% training split and score the held-out 20% after every epoch.
# Fitting on every row would leave the hold-out unused and give no estimate of
# how the model behaves on rows it has not seen.
model.fit(x_train, y_train, epochs=15, validation_data=(x_test, y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7ff6dc2d6c70>

In [118]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_38 (Dense)            (None, 50)                1050      
                                                                 
 dropout_28 (Dropout)        (None, 50)                0         
                                                                 
 dense_39 (Dense)            (None, 50)                2550      
                                                                 
 dropout_29 (Dropout)        (None, 50)                0         
                                                                 
 dense_40 (Dense)            (None, 50)                2550      
                                                                 
 dropout_30 (Dropout)        (None, 50)                0         
                                                                 
 dense_41 (Dense)            (None, 1)               

In [119]:
# Save the trained model to a file in the current working directory
# (the .h5 extension selects Keras' HDF5 format).
model.save("tabular10.h5")