In [None]:
!pip install keras
!pip install tensorflow

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
import tensorflow as tf
import random
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score
from keras.layers import Dense, LeakyReLU

random_seed = 328#328
os.environ['PYTHONHASHSEED'] = str(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)
tf.compat.v1.set_random_seed(random_seed)

In [2]:
train = pd.read_csv('input/train.csv') # train data link
test = pd.read_csv('input/test.csv') # test data link

#train.head()

# Data preprocessing
### Delete duplicates and Check for unique values in each features

In [3]:
def scan_dup(df, delete = False):
    # Print number of duplicates for each line , duplicates deleted when 'delete' be True.
    print(f"# of duplicates rows : {df.duplicated().sum()}, ({np.round(100*df.duplicated().sum()/len(df),1)}%)")
    if delete :
        df.drop_duplicates(inplace = True)

def remove_badfeat(df_train, df_test, features):
    # Remove categorical features where train&test dataframe have different unique values
    for feat in features :
        if len(set(df_train[feat].unique()) ^ set(df_test[feat].unique()) )> 0 :
            print(f"Cat-feature {feat} has different values in train & test set \n")
            print(f" --> {feat} is deleted \n")
            print(df_train[feat].unique(), df_test[feat].unique() ,"\n")
            del df_train[feat]
            del df_test[feat]

###########
scan_dup(train)
print("\n")
train.nunique().sort_values(ascending=True)
#print("\n")
#test["product_code"].value_counts()

# of duplicates rows : 0, (0.0%)




failure               2
attribute_0           2
attribute_1           3
attribute_2           4
attribute_3           4
product_code          5
measurement_2        25
measurement_0        29
measurement_1        30
measurement_5      4671
measurement_4      4692
measurement_6      4704
measurement_9      4708
measurement_8      4713
measurement_3      4721
measurement_7      4734
measurement_13     5271
measurement_10     6177
measurement_14     6389
measurement_12     6392
measurement_11     6526
measurement_15     6577
measurement_16     7035
loading           11950
measurement_17    23612
id                26570
dtype: int64

### Check  missing values of each feature



In [4]:
train.isnull().sum()

id                   0
product_code         0
loading            250
attribute_0          0
attribute_1          0
attribute_2          0
attribute_3          0
measurement_0        0
measurement_1        0
measurement_2        0
measurement_3      381
measurement_4      538
measurement_5      676
measurement_6      796
measurement_7      937
measurement_8     1048
measurement_9     1227
measurement_10    1300
measurement_11    1468
measurement_12    1601
measurement_13    1774
measurement_14    1874
measurement_15    2009
measurement_16    2110
measurement_17    2284
failure              0
dtype: int64

In [5]:
train_gp = train.groupby('product_code').mean().T
test_gp = test.groupby('product_code').mean().T

data = pd.concat([train_gp,test_gp], axis = 1 ).corr(method = 'kendall')
data[data==1] = -1

#test['product_code'].value_counts()

  train_gp = train.groupby('product_code').mean().T
  test_gp = test.groupby('product_code').mean().T


### Features filtering

In [6]:
# Those having more than 50 unique values are considered numerical features
cat_feat = [feat for feat in train.columns if train[feat].nunique() < 50 and feat!= "attribute_2" and feat!="attribute_3" and feat!= 'failure' ]
num_feat = [feat for feat in train.columns if feat not in cat_feat and feat!= 'failure']

# Remove categorical features where train&test dataframe have different unique values
remove_badfeat(train,test,cat_feat)

# Train&Test sets both have same unique values in each column , encode the former feature as a binary variable
train['attribute_0'] = train['attribute_0'].map({'material_7':0,'material_5':1})
test['attribute_0'] = test['attribute_0'].map({'material_7':0,'material_5':1})

Cat-feature product_code has different values in train & test set 

 --> product_code is deleted 

['A' 'B' 'C' 'D' 'E'] ['F' 'G' 'H' 'I'] 

Cat-feature attribute_1 has different values in train & test set 

 --> attribute_1 is deleted 

['material_8' 'material_5' 'material_6'] ['material_6' 'material_7' 'material_5'] 

Cat-feature measurement_0 has different values in train & test set 

 --> measurement_0 is deleted 

[ 7 14 12 13  9 11  4 10  6  8 21 15 17 18 19 16  5 25  3  1 23 20 22  2
 26 24  0 29 27] [ 6 11  8 14 10 16  7 20  9  5  2 13  3  4 15 19 12 22 21 18 17 23  0 26
 24  1 25 29 30 28] 

Cat-feature measurement_1 has different values in train & test set 

 --> measurement_1 is deleted 

[ 8  3  1  2  4  6  0  9  5  7 10 12 11 13 17 14 16 15 18 20 24 22 21 19
 23 27 25 26 29 28] [ 9  8 12 11 16 18  7 15 19 10 13  6 14  5  2  4 17 25 22 21 23  3 20 26
 24 31 27 28  1 29 33 32  0] 

Cat-feature measurement_2 has different values in train & test set 

 --> measurement_2 is del

### Filling missing data and label encoding categorical features

In [7]:
train['measurement_avg'] = train[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)
test['measurement_avg'] = test[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

train['m3_missing'] = train['measurement_3'].isnull().astype(np.int8)
train['m5_missing'] = train['measurement_5'].isnull().astype(np.int8)

test['m3_missing'] = test['measurement_3'].isnull().astype(np.int8)
test['m5_missing'] = test['measurement_5'].isnull().astype(np.int8)

#measure_gp1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 17))]
measure_gp2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
train['measure_gp2_avg'] = np.mean(train[measure_gp2_cols], axis=1)
test['measure_gp2_avg'] = np.mean(test[measure_gp2_cols], axis=1)

train['attribute_2*3'] = train['attribute_2'] * train['attribute_3']
test['attribute_2*3'] = test['attribute_2'] * test['attribute_3']

#train.head()

# One-hot encoding
encoded_col = []
for col in encoded_col:
     tmp_train= pd.get_dummies(test[col], prefix = col)
     test = pd.merge(left = test, right = tmp_train, left_index = True, right_index = True)
test = test.drop(encoded_col, axis = 1)
#test.head()

x_train = train[train.columns.difference(["failure",'id',"attribute_2","attribute_3"])]
x_test = test[test.columns.difference(['id', "attribute_2","attribute_3"])]
y_train = train["failure"]

iter_imputer = IterativeImputer(max_iter = 8, random_state = 0, skip_complete = True, n_nearest_features = 12)

x_train = iter_imputer.fit_transform(x_train)
x_test = iter_imputer.transform(x_test)
#x_train.head()
#x_test.head()




# Train Model

In [8]:
number_of_features = x_train.shape[1]
model = tf.keras.models.Sequential([
    Dense(6,input_dim = number_of_features),
    LeakyReLU(alpha=0.06)
])

lr = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=1e-2,decay_steps=10000,decay_rate=0.9)#1e-2,0.9
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.add(tf.keras.layers.Dense(1,activation = 'sigmoid'))
model.compile(optimizer=optimizer, loss='binary_crossentropy',  metrics = ['accuracy'])#categorical_crossentropy
history = model.fit(x_train, y_train, epochs=100, validation_split = 0.2)# 100

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Save best model

In [9]:
from keras.models import load_model
model.save('109550135_model.h5') # In the same directory as this file