In [1]:
!pip install feature_engine
!pip install tensorflow_addons
!pip install flake8 pycodestyle_magic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.5.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.0/290.0 KB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.5.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_addons
Successfully installed tensorflow_addons-0.19.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flake8
  Downloading flake8-6.0.0-py2.py

In [2]:
# Google Colab only: this cell needs the `google.colab` package, which is
# provided by the Colab runtime.
from google.colab import drive
# Mount your Google Drive at /content/gdrive so later cells can read from and
# write to it.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Google Colab only.
# Copy every file from the "Final" directory in Google Drive into the current
# working directory.
# NOTE(review): the relative ./gdrive path only resolves if the working
# directory is /content (the parent of the mount point) - confirm.
!cp -r ./gdrive/MyDrive/Final/* .

In [15]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
import gc
from lightgbm import LGBMClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from feature_engine.encoding import WoEEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, HuberRegressor
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.layers import Flatten, Conv1D, MaxPooling1D, BatchNormalization
from keras import backend as K
import warnings
import tensorflow as tf
import tensorflow_addons as tfa
# Make sure automatic garbage collection is switched on (it is the CPython
# default, so this is only an explicit safeguard).
gc.enable()
# Silence everything emitted through the `warnings` module for the rest of
# the session; deprecation and convergence warnings will not be shown.
warnings.filterwarnings("ignore")

In [5]:
# Read the three CSV files from the current working directory: the training
# set, the test set and the sample submission.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Fix TensorFlow's global random seed.
tf.random.set_seed(42)

# Data Preprocessing

In [19]:
def preprocessing(df_train, df_test):
    """Impute missing measurements and build the engineered training frame.

    Train and test rows are stacked so that the correlation ranking and the
    imputers see both. For every product code, missing values of selected
    measurement columns are first predicted by a ``HuberRegressor`` fitted on
    that column's most correlated columns; whatever is still missing in
    ``loading`` / ``measurement_0`` .. ``measurement_17`` is then filled by a
    ``KNNImputer``. Finally ``attribute_0`` of the training rows is replaced
    by its weight of evidence with respect to ``failure``.

    Args:
        df_train: training frame. Must provide ``product_code``, ``loading``,
            ``attribute_0``, ``attribute_2``, ``attribute_3``, ``failure``
            and ``measurement_0`` .. ``measurement_17``.
        df_test: test frame, concatenated below ``df_train``. Every product
            code found in either frame must be one of the keys 'A'..'I' of
            the hand-written ``measurement_17`` table below, otherwise the
            lookup raises ``KeyError``.

    Returns:
        The first ``len(df_train)`` rows of the processed data, with the
        added columns ``m3_missing``, ``m5_missing``, ``area`` and
        ``measurement_avg`` and with ``attribute_0`` WoE-encoded.
    """
    # Hand-picked predictor columns for measurement_17, per product code.
    full_fill_dict = {}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_5', 'measurement_6', 'measurement_8'],
        'B': ['measurement_4', 'measurement_5', 'measurement_7'],
        'C': ['measurement_5', 'measurement_7', 'measurement_8',
              'measurement_9'],
        'D': ['measurement_5', 'measurement_6', 'measurement_7',
              'measurement_8'],
        'E': ['measurement_4', 'measurement_5', 'measurement_6',
              'measurement_8'],
        'F': ['measurement_4', 'measurement_5', 'measurement_6',
              'measurement_7'],
        'G': ['measurement_4', 'measurement_6', 'measurement_8',
              'measurement_9'],
        'H': ['measurement_4', 'measurement_5', 'measurement_7',
              'measurement_8', 'measurement_9'],
        'I': ['measurement_3', 'measurement_7', 'measurement_8']
    }

    # data = train + test => take both train and test data into consideration
    data = pd.concat([df_train, df_test])
    # Flag rows whose measurement_3 / measurement_5 were originally missing
    # (the flags are taken before any imputation) and add the area feature.
    data['m3_missing'] = 1 * data['measurement_3'].isnull()
    data['m5_missing'] = 1 * data['measurement_5'].isnull()
    data['area'] = data['attribute_2'] * data['attribute_3']

    # Columns taking part in the correlation analysis.
    measurement_cols = ['measurement_' + str(i) for i in range(18)]
    correlated_data = data[measurement_cols + ['failure', 'area']]

    # DataFrame.corr() gives the pairwise correlation between the columns.
    # The matrix does not change while ranking, so compute it only once.
    abs_corr = correlated_data.corr().abs()

    # Score measurement_3 .. measurement_16 by the summed absolute
    # correlation with their 3 most correlated columns (position 0 of the
    # sorted column is the self-correlation and is skipped).
    col = ['measurement_' + str(x) for x in range(3, 17)]
    val = []
    for name in col:
        strongest = abs_corr[name].sort_values(ascending=False).iloc[1:4]
        val.append(np.round(np.sum(strongest), 3))

    c = pd.DataFrame()
    c['corelated columns'] = col
    c['correlated value'] = val
    c = c.sort_values(
              by='correlated value',
              ascending=False).reset_index(drop=True)

    # Row masks and absolute correlation matrices per product code.
    # product_code and correlated_data are never modified afterwards, so
    # these can be shared by every loop below.
    product_codes = data['product_code'].unique()
    code_masks = {code: data['product_code'] == code
                  for code in product_codes}
    abs_corr_by_code = {code: correlated_data[code_masks[code]].corr().abs()
                        for code in product_codes}

    # For the 10 best-scored measurements, record per product code the
    # 4 columns most correlated with them (again skipping the
    # self-correlation). measurement_17 keeps its hand-written table.
    for measurement_col in c.iloc[:10, 0]:
        full_fill_dict[measurement_col] = {
            code: (abs_corr_by_code[code][measurement_col]
                   .sort_values(ascending=False)
                   .iloc[1:5].index.tolist())
            for code in product_codes
        }

    knn_features = ['loading'] + measurement_cols

    # start running depends on product code
    for code in product_codes:
        in_code = code_masks[code]

        # use HuberRegressor to fill the missing value
        for measurement_col, predictors_by_code in full_fill_dict.items():
            # predictor columns chosen for this product code
            column = predictors_by_code[code]
            # Rows that can be filled: target missing, all predictors
            # present. Recomputed every time because earlier fills may have
            # completed some of the predictors.
            fillable = (
                in_code
                & (data[column].isnull().sum(axis=1) == 0)
                & data[measurement_col].isnull())
            if not fillable.any():
                # Nothing to predict; scikit-learn rejects empty input.
                continue
            # Training rows: current product code, no missing value in the
            # predictors or the target.
            tmp_train = data.loc[
                in_code, column + [measurement_col]].dropna(how='any')
            model = HuberRegressor(epsilon=1.9)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[fillable, measurement_col] = model.predict(
                data.loc[fillable, column])

        # use KNNImputer to fill whatever is still missing in this product
        # code (imputation by the k nearest neighbours, k = 3)
        model1 = KNNImputer(n_neighbors=3)
        data.loc[in_code, knn_features] = model1.fit_transform(
            data.loc[in_code, knn_features])

    # measurement 3 - 16 looks like they belong to the same group
    data['measurement_avg'] = data[
        ['measurement_' + str(i) for i in range(3, 17)]].mean(axis=1)

    # Keep only the training rows and replace the categories of attribute_0
    # by their weight of evidence.
    train_part = data.iloc[:len(df_train), :]
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(train_part, train_part['failure'])

    return woe_encoder.transform(train_part)

# Build the imputed, feature-engineered training frame from the raw train and
# test tables; only the training rows are returned.
df_train = preprocessing(train, test)

# Build Model

In [7]:
def build_model(total_col):
    """Create the (uncompiled) 1-D convolutional binary classifier.

    The network is four Conv1D -> BatchNormalization -> MaxPooling1D stages
    with 32, 64, 128 and 256 filters. The first three stages end with a
    Dropout(0.2); the last one is flattened, passed through Dropout(0.2) and
    fed to a single sigmoid unit.

    Args:
        total_col: currently unused; the input shape is fixed to (25, 1).

    Returns:
        The assembled ``Sequential`` model.
    """
    conv_options = dict(kernel_size=3, padding='same', activation='relu')
    pool_options = dict(pool_size=2, padding='same')

    # First stage carries the fixed input shape.
    stack = [
        Conv1D(filters=32, input_shape=(25, 1), **conv_options),
        BatchNormalization(),
        MaxPooling1D(**pool_options),
        Dropout(0.2),
    ]
    # Two identical middle stages that only differ in width.
    for width in (64, 128):
        stack.extend([
            Conv1D(filters=width, **conv_options),
            BatchNormalization(),
            MaxPooling1D(**pool_options),
            Dropout(0.2),
        ])
    # Last stage followed by the classification head.
    stack.extend([
        Conv1D(filters=256, **conv_options),
        BatchNormalization(),
        MaxPooling1D(**pool_options),
        Flatten(),
        Dropout(0.2),
        Dense(units=1, activation='sigmoid'),
    ])

    net = Sequential()
    for layer in stack:
        net.add(layer)
    return net

In [8]:
# Model input columns, in the order they are fed to the network.
# 'measurement_17' appears twice (once up front, once in the numbered run);
# with that duplicate the list has 25 entries, the width build_model hard-codes
# in its input shape.
features = (
    ['loading', 'attribute_0', 'measurement_17']
    + [f'measurement_{idx}' for idx in range(18)]
    + ['area', 'm3_missing', 'm5_missing', 'measurement_avg']
)

# Cross Validation

In [9]:
X = ['A', 'B', 'C', 'D', 'E']

# Every unordered pair of product codes, in lexicographic order of position.
held_out_pairs = [
    (X[first], X[second])
    for first in range(len(X))
    for second in range(first + 1, len(X))
]

# '#1' .. '#10' -> [codes kept for training, the two held-out codes]
folds_dict = {
    '#' + str(fold_no): [
        [code for code in X if code not in pair],
        list(pair),
    ]
    for fold_no, pair in enumerate(held_out_pairs, start=1)
}

# Training

In [10]:
# Train one model per split in folds_dict (10 splits): each split trains on
# three product codes and validates on the two held-out ones.
for num, (train_codes, valid_codes) in folds_dict.items():
    print(f'fold {num}')

    # Features and labels come from the same filtered rows, so filter once
    # per side; both historical names are kept bound to that frame.
    X_df_train = y_df_train = df_train[
        df_train['product_code'].isin(train_codes)]
    X_df_valid = y_df_valid = df_train[
        df_train['product_code'].isin(valid_codes)]
    X_train = X_df_train[features].values
    y_train = y_df_train['failure'].values
    X_valid = X_df_valid[features].values
    y_valid = y_df_valid['failure'].values

    # build_model expects the number of feature columns, not of rows.
    model = build_model(len(features))

    # Stops on the callback's default monitored quantity and restores the
    # best weights seen.
    es_callbacks = tf.keras.callbacks.EarlyStopping(
           patience=10, restore_best_weights=True)

    # Shrinks the learning rate when the training AUC stops improving.
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
           monitor="auc", factor=0.9, patience=5,
           verbose=0, mode="max", min_delta=0.0001,
           cooldown=0, min_lr=0)

    # The metric is named explicitly: with metrics=["AUC"] Keras numbers the
    # metric of every later model ("auc_1", "auc_2", ...), so the scheduler's
    # monitor="auc" would only exist in the first fold.
    model.compile(optimizer=tfa.optimizers.AdamW(
                learning_rate=1e-3, weight_decay=1e-3),
                loss="BinaryCrossentropy",
                metrics=[tf.keras.metrics.AUC(name="auc")])

    model.fit(
         X_train, y_train, batch_size=64, epochs=200,
         callbacks=[es_callbacks, lr_scheduler],
         validation_data=(X_valid, y_valid))

fold #1
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
fold #2
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
fold #3
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
fold #4
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epo

In [11]:
# `model` is whatever the training loop assigned last, so only the network of
# the final fold is written to model.h5; models of earlier folds are not saved.
model.save('model.h5')
# Google Colab only: copy the saved model to the "Final" directory on Drive.
!cp model.h5 ./gdrive/MyDrive/Final/model.h5