In [None]:
!pip install feature_engine
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# for google colab
from google.colab import drive
# Mount Google Drive at /content/gdrive so the files stored there become
# reachable from this runtime.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# for google colab
# Copy all files from the "Final" directory in Google Drive to the current directory.
# The relative ./gdrive path only resolves while the working directory is the
# parent of the mount point (/content, given the mount at /content/gdrive).
!cp -r ./gdrive/MyDrive/Final/* .

In [None]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
import gc; gc.enable()
from lightgbm import LGBMClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from feature_engine.encoding import WoEEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, HuberRegressor
from keras.models import load_model
import warnings; warnings.filterwarnings("ignore")
from tensorflow.keras import Sequential  
from tensorflow.keras import layers
import tensorflow as tf
import tensorflow_addons as tfa

pd.options.display.max_columns = 999

In [None]:
# Read the three competition files from the working directory, in the order
# train set, test set, submission template.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# 1. fill the missing value (HuberRegressor + KNNImputer) 2. change attribute 0 to woe
def preprocessing(df_train, df_test):
  """Impute missing values, engineer features and WoE-encode ``attribute_0``.

  Train and test rows are stacked so that imputation uses both. The steps are:

    1. Add ``m3_missing`` / ``m5_missing`` (1 where ``measurement_3`` /
       ``measurement_5`` is null) and ``area`` (``attribute_2 * attribute_3``).
    2. Choose the columns to fill with a regression model: ``measurement_17``
       with hand-picked predictors per product code, plus the 10 columns among
       ``measurement_3`` .. ``measurement_16`` whose three strongest absolute
       correlations sum highest. For each of those 10, the 4 most correlated
       columns inside each product code become its predictors.
    3. Per product code and target column, fit a ``HuberRegressor`` on the
       complete rows and fill the target where it is missing while all of its
       predictors are present.
    4. Per product code, fill what is still missing in ``loading`` and
       ``measurement_0`` .. ``measurement_17`` with a 3-neighbour ``KNNImputer``.
    5. Add ``measurement_avg``, the row mean of ``measurement_3`` ..
       ``measurement_16``.
    6. Fit a ``WoEEncoder`` for ``attribute_0`` on the train rows (target
       ``failure``) and apply it to the test rows.

  Args:
    df_train: training frame; must provide ``failure`` in addition to the
      feature columns read below.
    df_test: test frame with the same feature columns.

  Returns:
    The processed test rows only (the WoE-transformed tail of the stacked frame).

  Raises:
    KeyError: if a product code is missing from the hand-written
      ``measurement_17`` predictor table (codes 'A' .. 'I').
  """
  measurement_cols = ['measurement_' + str(i) for i in range(18)]

  # Predictor table: target column -> product code -> predictor columns.
  # Only 'measurement_17' is selected by hand; the rest is filled in below.
  full_fill_dict = {}
  full_fill_dict['measurement_17'] = {
      'A': ['measurement_5','measurement_6','measurement_8'],
      'B': ['measurement_4','measurement_5','measurement_7'],
      'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
      'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
      'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
      'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
      'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
      'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
      'I': ['measurement_3','measurement_7','measurement_8']
  }

  # data = train + test => take both train and test data into consideration
  data = pd.concat([df_train, df_test])
  # Indicator columns recording where measurement_3 / measurement_5 were missing.
  data['m3_missing'] = 1 * data['measurement_3'].isnull()
  data['m5_missing'] = 1 * data['measurement_5'].isnull()
  data['area'] = data['attribute_2'] * data['attribute_3']

  # Rank measurement_3 .. measurement_16 by the summed absolute correlation
  # with their three most correlated columns.
  # NOTE(review): 'failure' and 'area' are part of the candidate columns, so
  # they can be picked as predictors too -- confirm this is intended.
  correlated_data = data[measurement_cols + ['failure', 'area']]
  # DataFrame.corr() computes the pairwise correlation between columns. It does
  # not depend on the loop variable, so it is computed once instead of once
  # per ranked column.
  abs_corr = np.absolute(correlated_data.corr())
  val = []
  col = []
  for x in range(3, 17):
    target = 'measurement_' + str(x)
    # Position 0 after sorting is the column's correlation with itself; skip it.
    total_val = np.sum(abs_corr[target].sort_values(ascending=False).iloc[1:4])
    val.append(np.round(total_val, 3))
    col.append(target)

  c = pd.DataFrame()
  c['corelated columns'] = col
  c['correlated value'] = val
  c = c.sort_values(by='correlated value', ascending=False).reset_index(drop=True)

  product_codes = data['product_code'].unique()
  # One absolute correlation matrix per product code, shared by all targets.
  abs_corr_by_code = {
      code: np.absolute(correlated_data[data['product_code'] == code].corr())
      for code in product_codes
  }

  # Keep the 10 best-ranked columns; for each one, the 4 most correlated
  # columns inside every product code become its predictors.
  for measurement_col in c['corelated columns'].iloc[:10].tolist():
    fill_dict = {}
    for code in product_codes:
      ranked = abs_corr_by_code[code][measurement_col].sort_values(ascending=False)
      fill_dict[code] = ranked.iloc[1:5].index.tolist()
    full_fill_dict[measurement_col] = fill_dict

  knn_features = ['loading'] + measurement_cols
  for code in product_codes:
    # product_code is never modified, so the row mask is stable for this code.
    code_mask = data['product_code'] == code

    # Regression-based filling with HuberRegressor.
    for measurement_col in list(full_fill_dict.keys()):
      column = full_fill_dict[measurement_col][code]
      # Rows of this product code where target and predictors are all present.
      tmp_train = data.loc[code_mask, column + [measurement_col]].dropna(how='any')
      # Rows of this product code with the target missing and predictors complete.
      fill_mask = (
          code_mask
          & (data[column].isnull().sum(axis=1) == 0)
          & (data[measurement_col].isnull())
      )
      # Estimator fit/predict raise on empty input; when there is nothing to
      # learn from or nothing to fill, leave the column to the KNN step below.
      if tmp_train.empty or not fill_mask.any():
        continue
      model = HuberRegressor(epsilon=1.9)
      model.fit(tmp_train[column], tmp_train[measurement_col])
      data.loc[fill_mask, measurement_col] = model.predict(data.loc[fill_mask, column])

    # Whatever is still missing is imputed from the 3 nearest neighbours
    # within the same product code.
    model1 = KNNImputer(n_neighbors=3)
    data.loc[code_mask, knn_features] = model1.fit_transform(data.loc[code_mask, knn_features])

  data['measurement_avg'] = data[['measurement_' + str(i) for i in range(3, 17)]].mean(axis=1)

  # Split the stacked frame back by position; the train row count is captured
  # before anything is rebound so the boundary cannot drift.
  n_train = len(df_train)
  train_part = data.iloc[:n_train, :]
  test_part = data.iloc[n_train:, :]

  # Replace the categories of attribute_0 by their weight of evidence, learned
  # on the train rows only.
  woe_encoder = WoEEncoder(variables=['attribute_0'])
  woe_encoder.fit(train_part, train_part['failure'])
  return woe_encoder.transform(test_part)

In [None]:
# Run the imputation / encoding pipeline; only the processed test rows come back.
df_test = preprocessing(train, test)

# Column order of the matrix handed to the model.
# NOTE(review): 'measurement_17' is listed twice (once up front, once in the
# numbered run), giving 25 columns. Presumably the saved model was trained on
# this exact layout -- confirm before de-duplicating.
features = (
    ['loading', 'attribute_0', 'measurement_17']
    + ['measurement_' + str(idx) for idx in range(18)]
    + ['area', 'm3_missing', 'm5_missing', 'measurement_avg']
)


-------- Product code A ----------

filled by linear model :
measurement_17 : 386
measurement_8 : 167
measurement_11 : 225
measurement_5 : 113
measurement_6 : 146
measurement_7 : 153
measurement_4 : 79
measurement_15 : 273
measurement_10 : 209
measurement_16 : 293
measurement_14 : 237

2281 filled by linear model 
1568 filled by KNN 

-------- Product code B ----------

filled by linear model :
measurement_17 : 418
measurement_8 : 165
measurement_11 : 220
measurement_5 : 83
measurement_6 : 106
measurement_7 : 176
measurement_4 : 80
measurement_15 : 294
measurement_10 : 197
measurement_16 : 358
measurement_14 : 330

2427 filled by linear model 
1548 filled by KNN 

-------- Product code C ----------

filled by linear model :
measurement_17 : 391
measurement_8 : 211
measurement_11 : 231
measurement_5 : 141
measurement_6 : 150
measurement_7 : 140
measurement_4 : 110
measurement_15 : 319
measurement_10 : 262
measurement_16 : 343
measurement_14 : 340

2638 filled by linear model 
1706 fill

In [None]:
X = ['A', 'B', 'C', 'D', 'E']

# Every unordered pair of product codes, in the order (A,B), (A,C), ..., (D,E).
held_out_pairs = [
    (first, second)
    for pos, first in enumerate(X)
    for second in X[pos + 1:]
]

# '#1' .. '#10' -> [the three remaining codes, the two paired codes]
folds_dict = {
    '#' + str(fold_no): [[code for code in X if code not in pair], list(pair)]
    for fold_no, pair in enumerate(held_out_pairs, start=1)
}

In [None]:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load a model.joblib that comes from a trusted source.
model = joblib.load('model.joblib')

# Build the feature matrix once instead of re-extracting it on every iteration.
test_features = df_test[features].values
# Take the divisor from the fold table instead of hard-coding 10, so the
# average stays correct if the number of folds ever changes.
n_folds = len(folds_dict)
test_predictions = np.zeros((df_test.shape[0], 1))

# NOTE(review): the same loaded model is used for every fold, so this average
# is n_folds copies of one prediction. If per-fold models were intended, load
# them here; otherwise a single predict call would be enough.
for fold in folds_dict.keys():
  test_pred = model.predict(test_features).reshape(-1, 1)
  test_predictions += test_pred / n_folds



In [None]:
# Store the averaged test predictions in the `failure` column of the
# submission template, write it to submission.csv (without the index), and
# copy the file back into the Drive project folder.
submission['failure'] = test_predictions
submission.to_csv('submission.csv', index=False)
!cp submission.csv ./gdrive/MyDrive/Final/submission.csv 