<a href="https://colab.research.google.com/github/ben900926/Intro_to_ML_final_project/blob/main/109550146_Final_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Modules

In [1]:
# download the input data (train.csv / test.csv) from Google Drive
!gdown "1GDmfpyUQJSR30OQGop1OfEPEos9sndrl" # train.csv https://drive.google.com/file/d/1GDmfpyUQJSR30OQGop1OfEPEos9sndrl/view?usp=sharing
!gdown "1JBxfTEXZCGWfmKFQWUlEcwlBYVnIRcUh" # test.csv https://drive.google.com/file/d/1JBxfTEXZCGWfmKFQWUlEcwlBYVnIRcUh/view?usp=sharing
# download the pickled model (saved as final_model.pkl, the file MODEL_WEIGHT_PATH points to)
!gdown "1-5d3PfMkJr7ln3xu5xQbZQsqlqqInJbq" # link: https://drive.google.com/file/d/1-5d3PfMkJr7ln3xu5xQbZQsqlqqInJbq/view?usp=sharing
# download the sample submission (saved as dont_click_me.csv)
!gdown "1-5xIIYDvwWfR0JSWqd5LNFX7K_mWLldN" # dont_click_me.csv https://drive.google.com/file/d/1-5xIIYDvwWfR0JSWqd5LNFX7K_mWLldN/view?usp=sharing

# install category_encoders, which provides the WOEEncoder used in the preprocessing pipeline
!pip install category_encoders

Downloading...
From: https://drive.google.com/uc?id=1GDmfpyUQJSR30OQGop1OfEPEos9sndrl
To: /content/train.csv
100% 3.95M/3.95M [00:00<00:00, 136MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JBxfTEXZCGWfmKFQWUlEcwlBYVnIRcUh
To: /content/test.csv
100% 3.06M/3.06M [00:00<00:00, 171MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-5d3PfMkJr7ln3xu5xQbZQsqlqqInJbq
To: /content/final_model.pkl
100% 330/330 [00:00<00:00, 514kB/s]
Downloading...
From: https://drive.google.com/uc?id=1-5xIIYDvwWfR0JSWqd5LNFX7K_mWLldN
To: /content/dont_click_me.csv
100% 532k/532k [00:00<00:00, 106MB/s]
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed cate

In [2]:
import pandas as pd
import os
import tensorflow as tf
import random
import numpy as np
import itertools

# model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, HuberRegressor, LogisticRegression
from sklearn.impute import KNNImputer

# auc metrics
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

# Pipeline Constructors
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer, RobustScaler
from category_encoders import WOEEncoder
import sklearn
# save model
import pickle

## Parameters

In [3]:
# seed handed to set_seeds() before the folds are fit
SEED = 153

########## If you re-ran the training notebook, point this at the "final_model.pkl" it produced ##########
MODEL_WEIGHT_PATH = "final_model.pkl"
########################################################################################################################

# path the generated submission csv is written to
SUBMISSION_PATH = "109550146_submission.csv"
# NOTE: the sample submission ("dont_click_me.csv") has no constant here; it is read by name in the submission cell

## Preprocessing

In [4]:
# read input data; the label column is split off so train_df and test_df share the same columns
train_df = pd.read_csv("train.csv")
target = train_df.pop("failure")
test_df = pd.read_csv("test.csv")

# use 2 production groups for valid set
# The codes are taken from train_df (not test_df): the validation folds are cut out of
# train_df, and the cross-validation cell further down derives them from train_df as well.
production = train_df["product_code"].unique()
# every unordered pair of training product codes
cmb_groups = list(itertools.combinations(production, 2))

In [5]:
def preprocessing(df_train, df_test):
    """Engineer features and impute missing measurements on train and test jointly.

    Steps, all performed on the row-wise concatenation of ``df_train`` and
    ``df_test`` (so imputation models are fit on both):

    1. Add ``area`` (attribute_2 * attribute_3) and the missing-value flags
       ``m3_null`` / ``m5_null``.
    2. Rank ``measurement_3`` .. ``measurement_16`` by the sum of their three
       largest absolute correlations with the other numeric columns and keep
       the ten highest. For each of them, and per product code, pick the four
       most correlated columns as predictors. ``measurement_17`` uses a
       hand-written predictor table instead.
    3. Per product code, fill a missing target with a ``HuberRegressor`` fit on
       those predictors (only rows whose predictors are all present), then
       fill whatever is still missing with ``KNNImputer``.
    4. Add ``measurement_avg``, the row mean of measurement_3..16.

    Note: earlier revisions sorted the ranking table by measurement *name*
    rather than by correlation sum (swapped column labels); the ranking below
    uses the correlation sum, as the step description intends.

    Args:
        df_train: training frame without the label column.
        df_test: test frame with the same columns as ``df_train``.
            Every ``product_code`` present must be one of 'A'..'I', the keys
            of the hand-written ``measurement_17`` table.

    Returns:
        ``(df_train, df_test, features)``: the processed train rows, the
        processed test rows (same row order as the inputs) and the list of
        column names intended as model inputs.
    """
    n_train = df_train.shape[0]
    n_selected = 10  # how many measurements get a regression-based imputation
    data = pd.concat([df_train, df_test])

    # new attributes inspired by the competition discussion
    data['area'] = data['attribute_2'] * data['attribute_3']
    data['m3_null'] = data['measurement_3'].isnull().astype(np.int64)
    data['m5_null'] = data['measurement_5'].isnull().astype(np.int64)

    # hand-picked predictors of measurement_17 for every product code
    full_fill_dict = {}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
        'B': ['measurement_3', 'measurement_4', 'measurement_5', 'measurement_7', 'measurement_9'],
        'C': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9'],
        'D': ['measurement_3', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
        'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8', 'measurement_9'],
        'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
        'G': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8', 'measurement_9'],
        'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
        'I': ['measurement_3', 'measurement_4', 'measurement_7', 'measurement_8', 'measurement_9']
    }

    # columns left out of the correlation analysis (everything that is not a measurement, plus the flags)
    col = [c for c in df_test.columns if 'measurement' not in c] + ['loading', 'm3_null', 'm5_null']
    product_codes = data.product_code.unique()

    # absolute correlation matrix over all rows; computed once instead of once per candidate
    abs_corr = data.drop(col, axis=1).corr().abs()
    # score of a candidate = sum of its three strongest correlations with *other* columns
    corr_sum = pd.Series({
        name: np.round(abs_corr[name].drop(name).nlargest(3).sum(), 3)
        for name in (f'measurement_{x}' for x in range(3, 17))
    })

    # per-product-code correlation matrices; data is not modified until the imputation loop below
    abs_corr_by_code = {
        code: data[data.product_code == code].drop(col, axis=1).corr().abs()
        for code in product_codes
    }

    # select the measurements with the highest correlation score and, per product code,
    # the four columns most correlated with each of them,
    # e.g. {'A': ['measurement_17', 'measurement_2', 'measurement_0', 'measurement_3'], ...}
    for measurement_col in corr_sum.nlargest(n_selected).index:
        full_fill_dict[measurement_col] = {
            code: abs_corr_by_code[code][measurement_col]
            .drop(measurement_col)
            .nlargest(4)
            .index.tolist()
            for code in product_codes
        }

    feature = [f for f in data.columns if f.startswith('measurement') or f == 'loading']

    for code in product_codes:
        in_code = data.product_code == code

        # use the highly correlated measurements to predict a missing measurement_col
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]

            # rows of this code whose target is missing while every predictor is present
            measure_null_only = (
                in_code
                & data[column].notnull().all(axis=1)
                & data[measurement_col].isnull()
            )
            if not measure_null_only.any():
                # nothing to fill; predicting on zero rows would raise
                continue

            # fit on the rows of this code where target and predictors are all present
            tmp_train = data.loc[in_code, column + [measurement_col]].dropna(how='any')

            # HuberRegressor (linear regression that is robust to outliers) predicts the missing values
            model = HuberRegressor(epsilon=1.9, max_iter=500)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[measure_null_only, measurement_col] = model.predict(
                data.loc[measure_null_only, column]
            )

        # KNN imputer fills whatever is still missing for this product code
        model1 = KNNImputer(n_neighbors=3)
        data.loc[in_code, feature] = model1.fit_transform(data.loc[in_code, feature])

    # average of measurement_3 .. measurement_16
    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    # split back by position: the first n_train rows are the training rows
    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]
    features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'area', 'm3_null', 'm5_null', 'measurement_avg']

    return df_train, df_test, features

In [6]:
# run the feature engineering / imputation on the raw frames
x_train, x_test, features = preprocessing(df_train=train_df, df_test=test_df)

# per-column encoders; columns not listed here are passed through unchanged
column_encoders = make_column_transformer(
    (WOEEncoder(), ['attribute_0']),  # encode the string column numerically
    (FunctionTransformer(np.log1p), ['loading']),  # log(1 + x) on loading
    remainder='passthrough',
)

# full preprocessing pipeline: column encoders followed by robust scaling
preprocessing_pp = make_pipeline(column_encoders, RobustScaler())

# cross validation
# every pair of training product codes is held out once as the validation set
production = train_df["product_code"].unique()
cmb_groups = list(itertools.combinations(production, 2))

train_index = []
valid_index = []

# pick out the rows belonging to each held-out pair of codes
for group in cmb_groups:
  in_group_zero = (train_df["product_code"] == group[0]).to_numpy()
  in_group_one = (train_df["product_code"] == group[1]).to_numpy()

  # validation rows: all rows of group[0] followed by all rows of group[1]
  tmp_list = list(train_df.index[in_group_zero]) + list(train_df.index[in_group_one])

  # training rows: every remaining row, in original row order
  # (a boolean mask instead of a set difference, so the order does not depend on set iteration)
  train_set = list(train_df.index[~(in_group_zero | in_group_one)])

  train_index.append(train_set)
  valid_index.append(tmp_list)

## Make Prediction

In [7]:
# seed every random number generator used by the notebook
def set_seeds(seed):
  """Seed Python's ``random``, TensorFlow and NumPy with the same ``seed``."""
  for seeder in (random.seed, tf.random.set_seed, np.random.seed):
    seeder(seed)

# main train loop here
def score(input_model):
  """Average the predicted failure probability for the test set over all folds.

  For every entry of the global ``train_index`` a fresh clone of
  ``preprocessing_pp`` and of ``input_model`` is fit on that fold's training
  rows. The positive-class probabilities predicted for ``x_test`` are then
  averaged with equal weight per fold.

  Args:
    input_model: scikit-learn style classifier exposing ``predict_proba``;
      it is cloned, so the passed object itself is never fit.

  Returns:
    1-D numpy array with one averaged probability per row of ``x_test``.
  """
  set_seeds(seed=SEED)

  # predict on exactly the columns the model is fit on, instead of relying on the
  # column transformer to pick them by name out of the full frame
  test_features = x_test[features]
  n_folds = len(train_index)

  # running average of the per-fold predictions
  test_preds = np.zeros((x_test.shape[0],))

  for train_ in train_index:
    # training data; rows are selected by position for both features and labels
    x_train_ = x_train[features].iloc[train_, :].copy()
    y_train = target.iloc[train_].copy()

    # define model and train
    model = make_pipeline(
        clone(preprocessing_pp),
        clone(input_model)
    )
    model.fit(x_train_, y_train)

    # probability of the positive class (column 1), weighted 1 / n_folds
    test_preds += model.predict_proba(test_features)[:, 1] / n_folds

  return test_preds

In [8]:
# load model
# WARNING: pickle.load can execute arbitrary code while unpickling. The file at
# MODEL_WEIGHT_PATH is downloaded from Google Drive in the first cell, so only run
# this with a file you trust.
with open(MODEL_WEIGHT_PATH, "rb") as pklFile:
  model = pickle.load(pklFile)

# making submission
# start from the sample submission and overwrite its 'failure' column with the
# fold-averaged probabilities returned by score(); values are assigned by position,
# so the sample file is presumably in the same row order as test.csv (TODO confirm)
submission = pd.read_csv("dont_click_me.csv")
submission['failure'] = score(
    model
)

# write the result without the pandas index column
submission.to_csv(SUBMISSION_PATH, index=False)