In [12]:
! pip install feature_engine

[0m

In [13]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import WoEEncoder
from sklearn.linear_model import HuberRegressor
from sklearn.impute import KNNImputer
import csv

In [14]:
# Seed every random number generator with one value so repeated runs give the same result
seed = 123
np.random.seed(seed)
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)
# Turn off cuDNN auto-tuning and request deterministic cuDNN algorithms
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [15]:
# Directory holding train.csv / test.csv, and directory holding the saved model weights
INPUT_PATH = '/kaggle/input/tabular-playground-series-aug-2022/'
MODEL_WEIGHT_PATH = '/kaggle/input/model-weights'
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [16]:
class ValidationDataset(Dataset):
    """Label-free Dataset that serves the rows of a feature matrix one at a time."""

    def __init__(self, X):
        # X must support integer indexing and expose `.shape`
        self.X = X

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``X``."""
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        sample = self.X[index]
        return sample

In [17]:
# Load both competition tables, using the 'id' column as the row index
df_train, df_test = (
    pd.read_csv(os.path.join(INPUT_PATH, file_name), index_col='id')
    for file_name in ('train.csv', 'test.csv')
)
# Take the label column out of the training frame and keep it as the target
target = df_train.pop('failure')

In [18]:
def Preprocessing(df_train, df_test, target):
    """Impute, engineer, encode and scale the 10 features used by the model.

    Steps:
      1. Stack train and test rows so imputation is done per product code over all rows.
      2. For 'measurement_17' (hand-picked predictors) and the 10 measurement columns
         (out of measurement_3..16) with the largest summed top-3 absolute correlations,
         fill missing values with a HuberRegressor fitted per product code.
      3. Fill whatever is still missing in the measurement columns and 'loading'
         with a per-product-code KNNImputer.
      4. Build the 10 output columns, WoE-encode 'attribute_0' (fitted on the training
         rows and `target`), and standardise with a scaler fitted on the training rows.

    Args:
        df_train: training features (label column already removed).
        df_test: test features with the same columns as `df_train`.
        target: labels for `df_train`, used only to fit the WoE encoder.

    Returns:
        (np_train, np_test): scaled feature arrays with columns in the order
        m3_missing, m5_missing, area, measurement_avg, loading, attribute_0,
        measurement_17, measurement_0, measurement_1, measurement_2.
    """
    n_train = df_train.shape[0]

    # Concatenate training and testing data
    data = pd.concat([df_train, df_test])

    # column to impute -> {product code -> predictor columns}.
    # 'measurement_17' is added manually (it is the most important measurement column).
    # Insertion order matters: columns are imputed in this order further down.
    most_correlated = {
        'measurement_17': {
            'A': ['measurement_5','measurement_6','measurement_8'],
            'B': ['measurement_4','measurement_5','measurement_7'],
            'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
            'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
            'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
            'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
            'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
            'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
            'I': ['measurement_3','measurement_7','measurement_8']
        }
    }

    m_cols = [f'measurement_{i}' for i in range(18)]
    product_codes = data.product_code.unique()

    # The correlation matrix does not change inside the loop, so compute it once.
    abs_corr = data[m_cols].corr().abs()

    # From measurement_3 to measurement_16: sum of the 3 largest absolute correlations
    # (position 0 after sorting is the column's correlation with itself, hence 1:4).
    candidates = m_cols[3:17]
    scores = np.array(
        [abs_corr[col].sort_values(ascending=False).iloc[1:4].sum() for col in candidates],
        dtype=float,
    )
    # Rank numerically, in descending order. Keeping names and scores in one array
    # would turn the scores into strings and sort them lexicographically.
    ranking = np.argsort(scores)[::-1]

    # One correlation matrix per product code, reused for every ranked column.
    abs_corr_by_code = {
        code: data.loc[data.product_code == code, m_cols].corr().abs()
        for code in product_codes
    }
    for position in ranking[:10]:
        # For the 10 most correlated measurement columns,
        # store the 4 columns most correlated with each of them, per product code
        cur_col = candidates[position]
        most_correlated[cur_col] = {
            code: abs_corr_by_code[code][cur_col]
            .sort_values(ascending=False)
            .iloc[1:5]
            .index.tolist()
            for code in product_codes
        }

    # Features that need imputation (measurement columns + loading)
    features = m_cols + ['loading']

    for code in product_codes:
        code_mask = data.product_code == code

        # Impute features according to product code
        for cur_col, predictors_by_code in most_correlated.items():
            # For columns that are highly correlated to other columns,
            # impute with a linear model (HuberRegressor)
            corr_cols = predictors_by_code[code]
            # Rows to fill: target missing, every predictor present
            fill_mask = (
                code_mask
                & data[cur_col].isnull()
                & data[corr_cols].notnull().all(axis=1)
            )
            train_rows = data.loc[code_mask, corr_cols + [cur_col]].dropna(how='any')
            if not fill_mask.any() or train_rows.empty:
                # Nothing to predict (or nothing to fit on): scikit-learn raises on
                # empty input, so leave any remaining gaps to the KNN imputer below.
                continue

            linear_model = HuberRegressor(epsilon=1.9, max_iter=400)
            linear_model.fit(train_rows[corr_cols], train_rows[cur_col])
            data.loc[fill_mask, cur_col] = linear_model.predict(data.loc[fill_mask, corr_cols])

        # For everything still missing, use KNN imputer
        knn_model = KNNImputer(n_neighbors=3)
        data.loc[code_mask, features] = knn_model.fit_transform(data.loc[code_mask, features])

    # DataFrame of preprocessed data, we will have a total of 10 features
    preprocessed_data = pd.DataFrame()
    # New features
    # NOTE(review): measurement_3 and measurement_5 are part of `features` and have
    # already been imputed above, so these two indicators are always 0 here. They were
    # presumably meant to be captured before imputation. Left unchanged on purpose:
    # the pre-trained weights loaded later in this notebook were presumably fitted on
    # exactly these feature values -- fix only together with retraining.
    preprocessed_data['m3_missing'] = data['measurement_3'].isnull().astype(np.int32)
    preprocessed_data['m5_missing'] = data['measurement_5'].isnull().astype(np.int32)
    preprocessed_data['area'] = data['attribute_2'] * data['attribute_3']
    preprocessed_data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)
    # Old features
    useful_cols = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2']
    preprocessed_data[useful_cols] = data[useful_cols]

    # Split training and testing data (train rows come first in the concatenation)
    train_part = preprocessed_data.iloc[:n_train]
    test_part = preprocessed_data.iloc[n_train:]

    # Encode 'attribute_0' with WoEEncoder (Weight of Evidence), fitted on train only
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(train_part, target)
    train_part = woe_encoder.transform(train_part)
    test_part = woe_encoder.transform(test_part)

    # Scale data with statistics from the training rows
    scaler = StandardScaler()
    np_train = scaler.fit_transform(train_part)
    np_test = scaler.transform(test_part)

    return np_train, np_test

In [19]:
# Run the preprocessing pipeline; both results are scaled NumPy feature matrices
X_train, X_test = Preprocessing(df_train=df_train, df_test=df_test, target=target)

In [21]:
# Wrap the test features in a Dataset and serve them in their original order,
# 500 rows per batch, keeping the final partial batch
val_ds = ValidationDataset(torch.FloatTensor(X_test))
val_dl = DataLoader(
    val_ds,
    batch_size=500,
    shuffle=False,
    drop_last=False,
    num_workers=2,
)

In [20]:
class Model(nn.Module):
    """Fully connected network for binary classification.

    Takes rows of 10 features and returns one raw score per row (no sigmoid is
    applied here). Layer widths are 10 -> 32 -> 32 -> 16 -> 1.
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must match the saved weights.
        self.layer_1 = nn.Linear(10, 32)
        self.layer_2 = nn.Linear(32, 32)
        self.layer_3 = nn.Linear(32, 16)
        self.layer_out = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.4)
        self.batchnorm1 = nn.BatchNorm1d(32)
        self.batchnorm2 = nn.BatchNorm1d(16)

    def forward(self, x):
        """Apply Linear -> ReLU -> BatchNorm (-> Dropout) three times, then the output layer."""
        hidden = self.dropout(self.batchnorm1(self.relu(self.layer_1(x))))
        # NOTE(review): batchnorm1 is applied again here, so both 32-wide stages share
        # one BatchNorm module. Giving the second stage its own module would change the
        # state_dict layout and invalidate existing checkpoints.
        hidden = self.dropout(self.batchnorm1(self.relu(self.layer_2(hidden))))
        hidden = self.batchnorm2(self.relu(self.layer_3(hidden)))
        return self.layer_out(hidden)

In [22]:
# Build the network and restore the trained parameters.
# map_location remaps the checkpoint's tensors onto `device`, so weights saved on a GPU
# still load when the notebook runs on a CPU-only machine.
model = Model().to(device)
weights_file = os.path.join(MODEL_WEIGHT_PATH, 'model_weights.pth')
model.load_state_dict(torch.load(weights_file, map_location=device))
# Evaluation mode: dropout is disabled and batch norm uses its running statistics
model.eval()

# Take the submission ids from the test frame instead of a hard-coded start value.
# The DataLoader is not shuffled, so predictions arrive in the same row order.
test_ids = df_test.index.tolist()
assert len(test_ids) == len(val_dl.dataset), "test ids and test features are out of sync"

with open('submission.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["id", "failure"])

    offset = 0
    # Prediction needs no gradients; skipping autograd saves memory and time
    with torch.no_grad():
        for feature in val_dl:
            feature = feature.to(device)

            # The model returns raw scores of shape (batch, 1); turn them into probabilities
            y_pred = torch.sigmoid(model(feature)).flatten().tolist()

            batch_ids = test_ids[offset:offset + len(y_pred)]
            csv_writer.writerows(zip(batch_ids, y_pred))
            offset += len(y_pred)