In [70]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [71]:
PATH = 'data/'
device = 'cpu'
train_df = pd.read_csv(PATH+'train.csv')
train_df.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [72]:
# Add new features 'miss_m3' and 'miss_m3',
# which represent if the value is nan or not
train_df['miss_m3'] = train_df['measurement_3'].isna()
train_df['miss_m5'] = train_df['measurement_5'].isna()

In [73]:
# Encode att 0 and 1 into indices
mat_list = ['material_5', 'material_6', 'material_7', 'material_8']
train_df['attribute_0'] = [mat_list.index(item) for item in train_df['attribute_0']]
train_df['attribute_1'] = [mat_list.index(item) for item in train_df['attribute_1']]

In [74]:
# Inpute missing values with mean
missing_cols = train_df.columns[train_df.isna().sum(0) > 0]
imputer = SimpleImputer(strategy='mean')
train_df[missing_cols] = imputer.fit_transform(train_df[missing_cols])

In [75]:
# Create another feature 'area',
# representing att 2 and 3 with one feature 
train_df['area'] = train_df['attribute_2'] * train_df['attribute_3']

In [76]:
# Make measure 3 to 16 into two features: sum and variance
measure_list = list(train_df.iloc[:,10:24].columns)
print(measure_list)
train_df['measurement_sum'] = train_df[measure_list].sum(axis=1)
train_df['measurement_std'] = train_df[measure_list].std(axis=1)

['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16']


In [77]:
# Select 9 important features and failure label
target_list = ['loading', 'attribute_0', 'attribute_1', 'area', 'measurement_17', 'measurement_sum', 'measurement_std', 'miss_m3', 'miss_m5']
X_train = train_df[target_list].values
y_train = train_df['failure'].values

In [78]:
# Standardize the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

# Use SMOTE to balance the data
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [79]:
# Define data class used to make dataset
class TrainData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [80]:
# Load training data, 500 per batch
train_ds = TrainData(torch.FloatTensor(X_train), torch.LongTensor(y_train))
train_dl = DataLoader(train_ds, batch_size=500, drop_last=True, shuffle=True)

In [81]:
# Build model
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(9, 32), # Input features: 9
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(p=0.25),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(p=0.25),
            nn.Linear(32, 2), # Output 2 classes
            nn.Softmax(dim=1) # so that we can use softmax
        )
        
    def forward(self, x):
        return self.layers(x)

In [82]:
# Train Model
model = Model().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(60):
    epoch_loss = 0
    model.train()
    for data, label in train_dl:
        data = data.to(device)
        label = label.to(device)
        
        pred = model(data)
        loss = loss_fn(pred, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    if epoch % 10 == 0:
        print(f"Epoch {epoch}:{epoch_loss/len(train_dl): .4f}")

Epoch 0: 0.7024
Epoch 10: 0.6862
Epoch 20: 0.6831
Epoch 30: 0.6816
Epoch 40: 0.6814
Epoch 50: 0.6806


In [83]:
# Save model
torch.save(model.state_dict(), "model")