In [39]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch.utils.data import Dataset, DataLoader

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Root folder that holds the train / test / sample-submission CSV files.
# This relative path is the one used for a local run.
INPUT_DIR = './input'
# Alternative root for running inside the Kaggle environment:
#INPUT_DIR = '/kaggle/input'
# Full paths of the individual CSV files, all derived from INPUT_DIR.
TRAIN_PATH = INPUT_DIR + '/train.csv'
TEST_PATH = INPUT_DIR + '/test.csv'
SUBMISSION_PATH = INPUT_DIR + '/sample_submission.csv'

import os
# Print the full path of every file found anywhere below INPUT_DIR,
# one path per line, in the order os.walk visits them.
for file_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(INPUT_DIR)
    for name in names
):
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./input\sample_submission.csv
./input\test.csv
./input\train.csv


### Data Pre-Processing

In [45]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

def dfToNumpy(csv_file):
    """Load a CSV file and convert it into a 2-D numeric feature array.

    Steps, in order:
      1. Drop the ``id`` and ``product_code`` columns, and the ``failure``
         column when it is present, so that a CSV without labels can be
         loaded with the same function.
      2. Replace ``attribute_2`` and ``attribute_3`` by their product, stored
         in a new ``area`` column that ends up as the last column.
      3. Encode the ``material_N`` strings of ``attribute_0`` and
         ``attribute_1`` as the integer ``N`` (N in 5..8). Any other value
         becomes NaN.

    The remaining column names are printed before conversion.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        ``numpy.ndarray`` with one row per CSV row and one column per
        remaining feature. No imputation is done here, so missing values
        stay NaN.

    Raises:
        KeyError: If ``id``, ``product_code`` or any of ``attribute_0`` ..
            ``attribute_3`` is absent from the CSV.
    """
    # Single source of truth for the string -> integer material encoding.
    material_codes = {'material_5': 5, 'material_6': 6, 'material_7': 7, 'material_8': 8}

    df = pd.read_csv(csv_file)

    # Identifier columns are always required and always removed.
    df = df.drop(columns=['id', 'product_code'])
    # The label column only exists in labelled data; ignore it when absent.
    df = df.drop(columns=['failure'], errors='ignore')

    # Collapse the two attributes into a single feature.
    df['area'] = df['attribute_2'] * df['attribute_3']
    df = df.drop(columns=['attribute_2', 'attribute_3'])

    for column in ('attribute_1', 'attribute_0'):
        df[column] = df[column].map(material_codes)

    print("Data Frame has columns: ", df.columns.to_list())
    return df.to_numpy()
            

def getLabel(csv_file):
    """Read the ``failure`` column of a CSV file as a 1-D NumPy array.

    The shape of the resulting array is printed before it is returned.

    Args:
        csv_file: Path of a CSV file that contains a ``failure`` column.

    Returns:
        ``numpy.ndarray`` holding the ``failure`` value of every row.
    """
    failure_column = pd.read_csv(csv_file)['failure']
    label = failure_column.to_numpy()
    print("label shape is: ", label.shape)
    return label

def handleMissingValue(features):
    """Fill the missing entries of ``features`` with a 5-neighbour KNN imputer.

    Args:
        features: Feature matrix handed directly to
            ``KNNImputer.fit_transform``.

    Returns:
        Tuple ``(imputer, new_features)``: the fitted ``KNNImputer`` and the
        array it produced for ``features``.
    """
    knn_imputer = KNNImputer(n_neighbors=5)
    filled_features = knn_imputer.fit_transform(features)
    return knn_imputer, filled_features

def Normalize(features):
    """Standardize ``features`` column by column with a ``StandardScaler``.

    The scaler is fitted on ``features`` itself. The per-column mean and the
    square root of the per-column variance it learned are printed, then the
    same data is transformed.

    Args:
        features: Feature matrix handed to ``StandardScaler``.

    Returns:
        Tuple ``(scaler, new_features)``: the fitted ``StandardScaler`` and
        the transformed features.
    """
    scaler = StandardScaler().fit(features)
    column_std = np.sqrt(scaler.var_)

    print("Get mean of each column: ", scaler.mean_)
    print("Get std of each column: ", column_std)

    return scaler, scaler.transform(features)


#dfToNumpy(TRAIN_PATH)

In [46]:
# Build the raw feature matrix and the label vector from the training CSV.
features = dfToNumpy(TRAIN_PATH)
labels = getLabel(TRAIN_PATH)

# Fill missing values; the fitted imputer is kept in `imputer`.
imputer, features = handleMissingValue(features)

# Standardize every column; `scaler` holds the fitted per-column mean and
# variance, `features` the normalized data.
scaler, features = Normalize(features)

# NOTE(review): the imputer and the scaler above are fitted on all rows,
# i.e. before the validation rows are split off below.

# Hold out 10% of the rows for validation. The fixed random_state makes the
# split identical on every run, so results survive a kernel restart.
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.1, random_state=42)

#print("New Features: ", features)

Data Frame has columns:  ['loading', 'attribute_0', 'attribute_1', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17', 'area']
label shape is:  (26570,)
Get mean of each column:  [127.82518954   6.60481746   6.42785096   7.41588257   8.23251788
   6.25656756  17.79081431  11.73289076  17.12790676  17.51052087
  11.71878737  19.02333165  11.4294161   16.12174455  19.17448997
  11.70316005  15.64894519  16.04278097  14.99821438  16.4589192
 701.45757606  47.74761009]
Get std of each column:  [ 38.88037008   0.79636414   1.3574283    4.11661265   4.19932206
   3.30904702   0.99544177   0.98791527   0.98595938   0.9838503
   0.98642304   0.99251951   0.98049454   1.37971252   1.48789755
   1.45336496   1.12430125   1.45015767   1.50

### PCA for Dimensionality Reduction

In [49]:
from sklearn.decomposition import PCA

# A fractional n_components asks scikit-learn's PCA to keep as many leading
# components as are needed to explain that share (here 80%) of the variance.
pca = PCA(n_components=0.8)
# Fit the PCA on the training split and overwrite x_train with its projection.
# NOTE(review): because x_train is overwritten, running this cell a second
# time applies PCA to the already-reduced data.
# NOTE(review): x_val is not projected here; confirm it is transformed with
# this same `pca` before being used with a model trained on x_train.
x_train = pca.fit_transform(x_train)

# NOTE(review): the saved output reports 26570 rows, which equals the full
# label count, although x_train comes from a split with test_size=0.1; the
# output looks like it was produced from a different kernel state.
print("After reduction, the dimension of features become: ", x_train.shape)


After reduction, the dimension of features become:  (26570, 15)


### Train a Model for classification

#### Method 1: SVM

In [None]:
from scipy import stats
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import shuffle

# Shuffle rows and labels together so that they stay aligned. shuffle()
# returns one array per input, so two inputs give exactly two outputs.
# The shuffled copies get their own names; `features` / `labels` are untouched.
x_search, y_search = shuffle(features, labels, random_state=42)

# NOTE(review): the search runs on `features` / `labels`, i.e. on all rows
# (including those held out as x_val) and without the PCA projection applied
# to x_train. Confirm whether x_train / y_train were intended instead.

# scipy's uniform(loc, scale) samples from [loc, loc + scale]:
# C is drawn from [1, 11] and gamma from [0.01, 1.01].
param = {"C": stats.uniform(1, 10),
         "gamma": stats.uniform(0.01, 1)}

svc = SVC()

# 'roc_auc' is scikit-learn's scorer name for the area under the ROC curve
# ('auc' is not a registered scorer and is rejected). random_state fixes the
# sampled parameter candidates so the search is reproducible.
clf = RandomizedSearchCV(svc, param_distributions=param, n_iter=20, cv=5,
                         scoring='roc_auc', random_state=42)
model = clf.fit(x_search, y_search)
print("best parameters: ", model.best_params_)
print("Search Result")
print(model.cv_results_)

### Validation