# Importing Libraries and Loading datasets

In [13]:
import os
import random
import numpy as np
import pandas as pd
from joblib import dump, load
import matplotlib.pyplot as plt

!git clone https://github.com/analokmaus/kuma_utils.git
import sys; sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression

fatal: destination path 'kuma_utils' already exists and is not an empty directory.


# Load data

In [14]:
train = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv", index_col='id')
test = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv", index_col='id')
sub = pd.read_csv("../input/tabular-playground-series-aug-2022/sample_submission.csv")

#train.head()
#train.describe()

train_y = train.failure.copy()
train = train.drop('failure', axis=1)
print('Train data shape:', train.shape)
print('Test data shape:', test.shape)
print('Train data failure shape:', train_y.shape)

Train data shape: (26570, 24)
Test data shape: (20775, 24)
Train data failure shape: (26570,)


# Numerical & Categorical Features 

In [15]:
# eliminate non-numerical data(product_code, attribute_0, attribute_1):categorical features
# column number: 24 -> 21
numerical_cols = train.select_dtypes(np.number).columns.values.tolist()
categorical_cols = [x for x in train.columns.values if (x not in numerical_cols)]

# Fill Missing Values

In [16]:
# calculate missing number of dataset
missing_values_train = train.isna().sum().sum()
print('Missing values in train data: {0}'.format(missing_values_train))
missing_values_test = test.isna().sum().sum()
print('Missing values in test data: {0}\n'.format(missing_values_test))

# from kuma_utils
imputer = LGBMImputer(n_iter=100)
imputer.fit(train[numerical_cols].append(test[numerical_cols]))
train[numerical_cols] = imputer.transform(train[numerical_cols])
test[numerical_cols] = imputer.transform(test[numerical_cols])

missing_values_train = train.isna().any().sum()
print('\nMissing values in train data: {0}'.format(missing_values_train))
missing_values_test = test.isna().any().sum()
print('Missing values in test data: {0}'.format(missing_values_train))

Missing values in train data: 20273
Missing values in test data: 15709



  0%|          | 0/16 [00:00<?, ?it/s]



  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]


Missing values in train data: 0
Missing values in test data: 0


# Feature Engineering

Credits to https://www.kaggle.com/code/themikejones/tps-aug-22-votingclassifier/notebook?scriptVersionId=102761965#4.1-Combine-features-and-create-new

In [17]:
def feature_engineering(data, numerical_cols):
    meas_gr1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 18))] 
    meas_gr2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
    
    data['attribute_2*3'] = data['attribute_2'] * data['attribute_3']
    data['meas_gr1_avg'] = np.mean(data[meas_gr1_cols], axis=1)
    data['meas_gr1_std'] = np.std(data[meas_gr1_cols], axis=1)
    data['meas_gr2_avg'] = np.mean(data[meas_gr2_cols], axis=1)
    data['meas17/meas_gr2_avg'] = data['measurement_17'] / data['meas_gr2_avg']

    numerical_cols = numerical_cols + ['attribute_2*3']
    numerical_cols = numerical_cols + ['meas_gr1_avg']
    numerical_cols = numerical_cols + ['meas_gr1_std']
    numerical_cols = numerical_cols + ['meas_gr2_avg']   
    numerical_cols = numerical_cols + ['meas17/meas_gr2_avg']
    
    for column in categorical_cols:
        # from sklearn
        label_encoder = LabelEncoder()
        label_encoder.fit(train[column].append(test[column]))
        train[column] = label_encoder.transform(train[column])
        test[column] = label_encoder.transform(test[column])    
    
    #data['loading'] = np.log1p(data['loading'])
    return numerical_cols

feature_engineering(train, numerical_cols)
numerical_cols = feature_engineering(test, numerical_cols)
train.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,attribute_2*3,meas_gr1_avg,meas_gr1_std,meas_gr2_avg,meas17/meas_gr2_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,80.1,1,3,9,5,7,8,4,18.04,...,15.029,15.400725,13.034,14.684,764.1,45,82.920339,215.417497,16.7335,45.662892
1,0,84.89,1,3,9,5,14,3,3,18.213,...,14.732,15.425,14.395,15.631,682.057,45,75.641636,191.779013,16.56175,41.182665
2,0,82.43,1,3,9,5,12,1,5,18.057,...,16.711,18.631,14.094,17.946,663.376,45,74.66069,186.182903,16.496,40.214355
3,0,101.07,1,3,9,5,13,2,6,17.295,...,15.25,15.562,16.154,17.172,826.282,45,88.737909,233.245686,17.1395,48.209224
4,0,188.06,1,3,9,5,9,2,8,19.346,...,16.182,12.76,13.153,16.412,579.885,45,66.406091,162.402011,15.53375,37.330651


# Feature selection(Mutual information)

In [18]:
'''
def make_mi_scores(mi_scores, X, y):
    mi_scores = pd.Series(mi_scores, name="MI Scores")
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores, X):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = X.columns[scores.index]
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
    return ticks
'''

'\ndef make_mi_scores(mi_scores, X, y):\n    mi_scores = pd.Series(mi_scores, name="MI Scores")\n    mi_scores = mi_scores.sort_values(ascending=False)\n    return mi_scores\n\ndef plot_mi_scores(scores, X):\n    scores = scores.sort_values(ascending=True)\n    width = np.arange(len(scores))\n    ticks = X.columns[scores.index]\n    plt.barh(width, scores)\n    plt.yticks(width, ticks)\n    plt.title("Mutual Information Scores")\n    return ticks\n'

In [19]:
'''
X = train.copy()
y = train_y.copy()

mi_scores = mutual_info_classif(X, y, random_state=1)
mi_scores_classif = make_mi_scores(mi_scores, X, y)

plt.figure(dpi=100, figsize=(12, 8))
columns = plot_mi_scores(mi_scores_classif[mi_scores_classif > 1e-3], X)

columns = columns.tolist()
if 'product_code' in columns:
    columns.remove('product_code')
print(columns)
'''

"\nX = train.copy()\ny = train_y.copy()\n\nmi_scores = mutual_info_classif(X, y, random_state=1)\nmi_scores_classif = make_mi_scores(mi_scores, X, y)\n\nplt.figure(dpi=100, figsize=(12, 8))\ncolumns = plot_mi_scores(mi_scores_classif[mi_scores_classif > 1e-3], X)\n\ncolumns = columns.tolist()\nif 'product_code' in columns:\n    columns.remove('product_code')\nprint(columns)\n"

# Modelling

In [20]:
# features selected from "features selection" above
select_feature = ['attribute_0', 'meas_gr2_avg', 'measurement_1', 'measurement_17', 'measurement_12', 'meas_gr1_std', 'attribute_2', 'attribute_1', 'measurement_2', 'measurement_7', 'measurement_3', 'loading']
select_feature

['attribute_0',
 'meas_gr2_avg',
 'measurement_1',
 'measurement_17',
 'measurement_12',
 'meas_gr1_std',
 'attribute_2',
 'attribute_1',
 'measurement_2',
 'measurement_7',
 'measurement_3',
 'loading']

In [21]:
X = train[select_feature].copy()
y = train_y.copy()
print(X.shape, y.shape)
print(train.shape, test.shape)

(26570, 12) (26570,)
(26570, 29) (20775, 29)


# Select Best Hyper Parameter with Kfold

In [22]:
'''
predictions = np.array(len(test))
acc = 0

# similiar to kflod(5 is the best split between 5 to 15)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print("Fold:", fold_idx+1)
    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # over-increase iterations & C may cause overfitting
    # class_weight='balanced' will decrease performance
    model = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2', solver='newton-cg')
    model.fit(x_train[select_feature], y_train)
    # joblib is better than pickle for sklearn
    
    y_preds = model.predict(x_val[select_feature])
    acc += accuracy_score(y_val, y_preds) / 5 

print(f"\nAverage acc = {round(acc, 5)}")
'''


'\npredictions = np.array(len(test))\nacc = 0\n\n# similiar to kflod(5 is the best split between 5 to 15)\nkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)\nfor fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):\n    print("Fold:", fold_idx+1)\n    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]\n    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]\n    \n    # over-increase iterations & C may cause overfitting\n    # class_weight=\'balanced\' will decrease performance\n    model = LogisticRegression(max_iter=1000, C=0.0001, penalty=\'l2\', solver=\'newton-cg\')\n    model.fit(x_train[select_feature], y_train)\n    # joblib is better than pickle for sklearn\n    \n    y_preds = model.predict(x_val[select_feature])\n    acc += accuracy_score(y_val, y_preds) / 5 \n\nprint(f"\nAverage acc = {round(acc, 5)}")\n'

# Train Model(Logistic Regression) & Save

In [23]:
model = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2', solver='newton-cg')
model.fit(train[select_feature], train_y)
dump(model, 'model.joblib')

['model.joblib']