In [52]:
# Import Libraries 
import os
import pandas as pd
import numpy as np
from IPython.display import clear_output

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [53]:
# read in data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [54]:
train.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [55]:
# split into X and y
y = train['failure']
train.drop(['failure'], axis=1, inplace=True)

labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [56]:
# drop columns that are not needed
train.drop(['id'], axis=1, inplace=True)
Id = test['id']
test.drop(['id'], axis=1, inplace=True)

In [57]:
train.nunique()

product_code          5
loading           11950
attribute_0           2
attribute_1           3
attribute_2           4
attribute_3           4
measurement_0        29
measurement_1        30
measurement_2        25
measurement_3      4721
measurement_4      4692
measurement_5      4671
measurement_6      4704
measurement_7      4734
measurement_8      4713
measurement_9      4708
measurement_10     6177
measurement_11     6526
measurement_12     6392
measurement_13     5271
measurement_14     6389
measurement_15     6577
measurement_16     7035
measurement_17    23612
dtype: int64

In [58]:
# categorical features
categorical_features = ['product_code', 'attribute_0', 'attribute_1','attribute_2', 'attribute_3'] 

# numerical features
numerical_features = [col for col in train.columns if col not in categorical_features]

In [59]:
# Converting string columns to numeric

labelencoder = LabelEncoder()
for feature in categorical_features:
    train[feature] = train[feature].astype(str)
    test[feature] = test[feature].astype(str)

    labelencoder.fit(train[feature].append(test[feature]))

    train[feature] = labelencoder.transform(train[feature])
    test[feature] = labelencoder.transform(test[feature])

    train[feature].fillna(train[feature].mean(), inplace = True)
    test[feature].fillna(train[feature].mean(), inplace = True)

# impute missing values
for feature in numerical_features:
    train[feature].fillna(train[feature].mean(), inplace = True)
    test[feature].fillna(test[feature].mean(), inplace = True)

In [60]:
# pipeline for categorical variables
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [61]:
# Use the full pipeline to transform both training and test sets
preprocessor.fit(train)
train = preprocessor.transform(train)
test = preprocessor.transform(test)

In [62]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
train = sc.fit_transform(train)
test = sc.transform(test)

In [63]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [64]:
# Define models
models = [
    LGBMClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    xgb.XGBClassifier(),
    lgb.LGBMClassifier(),
    CatBoostClassifier()
]

In [65]:
# Evaluate models
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model: %s" % model.__class__.__name__)
    print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))
    print("Precision: %.2f" % precision_score(y_test, y_pred))
    print("Recall: %.2f" % recall_score(y_test, y_pred))
    print("F1: %.2f" % f1_score(y_test, y_pred))
    print("\n")


Model: LGBMClassifier
Accuracy: 0.80
Precision: 0.21
Recall: 0.00
F1: 0.01


Model: RandomForestClassifier
Accuracy: 0.80
Precision: 0.50
Recall: 0.00
F1: 0.00


Model: GradientBoostingClassifier
Accuracy: 0.80
Precision: 0.14
Recall: 0.00
F1: 0.00


Model: DecisionTreeClassifier
Accuracy: 0.67
Precision: 0.21
Recall: 0.25
F1: 0.23


Model: XGBClassifier
Accuracy: 0.79
Precision: 0.23
Recall: 0.03
F1: 0.05


Model: LGBMClassifier
Accuracy: 0.80
Precision: 0.21
Recall: 0.00
F1: 0.01


Learning rate set to 0.037999
0:	learn: 0.6793635	total: 20.7ms	remaining: 20.7s
1:	learn: 0.6666924	total: 36.4ms	remaining: 18.2s
2:	learn: 0.6543001	total: 55.7ms	remaining: 18.5s
3:	learn: 0.6430967	total: 72.5ms	remaining: 18s
4:	learn: 0.6327293	total: 91ms	remaining: 18.1s
5:	learn: 0.6236526	total: 113ms	remaining: 18.7s
6:	learn: 0.6149285	total: 137ms	remaining: 19.4s
7:	learn: 0.6072524	total: 157ms	remaining: 19.5s
8:	learn: 0.6002291	total: 173ms	remaining: 19.1s
9:	learn: 0.5933066	total: 191

In [66]:
# Final model
final_model = models[0]

In [69]:
# make final predictions
final_model.fit(train, y)
final_predictions = final_model.predict(test)

# convert predictions to boolean
# final_predictions = np.array(final_predictions, dtype = bool)

In [70]:
# save results to file
results = pd.DataFrame({'id': Id, 'failure': final_predictions})

filename = "submission.csv"

if os.path.exists(filename):
  os.remove(filename)
results.to_csv(filename, index=False,header=True, mode='w')

In [72]:
results.failure.value_counts()

0    20732
1       43
Name: failure, dtype: int64