In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn import preprocessing

In [2]:
# Input files, both read from the current working directory.
TEST_CSV = "./test.csv"
SAMPLE_SUBMISSION_CSV = "./sample_submission.csv"

# `test` holds the rows to score; `sample_submission` supplies the id column
# used when the submission file is written.
test = pd.read_csv(TEST_CSV)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [3]:
# Print a summary of the test frame: row count, column names, non-null counts
# and dtypes. Used here to see which columns contain missing values and which
# are still of object dtype.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20775 entries, 0 to 20774
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              20775 non-null  int64  
 1   product_code    20775 non-null  object 
 2   loading         20552 non-null  float64
 3   attribute_0     20775 non-null  object 
 4   attribute_1     20775 non-null  object 
 5   attribute_2     20775 non-null  int64  
 6   attribute_3     20775 non-null  int64  
 7   measurement_0   20775 non-null  int64  
 8   measurement_1   20775 non-null  int64  
 9   measurement_2   20775 non-null  int64  
 10  measurement_3   20446 non-null  float64
 11  measurement_4   20366 non-null  float64
 12  measurement_5   20267 non-null  float64
 13  measurement_6   20151 non-null  float64
 14  measurement_7   20055 non-null  float64
 15  measurement_8   19929 non-null  float64
 16  measurement_9   19871 non-null  float64
 17  measurement_10  19708 non-null 

In [4]:
# Columns that are not passed to the models and are removed before prediction.
not_important_list = [
    'id',
    'product_code'
]

# Drop them in a single call: this copies the frame once instead of once per
# column, and if any listed column is missing the KeyError is raised before
# `test` has been partially modified.
test = test.drop(columns=not_important_list)

In [5]:
# Encode the non-numeric columns as integer codes.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

# attribute_1: keep only the last character of each value, then encode it with
# an encoder fitted on the fixed label set '5'..'8' so the mapping does not
# depend on which values happen to occur in this file.
attribute_1_suffix = test['attribute_1'].astype(str).map(lambda text: str(text[-1:]))
le = preprocessing.LabelEncoder().fit(['8', '7', '6', '5'])
test['attribute_1'] = le.transform(attribute_1_suffix.to_list())

# Every column that is still of object dtype gets its own encoder, fitted on
# the values present in that column of the test frame.
for column in test.columns[test.dtypes == 'object']:
  le = preprocessing.LabelEncoder()
  test[column] = le.fit_transform(test[column])

In [6]:
# Impute each missing value with the mean of its own column (computed on the
# test frame itself), then print the per-column count of remaining NaNs.
column_means = test.mean()
test = test.fillna(column_means)
remaining_na = test.isna().sum()
print(remaining_na)

loading           0
attribute_0       0
attribute_1       0
attribute_2       0
attribute_3       0
measurement_0     0
measurement_1     0
measurement_2     0
measurement_3     0
measurement_4     0
measurement_5     0
measurement_6     0
measurement_7     0
measurement_8     0
measurement_9     0
measurement_10    0
measurement_11    0
measurement_12    0
measurement_13    0
measurement_14    0
measurement_15    0
measurement_16    0
measurement_17    0
dtype: int64


In [7]:
# Load the trained models.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load 'weight.joblib' if it comes from a trusted source.
models = joblib.load('weight.joblib')

# Fail fast on an empty collection: the averaging step that builds the
# submission divides by len(models), which would otherwise turn every
# prediction into NaN without raising.
if len(models) == 0:
  raise ValueError("'weight.joblib' contains no models to predict with")

# Accumulate column 1 of each model's predict_proba output.
# test_preds holds the running SUM over models, not yet the mean.
test_preds = np.zeros(len(test))
for model in models:
  test_preds += model.predict_proba(test)[:, 1]

In [8]:
# Turn the summed probabilities into a per-row average over the models, pair
# them with the ids from the sample submission, and write the result as CSV
# without the index column.
mean_failure = test_preds / len(models)
sub = pd.DataFrame({'id': sample_submission.id, 'failure': mean_failure})
sub.to_csv("submission.csv", index=False)