In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
import joblib

In [2]:
# Load the training table from train.csv in the current working directory.
train = pd.read_csv(filepath_or_buffer="./train.csv")

In [3]:
# Print the frame's schema: row count plus per-column non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

In [4]:
# Remove the columns this notebook treats as not important for modelling.
not_important_list = [
    'id',
    'product_code'
]

# A single drop call removes every listed column at once instead of rebuilding
# the frame once per column. It is also atomic: if any listed column is
# missing, KeyError is raised and `train` is left untouched rather than
# half-modified.
train = train.drop(columns=not_important_list)

In [5]:
# Convert the object-typed columns into integer codes.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

# attribute_1: reduce every value to its final character, then encode it with
# an encoder fitted on a fixed label set, so the resulting codes do not depend
# on which labels happen to occur in this particular file.
attribute_1_suffix = train['attribute_1'].astype(str).str[-1:]
attribute_1_encoder = preprocessing.LabelEncoder().fit(['8', '7', '6', '5'])
train['attribute_1'] = attribute_1_encoder.transform(attribute_1_suffix.tolist())

# Every column that is still object-typed at this point gets its own encoder,
# fitted on that column's own values.
remaining_object_columns = train.columns[train.dtypes == 'object']
for column_name in remaining_object_columns:
    column_encoder = preprocessing.LabelEncoder().fit(train[column_name])
    train[column_name] = column_encoder.transform(train[column_name].tolist())

In [6]:
# Impute missing values: each NaN is replaced by the mean of its own column.
# NOTE(review): the means are taken over all rows, i.e. before the
# cross-validation split performed later in the notebook.
column_means = train.mean()
train = train.fillna(value=column_means)

# Show the per-column count of remaining missing values.
print(train.isna().sum())

loading           0
attribute_0       0
attribute_1       0
attribute_2       0
attribute_3       0
measurement_0     0
measurement_1     0
measurement_2     0
measurement_3     0
measurement_4     0
measurement_5     0
measurement_6     0
measurement_7     0
measurement_8     0
measurement_9     0
measurement_10    0
measurement_11    0
measurement_12    0
measurement_13    0
measurement_14    0
measurement_15    0
measurement_16    0
measurement_17    0
failure           0
dtype: int64


In [7]:
# Split the frame into the target column and the remaining predictor columns.
train_y = train['failure']
train_x = train.drop(columns='failure')

In [8]:
# 5-fold cross-validation of a logistic regression model; one fitted model per
# fold is collected in `models`.
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

# Shuffled splitter with a fixed seed so the fold assignment is reproducible.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)

models = []
for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(train_x, train_y)):
    # Fit on the rows assigned to the training part of this fold.
    fold_model = LogisticRegression(C=5e-5, max_iter=1000)
    fold_model.fit(train_x.iloc[fit_rows], train_y.iloc[fit_rows])

    # Score the held-out rows with ROC AUC, using the second predict_proba
    # column (the probability of the model's second class label).
    holdout_scores = fold_model.predict_proba(train_x.iloc[holdout_rows])[:, 1]
    holdout_auc = metrics.roc_auc_score(train_y.iloc[holdout_rows], holdout_scores)
    print('fold ', fold_no, ' roc score:', holdout_auc)

    models.append(fold_model)

fold  0  roc score: 0.585410715556414
fold  1  roc score: 0.598728710563126
fold  2  roc score: 0.5803310488850125
fold  3  roc score: 0.6029514541887344
fold  4  roc score: 0.5793542445113544


In [9]:
# Persist the list of per-fold models to weight.joblib (compression level 1).
# The call is the cell's last expression, so the written file list is displayed.
joblib.dump(value=models, filename='weight.joblib', compress=1)

['weight.joblib']