In [None]:
!pip install category-encoders
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Imports
import joblib
import time
import numpy as np
import pandas as pd
import gc; gc.enable()

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Speed up some scikit-learn algorithms
#from sklearnex import patch_sklearn
#patch_sklearn()
import sklearn

# Scoring and Cross-Validation
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.utils.extmath import softmax

# Imputation and Preprocessing
from sklearn.preprocessing import RobustScaler
from category_encoders import WOEEncoder
from sklearn.impute import KNNImputer

# Pipeline Constructors
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer

# Models
from sklearn.linear_model import LogisticRegression, HuberRegressor
from xgboost import XGBClassifier

# Optuna
import optuna
from optuna.visualization import plot_param_importances, plot_parallel_coordinate
from optuna.pruners import PercentilePruner

In [None]:
# Load the submission template and the train/test tables (indexed by the 'id' column).
submission = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv', index_col = 'id') 
test = pd.read_csv('test.csv', index_col = 'id')

# Keep a separate copy of the target column.
# NOTE: this does not remove 'failure' from `train`; the column is still present there.
target = train['failure'].copy()
gc.collect()

# Last expression of the cell: preview the first rows of the training table.
train.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [None]:
def preprocessing(df_train, df_test):
    """Engineer features and impute missing measurements on train+test jointly.

    The two frames are concatenated so that missing-value indicators, the
    ``area`` feature, the regression-based imputation and the KNN imputation
    are all computed over the combined rows. Only the processed test rows are
    returned.

    Steps:
      1. Add ``m3_missing`` / ``m5_missing`` indicator columns and ``area``
         (``attribute_2 * attribute_3``).
      2. For ``measurement_17`` (hard-coded predictor lists per product code)
         and for the 10 measurement columns (among ``measurement_3`` ..
         ``measurement_16``) with the highest summed top-3 absolute
         correlation, fit a ``HuberRegressor`` per product code on complete
         rows and predict the missing values where all predictors are present.
      3. Fill every remaining gap in ``loading`` / ``measurement_*`` with a
         per-product-code ``KNNImputer(n_neighbors=3)``.
      4. Add ``measurement_avg``, the row mean of ``measurement_3`` ..
         ``measurement_16``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain ``product_code``, ``attribute_2``,
        ``attribute_3``, ``loading`` and the ``measurement_*`` columns.
    df_test : pandas.DataFrame
        Test rows with the same feature columns. Its column list decides which
        non-measurement columns are excluded from the correlation analysis.

    Returns
    -------
    pandas.DataFrame
        The rows of the combined, processed frame that follow the first
        ``len(df_train)`` rows, i.e. the processed test rows (including any
        columns that only existed in ``df_train``).

    Raises
    ------
    KeyError
        If a product code in the data has no entry in the hard-coded
        ``measurement_17`` predictor table.
    """
    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']  # product code?

    # Predictor columns used to regress measurement_17, keyed by product code.
    full_fill_dict = {}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_5','measurement_6','measurement_8'],
        'B': ['measurement_4','measurement_5','measurement_7'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8']
    }

    # Columns left out of the correlation analysis: every non-measurement
    # column of the test frame plus the engineered missing-value flags.
    col = [col for col in df_test.columns if 'measurement' not in col] + ['loading','m3_missing','m5_missing']

    # Rank measurement_3..16 by the sum of their three strongest absolute
    # correlations (position 0 after sorting is the column's correlation with
    # itself, hence the 1:4 slice). The correlation matrix does not depend on
    # the loop variable, so it is computed once.
    global_corr = np.absolute(data.drop(col, axis=1).corr())
    a = []
    b = []
    for x in range(3, 17):
        corr = global_corr[f'measurement_{x}'].sort_values(ascending=False)
        a.append(np.round(np.sum(corr.iloc[1:4]), 3))
        b.append(f'measurement_{x}')
    c = pd.DataFrame()
    c['Selected columns'] = b
    c['correlation total'] = a
    c = c.sort_values(by='correlation total', ascending=False).reset_index(drop=True)

    # One absolute-correlation matrix per product code; `data` is not modified
    # until the fill loop below, so each matrix is valid for all ten columns.
    product_codes = data.product_code.unique()
    corr_by_code = {
        code: np.absolute(data[data.product_code == code].drop(col, axis=1).corr())
        for code in product_codes
    }

    # For the 10 best-ranked columns, use the four most correlated other
    # columns (per product code) as regression predictors.
    for i in range(10):
        measurement_col = c.iloc[i, 0]
        fill_dict = {}
        for code in product_codes:
            corr = corr_by_code[code][measurement_col].sort_values(ascending=False)
            fill_dict[code] = corr.iloc[1:5].index.tolist()
        full_fill_dict[measurement_col] = fill_dict

    feature = [f for f in data.columns if f.startswith('measurement') or f=='loading']

    for code in product_codes:  # A to I
        in_code = data.product_code == code
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]
            # Re-slice on every pass: values filled for earlier columns can
            # serve as predictors for later ones.
            tmp = data[in_code]
            tmp_train = tmp[column + [measurement_col]].dropna(how='any')
            # Rows of this product whose target is missing but whose
            # predictors are all present.
            fill_mask = in_code & (data[column].isnull().sum(axis=1) == 0) & data[measurement_col].isnull()

            # Nothing to learn from or nothing to fill: skip the regression
            # (fitting/predicting on zero rows would raise) and let the KNN
            # imputer below handle whatever remains.
            if tmp_train.empty or not fill_mask.any():
                continue

            model = HuberRegressor(epsilon=1.9)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[fill_mask, measurement_col] = model.predict(data.loc[fill_mask, column])

        # others NA columns:
        data.loc[in_code, feature] = KNNImputer(n_neighbors=3).fit_transform(data.loc[in_code, feature])

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    # The train rows come first in the concatenation; return only what follows.
    n_train = df_train.shape[0]
    return data.iloc[n_train:, :]

In [None]:
# Run the joint train+test preprocessing; only the processed test rows come back.
# NOTE: `test` is rebound here, so re-running this cell without reloading the data
# would feed the already-processed frame back into preprocessing().
test = preprocessing(train, test)

# Columns passed to the saved models at prediction time.
features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'area', 'm3_missing', 'm5_missing', 'measurement_avg']

In [None]:
def split(X: pd.DataFrame):
    """Yield leave-two-groups-out folds over the ``product_code`` groups of ``X``.

    For every unordered pair of product codes, yields a two-element list
    ``[train_positions, holdout_positions]`` of positional row indices:
    the held-out array contains the rows of the two chosen product codes,
    the training array contains the rows of all remaining codes.
    """
    groups = list(X.groupby("product_code").indices.values())
    n_groups = len(groups)
    for first in range(n_groups):
        for second in range(first + 1, n_groups):
            held_out = (first, second)
            kept = [rows for pos, rows in enumerate(groups) if pos not in held_out]
            # first < second, so this matches the original group order.
            left_out = [groups[first], groups[second]]
            yield [np.concatenate(kept), np.concatenate(left_out)]

In [None]:
# Materialise every fold produced by split() on the training frame; below, only
# the number of folds is used, to know how many saved models to load.
SPLITS = list(split(train))

In [None]:
# Ensemble inference: average the positive-class probability over the saved
# models, one model file per entry in SPLITS.
n_models = len(SPLITS)
test_preds = np.zeros((test.shape[0],))
for i in range(n_models):
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load(f'model{i}.pkl')

    # Each model's probabilities are scaled by 1/n_models before being added,
    # so the running sum ends up as the mean across models.
    preds = model.predict_proba(test[features])[:, 1] / n_models
    print(preds)
    test_preds += preds

# Write the averaged probabilities into the submission template and save it.
submission['failure'] = test_preds
submission.to_csv('submission.csv', index=False)

[0.01989427 0.01730632 0.01914867 ... 0.01312845 0.02333925 0.015487  ]
[0.01620215 0.0125654  0.0145942  ... 0.01221921 0.02380757 0.01742187]
[0.01584959 0.01349301 0.01453796 ... 0.01287045 0.0226535  0.01618309]
[0.01740517 0.01466124 0.01618279 ... 0.01294047 0.0231747  0.01569443]
[0.02071211 0.0179468  0.0194577  ... 0.01267411 0.0226859  0.01593197]
[0.02044994 0.01876466 0.0194664  ... 0.01312878 0.02212566 0.0152049 ]
[0.01813402 0.01618072 0.0175227  ... 0.01370725 0.02391967 0.01554568]
[0.01683351 0.01422654 0.01508402 ... 0.0122771  0.02208211 0.01658281]
[0.01550329 0.01271956 0.01414613 ... 0.01277388 0.02372271 0.01686772]
[0.01608427 0.01410439 0.01480852 ... 0.01291236 0.02243724 0.01578448]
