In [None]:
!pip3 install numpy pandas
!pip3 install scikit-learn
!pip3 install feature_engine

In [None]:
import pandas as pd
import numpy as np
import time
import csv
import math

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, HuberRegressor
from feature_engine.encoding import WoEEncoder

In [None]:
# Directory (relative to the notebook) that holds the competition's
# train.csv and test.csv files.
DATA_PATH = "./tabular-playground-series-aug-2022"
# Random seed passed as `random_state` to the splitter and model in later cells.
seed = 69

In [None]:
# This function is mainly from https://www.kaggle.com/code/nourhadrich/tps-aug-neural-network
def preprocessing(df_train, df_test):
    """Impute missing measurements, engineer features and pick model inputs.

    The train and test frames are stacked so that imputation sees both, then
    split back at the original train length before encoding and selection.

    Steps, in order:
      1. Add `m3_missing` / `m5_missing` flags and `area`
         (`attribute_2 * attribute_3`).
      2. For `measurement_17` (hand-picked predictors) and the 10
         `measurement_3..16` columns with the highest summed top-3 absolute
         correlation, fit one `HuberRegressor` per product code on the
         predictor columns and fill the missing values it can predict.
      3. Fill every remaining gap in `loading` / `measurement_*` with a
         per-product-code `KNNImputer(n_neighbors=3)`.
      4. Add row-wise avg/std/median/max/min/skew over `measurement_3..16`.
      5. WoE-encode `attribute_0` (fitted on the train rows only).
      6. Select 5 columns with `SelectKBest` and append a fixed feature list.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain `failure`, `product_code`, `loading`,
        `attribute_0`, `attribute_2`, `attribute_3` and `measurement_*`.
    df_test : pandas.DataFrame
        Test rows with the same feature columns.

    Returns
    -------
    tuple
        `(df_train, df_test, features)`: the processed train frame, the
        processed test frame, and a NumPy array of selected feature names.
        The feature names are also printed.
    """
    n_train = df_train.shape[0]
    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # Dictionary of dictionaries: target measurement -> product code -> the
    # predictor columns used to impute it. Only 'measurement_17' gets a
    # manual selection; the others are filled in from correlations below.
    full_fill_dict = {
        'measurement_17': {
            'A': ['measurement_5', 'measurement_6', 'measurement_8'],
            'B': ['measurement_4', 'measurement_5', 'measurement_7'],
            'C': ['measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
            'D': ['measurement_5', 'measurement_6', 'measurement_7', 'measurement_8'],
            'E': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_8'],
            'F': ['measurement_4', 'measurement_5', 'measurement_6', 'measurement_7'],
            'G': ['measurement_4', 'measurement_6', 'measurement_8', 'measurement_9'],
            'H': ['measurement_4', 'measurement_5', 'measurement_7', 'measurement_8', 'measurement_9'],
            'I': ['measurement_3', 'measurement_7', 'measurement_8'],
        }
    }

    # Columns excluded from the correlation matrices.
    # NOTE(review): 'area' (and 'failure', when df_test does not carry it) are
    # not in this list, so they take part in the ranking and can be picked as
    # predictors - confirm that is intended.
    excluded = [name for name in df_test.columns if 'measurement' not in name] + \
        ['loading', 'm3_missing', 'm5_missing']
    product_codes = data.product_code.unique()

    # The frame is not modified while the rankings are built, so each
    # correlation matrix is computed once instead of once per target column.
    overall_corr = data.drop(excluded, axis=1).corr().abs()
    corr_by_code = {
        code: data[data.product_code == code].drop(excluded, axis=1).corr().abs()
        for code in product_codes
    }

    # Rank measurement_3..16 by the sum of their three strongest correlations
    # (position 0 after sorting is the column itself, hence the 1:4 slice).
    totals = []
    for x in range(3, 17):
        name = f'measurement_{x}'
        ranked = overall_corr[name].sort_values(ascending=False)
        totals.append((name, np.round(np.sum(ranked.iloc[1:4]), 3)))
    c = pd.DataFrame(totals, columns=['Selected columns', 'correlation total'])
    c = c.sort_values(by='correlation total',
                      ascending=False).reset_index(drop=True)

    # For the 10 best-ranked columns, take the four most correlated other
    # columns within each product code as imputation predictors.
    for i in range(10):
        measurement_col = c.iloc[i, 0]
        full_fill_dict[measurement_col] = {
            code: corr_by_code[code][measurement_col]
            .sort_values(ascending=False).iloc[1:5].index.tolist()
            for code in product_codes
        }

    feature = [f for f in data.columns if f.startswith(
        'measurement') or f == 'loading']

    for code in product_codes:
        code_mask = data.product_code == code

        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]

            # Rows we can fill: target missing, every predictor present.
            fill_mask = code_mask \
                & (data[column].isnull().sum(axis=1) == 0) \
                & data[measurement_col].isnull()
            if not fill_mask.any():
                # Nothing to impute here; predicting on an empty frame would
                # make scikit-learn raise a ValueError.
                continue

            tmp_train = data.loc[code_mask, column +
                                 [measurement_col]].dropna(how='any')

            model = HuberRegressor(epsilon=1.9)
            model.fit(tmp_train[column], tmp_train[measurement_col])
            data.loc[fill_mask, measurement_col] = model.predict(
                data.loc[fill_mask, column])

        # Whatever is still missing (rows with absent predictors, columns
        # without a regression model) is filled by nearest neighbours.
        model1 = KNNImputer(n_neighbors=3)
        data.loc[code_mask, feature] = model1.fit_transform(
            data.loc[code_mask, feature])

    # Row-wise summary statistics over measurement_3..16.
    stat_source = data[[f'measurement_{i}' for i in range(3, 17)]]
    data['measurement_avg'] = stat_source.mean(axis=1)
    data['measurement_std'] = stat_source.std(axis=1)
    data['measurement_median'] = stat_source.median(axis=1)
    data['measurement_max'] = stat_source.max(axis=1)
    data['measurement_min'] = stat_source.min(axis=1)
    data['measurement_skew'] = stat_source.skew(axis=1)

    # Undo the concat: the first n_train rows are the training rows.
    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, df_train['failure'])
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)

    # Positional slice 5:25 - presumably attribute_2 .. measurement_17 given
    # the competition's column order; TODO confirm if the schema changes.
    selector = SelectKBest(k=5)
    selector.fit(df_train.iloc[:, 5:25], df_train['failure'])
    features = selector.get_support(indices=True)
    features = [df_train.columns[x+5] for x in features]
    features = np.append(features, ['loading', 'attribute_0', 'area',
                                    'm3_missing', 'm5_missing', 'measurement_avg',
                                    'measurement_std', 'measurement_median', 'measurement_min',
                                    'measurement_skew'])
    print(features)

    return df_train, df_test, features


# Read both competition files, then run them through the shared
# preprocessing step to get model-ready frames and the feature list.
raw_train = pd.read_csv(f'{DATA_PATH}/train.csv')
raw_test = pd.read_csv(f'{DATA_PATH}/test.csv')
df_train, df_test, features = preprocessing(raw_train, raw_test)


In [None]:
def dataPreprocessing(filePath: str, submit: bool):
    """Load one CSV and turn it into scaled feature matrices.

    The columns `product_code` .. `measurement_17` are read, the categorical
    ones are encoded as integers, missing values are filled per product code
    with `KNNImputer(n_neighbors=3)`, and the result is standardised.

    Output column order: area (`attribute_2 * attribute_3`),
    `measurement_0..17`, `attribute_0`, `loading`, measurement_3-missing flag,
    measurement_5-missing flag.

    Parameters
    ----------
    filePath : str
        Path of the CSV file to read.
    submit : bool
        True when the file has a `failure` column: the rows are split 80/20
        and the scaler is fitted on the 80% part. False when it only has
        `id`: every row is returned, scaled with its own statistics.

    Returns
    -------
    tuple
        `(x_train, x_test, y_train, y_test)` when `submit` is True,
        otherwise `(ids, x, None, None)`.
    """
    frame = pd.read_csv(filePath)

    if submit:
        y = frame.failure.values.astype(float)
        ids = None
    else:
        y = None
        ids = frame.loc[:, 'id'].values

    # Positions inside `x`: 0 product_code, 1 loading, 2 attribute_0,
    # 3 attribute_1, 4 attribute_2, 5 attribute_3, 6.. measurement_0..17.
    x = frame.loc[:, 'product_code':'measurement_17'].values

    codes = {}          # encoded product code -> number of rows
    missing_flags = []  # per row: [measurement_3 missing, measurement_5 missing]
    for row in x:       # rows are views, so the writes below update `x`
        row[0] = ord(row[0]) - ord('A')
        codes[row[0]] = codes.get(row[0], 0) + 1

        # Keep only the trailing digit of the attribute labels.
        row[2] = int(row[2][-1])
        row[3] = int(row[3][-1])

        # Record missingness before imputation overwrites it.
        missing_flags.append([
            1 if math.isnan(row[9]) else 0,
            1 if math.isnan(row[11]) else 0,
        ])

    print(codes)

    # Impute each product code separately. Rows are picked with a mask rather
    # than a running `idx:idx+num+1` slice: that slice pulled the first row of
    # the next code into every group and relied on the rows being sorted.
    imputer = KNNImputer(n_neighbors=3)
    for code in codes:
        code_rows = x[:, 0] == code
        x[code_rows] = imputer.fit_transform(x[code_rows])

    x = np.column_stack([
        x[:, 4] * x[:, 5],        # area
        x[:, 6:],                 # measurement_0..17
        x[:, 2],                  # attribute_0
        x[:, 1],                  # loading
        np.array(missing_flags),  # missing flags for measurement_3 / _5
    ]).astype(float)

    scaler = StandardScaler()
    if submit:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=69)
        scaler.fit(x_train)
        return scaler.transform(x_train), scaler.transform(x_test), y_train, y_test

    # NOTE(review): in this mode the scaler is fitted on the very rows it
    # transforms, so the scaling differs from the one learned when
    # submit=True - confirm this is acceptable for the model in use.
    scaler.fit(x)
    return ids, scaler.transform(x), None, None

# x_train, x_valid, y_train, y_valid = dataPreprocessing(f'{DATA_PATH}/train.csv', True)


In [None]:
# 10-fold cross-validated logistic regression.
# Each fold's model is scored on its held-out part and also used to predict
# the test set, so a later cell can blend the best folds.
x_data = df_train[features].values
y_data = df_train['failure'].values
test_matrix = df_test[features].values

pred = []   # one array of test-set probabilities per fold
score = []  # (validation AUC, index into `pred`) per fold

# random_state makes the shuffled folds - and therefore the fold ranking used
# for blending - reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)
for fold, (train_index, test_index) in enumerate(kf.split(x_data)):
    LR = LogisticRegression(max_iter=1000, penalty='l2',
                            solver='newton-cg', random_state=seed)
    x_train, x_valid = x_data[train_index], x_data[test_index]
    y_train, y_valid = y_data[train_index], y_data[test_index]
    LR.fit(x_train, y_train)

    # Probability of the positive class (failure == 1).
    outputs = LR.predict_proba(x_valid)[:, 1]
    fold_auc = roc_auc_score(y_valid, outputs)
    print(f'fold {fold}: AUC = {fold_auc:.5f}')

    pred.append(LR.predict_proba(test_matrix)[:, 1])
    score.append((fold_auc, fold))


In [None]:
# Order the folds by validation AUC, best first. Each entry of `score` is
# (auc, position of that fold's predictions inside `pred`).
score.sort(reverse=True)
print(score)

# Weighted blend of the test predictions from the three best folds.
weight = [0.4, 0.3, 0.3]
best, second, third = (pred[score[rank][1]] for rank in range(3))
test_pred = best * weight[0] + second * weight[1] + third * weight[2]

# Preview only the first 100 blended probabilities.
print(test_pred[0:100])


In [None]:

# This training notebook does not write a prediction CSV; the export code below is intentionally left commented out.
# csvFile = open('submission.csv', 'w', newline='')
# csv_writer = csv.writer(csvFile)
# csv_writer.writerow(["id", "failure"])

# for id, output in zip(df_test['id'].values, test_pred):
#     # for id, output in zip(ids, outputs):
#     csv_writer.writerow([id, output])

# csvFile.close()
