In [77]:
import numpy as np
import pandas as pd 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
import joblib
import math

In [78]:
# Load the raw test set that every later cell operates on.
df_test = pd.read_csv(filepath_or_buffer="dataset/test.csv")

In [79]:
# Column groups by dtype, taken from the test frame as it is at this point.
# Preprocess_data reads Float_feature and Obj_feature as module-level globals.
Float_feature = df_test.select_dtypes(include=float).columns
int_feature = df_test.select_dtypes(include=np.int64).columns
Obj_feature = df_test.select_dtypes(include=object).columns

In [80]:
def Preprocess_data(df: pd.DataFrame, clip: bool = True) -> pd.DataFrame:
    """Return a preprocessed copy of ``df``.

    Steps, in order:
      1. Add boolean ``missing_m3`` / ``missing_m5`` columns flagging rows where
         ``measurement_3`` / ``measurement_5`` are NaN. They are computed
         before imputation, otherwise they would always be False.
      2. Impute the columns listed in the module-level ``Float_feature`` with a
         15-neighbour ``KNNImputer`` fitted on this frame.
      3. Label-encode every column listed in the module-level ``Obj_feature``,
         each with its own ``LabelEncoder`` fitted on this frame.
      4. If ``clip`` is true, raise ``measurement_2`` values below 11 up to 11.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain ``measurement_2``, ``measurement_3``,
        ``measurement_5`` and every column in ``Float_feature`` and
        ``Obj_feature``. It is not modified.
    clip : bool, default True
        Whether to apply the lower bound of 11 to ``measurement_2``.

    Returns
    -------
    pd.DataFrame
        A new frame with the transformations above applied.
    """
    # Work on a copy so the caller's frame is left untouched and calling the
    # function again on the same raw input yields the same result.
    out = df.copy()

    # Missing-value indicators must be taken before the imputer fills the NaNs.
    out['missing_m3'] = out['measurement_3'].isna()
    out['missing_m5'] = out['measurement_5'].isna()

    # NOTE(review): the imputer and the encoders below are fitted on the frame
    # passed in, not on stored training artefacts; confirm this matches how the
    # downstream model was trained.
    knn_imputer = KNNImputer(n_neighbors=15)
    out[Float_feature] = knn_imputer.fit_transform(out[Float_feature])

    for feature in Obj_feature:
        out[feature] = LabelEncoder().fit_transform(out[feature])

    if clip:
        out['measurement_2'] = out['measurement_2'].clip(lower=11)

    return out

In [81]:
# Rebind df_test to its preprocessed form. Re-running this cell without
# reloading the CSV would compute the missing-value flags on already-imputed
# data, so run it exactly once after the load cell.
df_test = Preprocess_data(df_test, clip=True)

In [82]:
#LOAD MODEL
# The fitted estimator is restored from disk. An unfitted
# StandardScaler + L1 LogisticRegression pipeline used to be built here and was
# overwritten by the load on the very next statement, so it has been removed.
# NOTE(review): presumably the persisted model has that same structure;
# confirm against the training notebook.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files from a trusted source.
model = joblib.load('my_model.joblib')

In [83]:
# Per-class probabilities from the loaded model, rounded to 3 decimals.
test_pred = np.round(
    model.predict_proba(df_test),
    decimals=3,
)

In [84]:
# Submission template; its 'failure' column is filled in by a later cell.
df_subm = pd.read_csv(filepath_or_buffer="sample_submission.csv")

In [85]:
# Column 1 of predict_proba's output is written as the submitted score.
# Assignment is positional: assumes the template rows are in the same order
# as the rows of df_test (TODO confirm).
df_subm["failure"] = test_pred[:, 1]

In [86]:
# Write the submission file without the DataFrame index column.
df_subm.to_csv(path_or_buf='109550201.csv', index=False)