In [2285]:
import numpy as np
import pandas as pd 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
import joblib

In [2286]:
# Raw training table, read from the project's dataset/ folder.
df_train = pd.read_csv("dataset/train.csv")

In [2287]:
# Column groups by dtype, derived from df_train as it is when this cell runs.
# Preprocess_data reads Float_feature and Obj_feature as module-level globals,
# so run this cell on the raw frame: once the object columns have been
# label-encoded they would no longer be picked up by the `object` selector.
Float_feature = df_train.select_dtypes(include=float).columns
Int_feature = df_train.select_dtypes(include=np.int64).columns
Obj_feature = df_train.select_dtypes(include=object).columns

In [2288]:
def Preprocess_data(df, clip=True):
    """Return a preprocessed copy of ``df``; the caller's frame is not modified.

    Steps, in order:

    1. Add boolean ``missing_m3`` / ``missing_m5`` columns flagging the rows
       where ``measurement_3`` / ``measurement_5`` are NaN on entry.
    2. Fill NaNs in the ``Float_feature`` columns with a 15-neighbour
       ``KNNImputer`` fitted on ``df`` itself.
    3. Replace each ``Obj_feature`` column with integer codes from a fresh
       ``LabelEncoder`` fitted on that column of ``df``.
    4. If ``clip`` is true, floor ``measurement_2`` at 11.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``measurement_2``, ``measurement_3``, ``measurement_5``
        and every column listed in the module-level ``Float_feature`` and
        ``Obj_feature`` indexes.
    clip : bool, default True
        Whether to apply the lower bound of 11 to ``measurement_2``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Notes
    -----
    * The missing-value flags are computed from the input as given; feeding an
      already-imputed frame back in yields all-False flags.
    * The imputer and the encoders are refit on every call, so frames that are
      processed separately are not guaranteed to share the same imputation
      neighbours or label codes.
    """
    # Work on a copy so the raw frame stays available to the caller and
    # re-running a notebook cell does not silently compound transformations.
    out = df.copy()

    # Record missingness before imputation erases it.
    out['missing_m3'] = out['measurement_3'].isna()
    out['missing_m5'] = out['measurement_5'].isna()

    knn_imputer = KNNImputer(n_neighbors=15)
    out[Float_feature] = knn_imputer.fit_transform(out[Float_feature])

    for feature in Obj_feature:
        out[feature] = LabelEncoder().fit_transform(out[feature])

    if clip:
        out['measurement_2'] = out['measurement_2'].clip(lower=11)

    return out

In [2289]:
# Preprocess the training frame (measurement_2 clipping disabled).
# NOTE(review): this rebinds df_train to its own preprocessed version, so
# re-running the cell feeds already-processed data back into Preprocess_data.
df_train = Preprocess_data(df_train, clip=False)

In [None]:
# Split the frame into the target ('failure') and everything else as features.
y_data = df_train['failure']
x_data = df_train.drop(columns=['failure'])

In [None]:
# Hold out 20% of the rows for validation; fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x_data,
    y_data,
    random_state=9,
    test_size=0.2,
)

In [None]:
# Standardise the features, then fit an L1-penalised, class-balanced
# logistic regression on the training split.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        C=0.01,
        penalty='l1',
        solver='liblinear',
        class_weight='balanced',
        random_state=420,
    ),
)
model.fit(x_train, y_train)

# Per-class probabilities for the validation split, rounded to 3 decimals.
y_pred = np.round(model.predict_proba(x_val), decimals=3)


In [None]:
# Column 1 of the rounded predict_proba output for the validation rows.
y_pred[:, 1]

array([0.467, 0.619, 0.636, ..., 0.526, 0.46 , 0.504])

In [None]:
# Validation ROC AUC, scored on the rounded column-1 probabilities.
auc = roc_auc_score(y_true=y_val, y_score=y_pred[:, 1])

In [None]:
# Display the validation ROC AUC.
auc

0.5804776054787916

In [None]:
# Persist the fitted pipeline (scaler + classifier) to the working directory.
joblib.dump(value=model, filename='my_model.joblib')

['my_model.joblib']

In [None]:
# Disabled draft: score the test set and write a submission file.
# NOTE(review) before re-enabling:
#   - Preprocess_data(df_test) uses the default clip=True, whereas the training
#     frame is preprocessed with clip disabled.
#   - Preprocess_data refits its imputer and label encoders on whatever frame it
#     is given, so the test encoding is not tied to the training encoding.
#   - "test.csv" / "sample_submission.csv" are read from the working directory,
#     while the training file is read from "dataset/" -- confirm the paths.
# df_test = pd.read_csv("test.csv")
# df_test = Preprocess_data(df_test)
# test_pred = np.round(model.predict_proba(df_test),3)
# df_subm= pd.read_csv("sample_submission.csv")
# df_subm['failure']=test_pred[:,1]
# df_subm.to_csv('submission.csv',index=False)
