# In [None]:
import os
import pickle
import tqdm
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import sklearn
from sklearn.ensemble import BaggingClassifier
from torch.optim import Adam
import warnings
import joblib

# Silence pandas' PerformanceWarning for the whole process.
# NOTE(review): presumably raised by the wide-frame preprocessing further down -
# confirm before removing this filter.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)


def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    """Read a contiguous range of parquet partitions into one DataFrame.

    Files in ``path_to_dataset`` whose names start with ``'train'`` are sorted
    lexicographically by path, and up to ``num_parts_to_read`` of them are
    loaded, beginning at position ``start_from`` of that sorted list.

    Args:
        path_to_dataset: Directory containing the partition files.
        start_from: Zero-based position of the first partition to read;
            negative values are clamped to 0.
        num_parts_to_read: Maximum number of partitions to read.
        columns: Optional column subset forwarded to ``pd.read_parquet``.
        verbose: When true, print the discovered and the selected paths.

    Returns:
        The selected partitions concatenated row-wise with a fresh
        ``0..n-1`` index.

    Raises:
        ValueError: If the requested range selects no partition file.
    """
    dataset_paths = sorted(
        os.path.join(path_to_dataset, filename)
        for filename in os.listdir(path_to_dataset)
        if filename.startswith('train')
    )

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    if verbose:
        print(dataset_paths)
        print('Reading chunks:\n')
        for chunk in chunks:
            print(chunk)

    if not chunks:
        # pd.concat([]) would raise the same exception type, but with a
        # message that says nothing about which files were expected.
        raise ValueError(
            f"No 'train*' partitions found in {path_to_dataset!r} for "
            f"start_from={start_from}, num_parts_to_read={num_parts_to_read}"
        )

    # tqdm.auto picks the notebook widget or the console bar as appropriate;
    # tqdm.tqdm_notebook is deprecated and only works inside Jupyter.
    from tqdm.auto import tqdm as progress_bar

    res = [
        pd.read_parquet(chunk_path, columns=columns)
        for chunk_path in progress_bar(chunks, desc="Reading dataset with pandas")
    ]
    return pd.concat(res).reset_index(drop=True)


def create_dataset(path, num_parts=12):
    """Load the first ``num_parts`` training partitions into a single DataFrame.

    Each partition is read on its own through
    ``read_parquet_dataset_from_local`` and the pieces are concatenated once
    at the end, instead of re-copying the growing frame after every read.

    Args:
        path: Directory that holds the partition files.
        num_parts: Number of partitions to load (defaults to 12, the value
            that used to be hard-coded).

    Returns:
        All loaded partitions stacked row-wise with a fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError('num_parts must be at least 1')

    parts = [
        read_parquet_dataset_from_local(path, part_idx, 1)
        for part_idx in range(num_parts)
    ]
    return pd.concat(parts, ignore_index=True, axis=0)


def drop_col(df_1, targets=None):
    """Return a copy of ``df_1`` without the columns excluded from modelling.

    Args:
        df_1: Input frame; it must contain every column listed below.
        targets: Unused; kept so the signature matches the other
            preprocessing steps.

    Returns:
        A new DataFrame with the listed columns removed. ``df_1`` itself is
        left untouched.

    Raises:
        KeyError: If any of the listed columns is missing from ``df_1``.
    """
    pre_features = [
        'pre_since_opened', 'pre_since_confirmed', 'pre_fterm',
        'pre_till_pclose', 'pre_till_fclose', 'pre_loans_outstanding',
        'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans90',
    ]
    zero_flags = ['is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit']
    # enc_paym_0 .. enc_paym_21 (enc_paym_22 and later are kept).
    paym_features = [f'enc_paym_{idx}' for idx in range(22)]

    return df_1.drop(columns=pre_features + zero_flags + paym_features)


def one_hot_enc(df_1, targets=None):
    """One-hot encode the categorical columns and return a new frame.

    A fresh ``OneHotEncoder`` is fitted on ``df_1`` on every call, so the
    produced columns depend on the categories present in the frame passed in.
    The first category of each feature is dropped (``drop='first'``) and the
    indicator columns are stored as ``int8``.

    Unlike the previous implementation, the caller's frame is not modified,
    and the encoded columns are attached with one ``pd.concat`` instead of
    being inserted one by one (which fragments the frame).

    Args:
        df_1: Input frame containing every categorical column listed below.
        targets: Unused; kept so the signature matches the other
            preprocessing steps.

    Returns:
        A new DataFrame: the non-categorical columns of ``df_1`` in their
        original order, followed by the one-hot indicator columns.

    Raises:
        KeyError: If a categorical column is missing from ``df_1``.
    """
    cat_columns = ['pre_pterm', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_credit_cost_rate',
                   'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_util', 'pre_over2limit',
                   'is_zero_loans90', 'pre_maxover2limit', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
                   'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type',
                   'enc_loans_account_cur']

    ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore', dtype='int8')
    # Fit first: the feature names only exist once the encoder has been fitted.
    encoded_values = ohe.fit_transform(df_1[cat_columns])
    encoded = pd.DataFrame(
        encoded_values,
        columns=ohe.get_feature_names_out(),
        index=df_1.index,  # keep rows aligned with the source frame
    )
    return pd.concat([df_1.drop(columns=cat_columns), encoded], axis=1)


def group_df(df_1, targets=None):
    """Collapse the frame to one row per ``id``.

    Every column is summed within each ``id`` group. The summed ``rn`` column
    is then replaced by ``max_rn``, the largest ``rn`` value in the group.

    The ``id`` and ``rn`` columns are addressed by name, so the result no
    longer depends on them being the first two columns of the frame.

    Args:
        df_1: Input frame with at least the columns ``id`` and ``rn``.
        targets: Unused; kept so the signature matches the other
            preprocessing steps.

    Returns:
        A DataFrame indexed by ``id`` holding the per-group sums of all
        columns except ``rn``, followed by a ``max_rn`` column.

    Raises:
        KeyError: If ``id`` or ``rn`` is missing from ``df_1``.
    """
    grouped = df_1.groupby(['id'])
    max_rn = grouped['rn'].max().rename('max_rn')
    totals = grouped.sum().drop(columns='rn')
    # Both pieces are indexed by id, so an index join lines them up.
    return totals.join(max_rn)


class Net_model(nn.Module):
    """Fully connected binary classifier.

    Layer widths are ``in_features -> 100 -> 50 -> 20 -> 5 -> 1`` with ReLU
    activations, dropout (p=0.4) after the second layer and a sigmoid on the
    single output unit.

    Args:
        in_features: Width of the input vectors. Defaults to 195, the value
            that used to be hard-coded.
    """

    def __init__(self, in_features=195):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 100, bias=True)
        self.fc2 = nn.Linear(100, 50, bias=True)
        self.dropout = nn.Dropout(p=0.4)
        self.fc3 = nn.Linear(50, 20, bias=True)
        self.fc4 = nn.Linear(20, 5, bias=True)
        self.fc5 = nn.Linear(5, 1, bias=True)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` values in [0, 1]."""
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return torch.sigmoid(self.fc5(x))

            
class PytorchModel(sklearn.base.BaseEstimator):
    """scikit-learn style wrapper that trains a PyTorch network.

    A new network and optimizer are built on every ``fit`` call. Training runs
    in whole epochs until the training ROC AUC has stabilised: once the AUC
    values of the last ``tol_epochs`` epochs differ by at most ``auc_tol``,
    training stops.

    Args:
        net_type: Callable returning the network (e.g. an ``nn.Module`` class).
        net_params: Keyword arguments passed to ``net_type``.
        optimizer_type: Callable building the optimizer from the network
            parameters (e.g. ``torch.optim.Adam``).
        optimizer_params: Keyword arguments passed to ``optimizer_type``.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning the loss.
        batch_size: Mini-batch size used during training.
        auc_tol: Largest allowed spread (max - min) of the training ROC AUC
            inside the stopping window.
        tol_epochs: Number of most recent epochs in the stopping window.
    """

    def __init__(self, net_type, net_params, optimizer_type, optimizer_params, loss_fn,
                 batch_size=10000, auc_tol=0.04, tol_epochs=10):
        self.net_type = net_type
        self.net_params = net_params
        self.optimizer_type = optimizer_type
        self.optimizer_params = optimizer_params
        self.loss_fn = loss_fn

        self.batch_size = batch_size
        self.auc_tol = auc_tol
        self.tol_epochs = tol_epochs

    def fit(self, X, y):
        """Train a fresh network on ``X`` / ``y``.

        Args:
            X: Array-like feature matrix convertible to ``float32``.
            y: Array-like binary targets, one per row of ``X``.

        Returns:
            ``self``, following the scikit-learn estimator convention.
        """
        self.net = self.net_type(**self.net_params)
        self.optimizer = self.optimizer_type(self.net.parameters(), **self.optimizer_params)
        self.classes_ = np.sort(np.unique(y))

        y_np = np.asarray(y, dtype=np.float32).reshape(-1)
        X_t = torch.as_tensor(np.asarray(X, dtype=np.float32))
        y_t = torch.as_tensor(y_np).view(-1, 1)
        train_loader = DataLoader(TensorDataset(X_t, y_t), batch_size=self.batch_size,
                                  shuffle=True, drop_last=False)

        metrics = []  # sliding window with the most recent training AUCs
        while True:
            self.net.train()
            for X_batch, y_batch in train_loader:
                loss = self.loss_fn(self.net(X_batch), y_batch)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # Score in eval mode: with dropout still active the AUC would be
            # noisy and make the stopping rule unreliable.
            self.net.eval()
            with torch.no_grad():
                train_scores = self.net(X_t).numpy().ravel()
            metrics.append(roc_auc_score(y_np, train_scores))

            if len(metrics) > self.tol_epochs:
                metrics.pop(0)
            if len(metrics) == self.tol_epochs and max(metrics) - min(metrics) <= self.auc_tol:
                break

        return self

    def predict_proba(self, X):
        """Return the raw network output for ``X`` as a NumPy array.

        For a single-output sigmoid network such as ``Net_model`` this is one
        column holding the predicted probability of the positive class.

        NOTE(review): scikit-learn classifiers conventionally return shape
        ``(n_samples, n_classes)``. The network's own output shape is kept
        here so existing callers keep working.
        """
        X_tensor = torch.as_tensor(np.asarray(X, dtype=np.float32))
        self.net.eval()
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            return self.net(X_tensor).numpy()

    def predict(self, X):
        """Return the row-wise maximum of ``predict_proba(X)``.

        NOTE(review): this yields scores, not hard class labels; kept as-is
        for backward compatibility.
        """
        preds_proba = self.predict_proba(X)
        return np.amax(preds_proba, axis=1)

    
def get_pred(predictions):
    """Reduce a 2-D score array to one value per row.

    Args:
        predictions: Array-like of shape ``(n_rows, n_columns)``.

    Returns:
        1-D array holding the maximum of each row.
    """
    return np.max(predictions, axis=1)


def main():
    """Train the bagged neural-network classifier end to end and save it.

    Steps:
        1. Load the training partitions from ``train_data/`` and the targets
           from ``train_target.csv``.
        2. Drop unused columns, one-hot encode the categorical ones and
           aggregate to one row per ``id``.
        3. Attach the ``flag`` target, drop duplicate rows and shrink integer
           columns to the smallest dtype that can hold their values.
        4. Fit a ``BaggingClassifier`` of 10 ``PytorchModel`` estimators.
        5. Print the ROC AUC measured on the training data itself and dump
           the fitted ensemble to ``ML_project.joblib``.
    """
    path = 'train_data/'
    df = create_dataset(path)
    targets = pd.read_csv('train_target.csv')

    preprocessor = Pipeline(steps=[
        ('drop_columns', FunctionTransformer(drop_col)),
        ('encoder', FunctionTransformer(one_hot_enc)),
        ('group_dataset', FunctionTransformer(group_df)),
    ])

    df = preprocessor.fit_transform(df, targets)
    # NOTE(review): this assignment aligns on index labels (the id index of
    # df vs. the row index of targets) - confirm the two actually coincide.
    df.loc[:, 'target'] = targets.loc[:, 'flag']

    df.drop_duplicates(inplace=True)
    int_64_cols = df.select_dtypes('int64').columns
    if len(int_64_cols):
        # Downcast per column to the smallest integer type that fits. A blanket
        # astype('int8') silently wraps any value outside [-128, 127].
        df[int_64_cols] = df[int_64_cols].apply(pd.to_numeric, downcast='integer')

    X = df.drop(columns='target').to_numpy()
    y = df['target'].to_numpy()

    estimated_model = PytorchModel(net_type=Net_model, net_params=dict(), optimizer_type=Adam,
                                   optimizer_params={"lr": 1e-3}, loss_fn=torch.nn.BCELoss(),
                                   batch_size=10000, auc_tol=0.003, tol_epochs=10)
    meta_classifier = BaggingClassifier(estimator=estimated_model, n_estimators=10)
    meta_classifier.fit(X, y)

    predict_nn = get_pred(meta_classifier.predict_proba(X))
    print('roc_auc: ', roc_auc_score(y, predict_nn))

    joblib.dump(meta_classifier, 'ML_project.joblib')

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()