In [1]:
import pandas as pd
import numpy as np
import torch 

In [2]:
from category_encoders.target_encoder import TargetEncoder

## 构造多层全连接网络(带dropout)

In [5]:
import torch.nn as nn
import torch
import torch.nn.functional as F


def mish(input):
    """Apply the Mish activation element-wise: ``x * tanh(softplus(x))``.

    Args:
        input: Tensor of any shape.

    Returns:
        Tensor with the same shape as ``input``.
    """
    gate = torch.tanh(F.softplus(input))
    return input * gate

class Mish(nn.Module):
    """Layer form of the ``mish`` function so it can be composed with other modules.

    The module holds no parameters or buffers, so it simply relies on the
    constructor inherited from ``nn.Module``.
    """

    def forward(self, input):
        """Return ``mish(input)`` for the given tensor."""
        return mish(input)

class MLPLayer(nn.Module):
    """Linear -> Mish -> Dropout block, optionally with a scaled skip connection, followed by LayerNorm.

    Args:
        dim_in: Size of the last dimension of the input.
        dim_out: Size of the last dimension of the output.
        res_coef: Weight applied to the layer input before it is added to the
            transformed features. ``0`` disables the skip connection.
        dropout_p: Dropout probability applied after the activation.

    Raises:
        ValueError: If a skip connection is requested (``res_coef != 0``) but
            ``dim_in != dim_out``. Adding tensors of different widths would
            either fail at forward time or, when one width is 1, silently
            broadcast.
    """

    def __init__(self, dim_in, dim_out, res_coef = 0, dropout_p = 0.1):
        super().__init__()
        if res_coef != 0 and dim_in != dim_out:
            raise ValueError(
                "a residual connection (res_coef != 0) requires dim_in == dim_out, "
                f"got dim_in={dim_in}, dim_out={dim_out}"
            )
        self.linear = nn.Linear(dim_in, dim_out)
        self.res_coef = res_coef
        self.activation = Mish()
        self.dropout = nn.Dropout(dropout_p)
        self.ln = nn.LayerNorm(dim_out)

    def forward(self, x):
        """Transform ``x`` and return the layer-normalised result."""
        y = self.dropout(self.activation(self.linear(x)))
        if self.res_coef != 0:
            # Scaled skip connection: blend the untouched input back in.
            y = self.res_coef * x + y
        return self.ln(y)

        
class MyNetwork(nn.Module):
    """Stack of ``MLPLayer`` blocks ending in a single sigmoid output.

    Architecture: an input ``MLPLayer`` (``dim_in -> dim``, no skip
    connection), then ``n_layers`` hidden ``MLPLayer`` blocks (``dim -> dim``)
    that use ``res_coef``, then ``Linear(dim, 1)`` and a sigmoid.

    Args:
        dim_in: Number of input features.
        dim: Width of every hidden layer.
        res_coef: Skip-connection weight used by the hidden layers.
        dropout_p: Dropout probability used by every layer, including the
            input layer.
        n_layers: Number of hidden ``dim -> dim`` layers.
    """

    def __init__(self, dim_in, dim, res_coef=0.5, dropout_p = 0.1, n_layers = 10):
        super().__init__()
        self.mlp = nn.ModuleList()
        # The input layer changes the width, so it cannot carry a skip
        # connection; it does honour the requested dropout probability.
        self.first_linear = MLPLayer(dim_in, dim, res_coef=0, dropout_p=dropout_p)
        self.n_layers = n_layers
        for _ in range(n_layers):
            self.mlp.append(MLPLayer(dim, dim, res_coef, dropout_p))
        self.final = nn.Linear(dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs with the trailing singleton dimension removed.

        Only the last dimension is squeezed, so an input of shape
        ``(batch, dim_in)`` always yields shape ``(batch,)`` -- including
        ``batch == 1``, where a bare ``squeeze()`` would collapse the result
        to a 0-d tensor that no longer matches the target's shape.
        """
        x = self.first_linear(x)
        for layer in self.mlp:
            x = layer(x)
        x = self.sigmoid(self.final(x))
        return x.squeeze(-1)

In [47]:
import pytorch_lightning as pl
from pytorch_lightning.metrics import Accuracy
class TrainingModule(pl.LightningModule):
    """Lightning wrapper that trains a ``MyNetwork`` backbone with binary cross-entropy.

    Args:
        dim_in: Number of input features, forwarded to ``MyNetwork``.
        dim: Hidden width, forwarded to ``MyNetwork``.
        res_coef: Skip-connection weight, forwarded to ``MyNetwork``.
        dropout_p: Dropout probability, forwarded to ``MyNetwork``.
        n_layers: Number of hidden layers, forwarded to ``MyNetwork``.
    """

    def __init__(self, dim_in, dim, res_coef=0, dropout_p=0, n_layers=10):
        super().__init__()
        self.backbone = MyNetwork(dim_in, dim, res_coef, dropout_p, n_layers)
        self.loss = nn.BCELoss()
        self.accuracy = Accuracy()

    def forward(self, x):
        """Run the backbone on ``x``."""
        return self.backbone(x)

    def _loss_and_accuracy(self, batch):
        """Score one ``(features, target)`` batch and return ``(loss, accuracy)``."""
        features, target = batch
        prediction = self.backbone(features)
        # BCELoss needs float targets; the accuracy metric gets them unconverted.
        batch_loss = self.loss(prediction, target.type(torch.float32))
        batch_acc = self.accuracy(prediction, target)
        return batch_loss, batch_acc

    def validation_step(self, batch, batch_idx):
        """Log and return the loss and accuracy of one validation batch."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("Validation loss", loss)
        self.log("Validation acc", acc)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log the loss and accuracy of one training batch and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("Training loss", loss)
        self.log("Training acc", acc)
        return loss

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

import os
class CheckpointEveryNSteps(pl.Callback):
    """Callback that saves a checkpoint whenever the global step is a multiple of a fixed frequency.

    Checkpoints are named ``epoch=<epoch>_step=<global_step>.ckpt`` and are
    written to the ``dirpath`` of the trainer's checkpoint callback. If the
    trainer has no checkpoint callback, or its ``dirpath`` is unset, the
    trainer's ``default_root_dir`` is used instead.

    Args:
        save_step_frequency: Number of global steps between two saves.

    Raises:
        ValueError: If ``save_step_frequency`` is not positive. A zero value
            would otherwise surface as a ``ZeroDivisionError`` on the first
            batch.
    """

    def __init__(self, save_step_frequency):
        if save_step_frequency <= 0:
            raise ValueError(
                f"save_step_frequency must be positive, got {save_step_frequency!r}"
            )
        self.save_step_frequency = save_step_frequency

    def on_batch_end(self, trainer: pl.Trainer, _):
        """Save a checkpoint if the current global step hits the frequency."""
        global_step = trainer.global_step
        if global_step % self.save_step_frequency != 0:
            return

        # Checkpointing may be disabled on the trainer; fall back to its root dir.
        checkpoint_callback = getattr(trainer, "checkpoint_callback", None)
        dirpath = getattr(checkpoint_callback, "dirpath", None) or trainer.default_root_dir

        filename = f"epoch={trainer.current_epoch}_step={global_step}.ckpt"
        trainer.save_checkpoint(os.path.join(dirpath, filename))

In [4]:
from torch.utils.data import Dataset, DataLoader

class MyDataSet(Dataset):
    """Map-style dataset that pairs row ``i`` of ``x`` with entry ``i`` of ``y``.

    Args:
        x: Two-dimensional indexable container of features; it is indexed as
            ``x[idx, :]``.
        y: Indexable container of targets.
    """

    def __init__(self, x, y):
        super().__init__()
        self.x, self.y = x, y
        # The sample count is the size of the first axis of ``x``.
        self.len = x.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        features = self.x[idx, :]
        target = self.y[idx]
        return features, target

## 一些有用的数据处理方法
1. one-hot 转 ordinal encoding
2. Discretize
3. Variable selection

In [38]:
# coding = 'utf-8'
import numpy as np
import pandas as pd
import tqdm

# Collapse a group of one-hot columns back into a single categorical code.
def inverse_onehot(df, cols_prefix):
    """Convert the one-hot columns that share a name prefix into one ordinal code per row.

    The k-th matching column (1-based, in ``df.columns`` order) is given the
    code ``k``. Each row's code is the dot product of its one-hot values with
    ``[1, 2, ..., n_cols]``, so a row with no active column gets ``0``.

    Args:
        df: DataFrame holding the one-hot columns.
        cols_prefix: Prefix identifying the group. Only columns whose name
            *starts with* this string are selected; a substring match would
            also pick up unrelated columns such as ``sub_grade_*`` for the
            prefix ``grade``.

    Returns:
        Tuple ``(cols, codes)``: the list of matched column names and an
        array of shape ``(n_rows, 1)`` with the codes.

    Raises:
        ValueError: If no column starts with ``cols_prefix``.
    """
    cols = [
        name for name in df.columns
        if isinstance(name, str) and name.startswith(cols_prefix)
    ]
    if not cols:
        raise ValueError(f"no column starts with prefix {cols_prefix!r}")

    onehots = df.loc[:, cols].values
    # Column vector [1, 2, ..., n_cols]; the dot product picks the active index.
    weights = np.arange(1, onehots.shape[1] + 1).reshape(-1, 1)
    return cols, np.dot(onehots, weights)

def inverse_onehot_mat(df, col_prefixes):
    """Replace several one-hot column groups with one code column each.

    For every entry of ``col_prefixes`` the matching one-hot columns are
    collapsed with ``inverse_onehot`` and the result is stored in a new
    column named after that entry. All matched one-hot columns are then
    removed.

    Args:
        df: Source DataFrame; it is not modified.
        col_prefixes: Iterable of strings, one per one-hot group.

    Returns:
        A new DataFrame with the code columns added and the one-hot columns
        dropped.
    """
    collapsed = df.copy(deep=True)
    onehot_cols = []
    for prefix in col_prefixes:
        matched, codes = inverse_onehot(collapsed, prefix)
        onehot_cols += matched
        collapsed[prefix] = codes

    return collapsed.drop(columns=onehot_cols)

def encode_label(x):
    """Map every value of ``x`` to an integer label.

    Values are compared through their ``str()`` form. The distinct strings
    are sorted lexicographically (so ``'10'`` sorts before ``'2'``) and
    numbered from 0 in that order.

    Args:
        x: Array of values to encode.

    Returns:
        Array with the same shape as ``x`` holding the integer labels.
    """
    labels = sorted({str(value) for value in np.unique(x)})
    code_of = {label: code for code, label in enumerate(labels)}
    to_code = np.vectorize(lambda value: code_of[str(value)])
    return to_code(x)

def encode_label_mat(x):
    """Label-encode every column of a 2-D array independently.

    Args:
        x: Two-dimensional array; each column is passed to ``encode_label``.

    Returns:
        Integer array with the same shape as ``x``.
    """
    _n_rows, n_cols = x.shape
    encoded = np.empty_like(x, dtype=int)
    for j in range(n_cols):
        column_codes = encode_label(x[:, j])
        encoded[:, j] = column_codes
    return encoded

def impute_nan(x, method='median'):
    """Fill missing entries of each column with a per-column statistic.

    Only missing entries (as reported by ``pd.isnull``) are replaced;
    ``+inf`` / ``-inf`` entries are left in place but excluded when the
    statistic is computed. The replacement is vectorised with ``np.where``
    instead of a per-element ``np.vectorize`` call, which inferred its output
    dtype from the first element and failed on zero-row input.

    Args:
        x: Two-dimensional array; it is not modified.
        method: Statistic to use. Only ``'median'`` is implemented.

    Returns:
        Array with the same shape and dtype as ``x`` with missing entries
        replaced by the column median. A column with no finite entry is
        filled with whatever ``np.median`` returns for an empty selection.

    Raises:
        NotImplementedError: If ``method`` is not ``'median'``. The check is
            done before the loop so it also fires for zero-column input.
    """
    if method != 'median':
        raise NotImplementedError(f"unsupported imputation method: {method!r}")

    _, ncol = x.shape
    result = np.empty_like(x)

    for col in range(ncol):
        data = x[:, col]
        missing = pd.isnull(data)
        usable = ~missing & (data != np.inf) & (data != -np.inf)
        impute_value = np.median(data[usable])
        result[:, col] = np.where(missing, impute_value, data)
    return result


def get_uniform_interval(minimum, maximum, nbins):
    """Return the ``nbins + 1`` edges that split ``[minimum, maximum]`` into equal-width bins.

    The first and last edges are ``minimum`` and ``maximum`` themselves; the
    inner edges are computed as ``minimum + step * k``.

    Args:
        minimum: Lower bound of the range.
        maximum: Upper bound of the range.
        nbins: Number of bins.

    Returns:
        List of edges in ascending order.
    """
    step_size = (float(maximum - minimum)) / nbins
    inner_edges = [minimum + step_size * k for k in range(1, nbins)]
    return [minimum] + inner_edges + [maximum]


def get_interval_v2(x, sorted_intervals):
    """Return the index of the interval of ``sorted_intervals`` that contains ``x``.

    Interval ``i`` is ``[sorted_intervals[i], sorted_intervals[i + 1])``; the
    last edge opens a final interval that extends to infinity. The lookup
    uses bisection and does not modify ``sorted_intervals``. Appending a
    sentinel ``inf`` to the caller's list on every call would make that list
    grow without bound when the function is applied element-wise.

    Args:
        x: Scalar value to locate.
        sorted_intervals: Interval edges in ascending order.

    Returns:
        ``-1`` if ``x`` is missing, ``-2`` if it is ``+inf``, ``-3`` if it is
        ``-inf``; otherwise the interval index, or ``None`` when ``x`` is
        smaller than the first edge (or there are no edges).
    """
    import bisect  # function-scope import: not among the notebook's imports

    if pd.isnull(x):
        return -1
    if x == np.inf:
        return -2
    if x == -np.inf:
        return -3

    # Index of the right-most edge that is <= x.
    interval = bisect.bisect_right(sorted_intervals, x) - 1
    return interval if interval >= 0 else None


def get_quantile_interval(data, nbins):
    """Return the ``nbins + 1`` quantile edges of the finite, non-missing values of ``data``.

    Args:
        data: Array of values; missing and infinite entries are ignored.
        nbins: Number of equal-probability bins.

    Returns:
        List of quantile values at the probabilities produced by
        ``get_uniform_interval(0, 1, nbins)``.
    """
    probabilities = get_uniform_interval(0, 1, nbins)
    not_missing = ~pd.isnull(data)
    not_pos_inf = data != np.inf
    not_neg_inf = data != -np.inf
    usable = data[not_missing & not_pos_inf & not_neg_inf]
    return list(np.quantile(usable, probabilities))


def discretize(x, nbins=20):
    """Quantile-discretise every column of a 2-D array into integer codes.

    For each column the distinct quantile edges are computed with
    ``get_quantile_interval``, every value is assigned a raw bin index with
    ``get_interval_v2`` (negative sentinels mark missing / infinite values),
    and the raw indices are converted to dense codes ``0..k-1``.

    The dense codes follow the *numeric* order of the raw indices, so
    sentinel values come first and a higher bin always receives a higher
    code. Ranking the indices through their string form would put bin 10
    before bin 2 as soon as there are more than ten bins.

    Args:
        x: Two-dimensional array of values.
        nbins: Number of quantile bins requested per column.

    Returns:
        Tuple ``(codes, interval_list)``: an ``int64`` array with the shape of
        ``x``, and one list per column holding the midpoints between
        consecutive distinct quantile edges.
    """
    _, ncol = x.shape
    result = np.empty(x.shape, dtype=np.int64)
    interval_list = list()
    for col in range(ncol):
        intervals = sorted(set(get_quantile_interval(x[:, col], nbins)))
        # Midpoints are derived before the edges are handed to the lookup.
        interval_centroid = [
            0.5 * (low + high) for low, high in zip(intervals[:-1], intervals[1:])
        ]

        func = np.vectorize(lambda value: get_interval_v2(value, intervals))
        raw_bins = func(x[:, col])
        # np.unique sorts numerically; the inverse index is the dense rank.
        _, dense_codes = np.unique(raw_bins, return_inverse=True)
        result[:, col] = dense_codes.reshape(-1)
        interval_list.append(interval_centroid)
    return result, interval_list

def get_var_type(df):
    """Group the column names of ``df`` by their naming prefix.

    Args:
        df: DataFrame whose column names are strings.

    Returns:
        Dict with keys ``'continuous'`` (names starting with
        ``'continuous_'``), ``'discrete'`` (names starting with
        ``'discrete_'``) and ``'other'`` (everything else). Each list keeps
        the column order of ``df``.
    """
    groups = {'continuous': [], 'discrete': [], 'other': []}
    for name in df.columns:
        if name.startswith('continuous_'):
            kind = 'continuous'
        elif name.startswith('discrete_'):
            kind = 'discrete'
        else:
            kind = 'other'
        groups[kind].append(name)
    return groups


def get_cont_var(df):
    """Return the column names that ``get_var_type`` classifies as continuous."""
    return get_var_type(df)['continuous']


def get_dis_var(df):
    """Return the column names that ``get_var_type`` classifies as discrete."""
    return get_var_type(df)['discrete']

def drop_const_var(data):
    """Return a copy of ``data`` without its constant columns.

    A column is treated as constant when, after ignoring missing entries, it
    has at most one distinct value (so an all-missing column is dropped too).

    Args:
        data: Source DataFrame; it is not modified.

    Returns:
        A deep copy of ``data`` with the constant columns removed.
    """
    constant_cols = [
        col for col in data.columns
        if len(data.loc[~pd.isnull(data[col]), col].unique()) <= 1
    ]
    return data.copy(deep=True).drop(columns=constant_cols)

## Load data

In [7]:
# Input tables, read from the notebook's working directory.
TRAIN_CSV = 'train_final.csv'
TEST_CSV = 'test_final.csv'

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

## Preprocessing

In [45]:
# One-hot feature groups to convert (the original note says they are meant for
# target encoding; in this cell they are only collapsed to one code column each).
cols = ['discrete_addr_state', 'discrete_grade'] #, 'discrete_sub_grade', 'discrete_emp_length']
df_train_ = inverse_onehot_mat(df_train, cols)
df_test_ = inverse_onehot_mat(df_test, cols)

# Separate the 'loan_status' target from the feature columns.
x_train = df_train_.drop(columns='loan_status')
y_train = df_train_['loan_status']
x_test = df_test_.drop(columns='loan_status')
y_test = df_test_['loan_status']

In [46]:
num_workers = 4
batch_size = 128

# Features: fill missing values with the column median, then cast to float32.
# NOTE(review): each split is imputed with its own medians, so the test set is
# not filled with the statistics of the training set -- confirm this is intended.
x_train_ = torch.from_numpy(impute_nan(x_train.values)).float()
x_test_ = torch.from_numpy(impute_nan(x_test.values)).float()

# Targets are stored as 32-bit integers.
y_train_ = torch.from_numpy(y_train.values).int()
y_test_ = torch.from_numpy(y_test.values).int()

train_dataset = MyDataSet(x_train_, y_train_)
test_dataset = MyDataSet(x_test_, y_test_)

# Only the training loader shuffles its samples.
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
)
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, num_workers=num_workers
)

## Training 

In [48]:
from pytorch_lightning import loggers as pl_loggers

# Metrics logged by the module are written as TensorBoard event files under logs/.
tb_logger = pl_loggers.TensorBoardLogger('logs/')

# Extra checkpoint every 100 global steps. The callback only runs when it is
# registered with the Trainer, so it is passed through ``callbacks`` below.
save_by_steps = CheckpointEveryNSteps(100)

# Positional arguments: dim_in, dim=10, res_coef=0.5, dropout_p=0.1, n_layers=2.
training_module = TrainingModule(x_train.shape[1], 10, 0.5, 0.1, 2)

trainer = pl.Trainer(
    max_epochs=2,
    gpus=None,
    progress_bar_refresh_rate=100,
    val_check_interval=0.25,
    logger=tb_logger,
    callbacks=[save_by_steps],
)
# The test dataloader is used as the validation set during fitting.
trainer.fit(training_module, train_dataloader, test_dataloader)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores

  | Name     | Type      | Params
---------------------------------------
0 | backbone | MyNetwork | 1.2 K 
1 | loss     | BCELoss   | 0     
2 | accuracy | Accuracy  | 0     
---------------------------------------
1.2 K     Trainable params
0         Non-trainable params
1.2 K     Total params
0.005     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [50]:
# Load the TensorBoard notebook extension; it provides the %tensorboard magic
# used in the next cell.
%load_ext tensorboard

In [51]:
# Open TensorBoard inline on the logs/ directory, the save directory given to
# the TensorBoardLogger in the training cell.
%tensorboard --logdir logs