In [None]:
import numpy as np
import pandas as pd

from sklearn.exceptions import ConvergenceWarning
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import ignore_warnings

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import *
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import activations
from tensorflow.keras import layers
from tensorflow import keras

import time

from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime imported above).
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
class DataTransformer(object):
    """Data Transformer.

    Continuous columns are modelled with a Bayesian Gaussian mixture and encoded
    as a scalar (offset from the sampled mode, clipped to [-0.99, 0.99]) followed
    by a one-hot vector selecting that mode.
    Discrete columns are encoded using a scikit-learn OneHotEncoder.

    Args:
        n_clusters (int):
            Number of modes.
        epsilon (float):
            Epsilon value. Mixture components whose weight is not above this
            threshold are dropped from the encoding.
    """

    def __init__(self, n_clusters=10, epsilon=0.005):
        self.n_clusters = n_clusters
        self.epsilon = epsilon

    @ignore_warnings(category=ConvergenceWarning)
    def _fit_continuous(self, column, data):
        """Fit the mixture model for one continuous column and return its metadata."""
        gm = BayesianGaussianMixture(
            # Passed by keyword: recent scikit-learn releases make it keyword-only.
            n_components=self.n_clusters,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,
            n_init=1
        )
        gm.fit(data)
        # Boolean mask of the components that carry enough weight to be kept.
        components = gm.weights_ > self.epsilon
        num_components = components.sum()

        return {
            'name': column,
            'model': gm,
            'components': components,
            'output_info': [(1, 'tanh'), (num_components, 'softmax')],
            'output_dimensions': 1 + num_components,
        }

    @staticmethod
    def _fit_discrete(column, data):
        """Fit a dense one-hot encoder for one discrete column and return its metadata."""
        try:
            ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the previous keyword name.
            ohe = OneHotEncoder(sparse=False)
        ohe.fit(data)
        categories = len(ohe.categories_[0])

        return {
            'name': column,
            'encoder': ohe,
            'output_info': [(categories, 'softmax')],
            'output_dimensions': categories
        }

    def fit(self, data, discrete_columns=()):
        """Fit one encoder per column.

        Args:
            data (pandas.DataFrame or array-like):
                Training table; non-DataFrame input is wrapped in a DataFrame.
            discrete_columns (iterable):
                Names (or integer positions for array input) of the columns to
                one-hot encode. Every other column is treated as continuous.
        """
        self.output_info = []
        self.output_dimensions = 0

        # Remember the input flavour so inverse_transform can return the same one.
        if not isinstance(data, pd.DataFrame):
            self.dataframe = False
            data = pd.DataFrame(data)
        else:
            self.dataframe = True

        self.meta = []
        for column in data.columns:
            column_data = data[[column]].values
            if column in discrete_columns:
                meta = self._fit_discrete(column, column_data)
            else:
                meta = self._fit_continuous(column, column_data)

            self.output_info += meta['output_info']
            self.output_dimensions += meta['output_dimensions']
            self.meta.append(meta)

    def _transform_continuous(self, column_meta, data):
        """Encode one continuous column as ``[scalar, one-hot mode]``.

        The mode is sampled per row from the mixture's posterior probabilities,
        so repeated calls on the same data can give different encodings.
        """
        components = column_meta['components']

        model = column_meta['model']

        means = model.means_.reshape((1, self.n_clusters))
        stds = np.sqrt(model.covariances_).reshape((1, self.n_clusters))
        # Offset of every value from every mode, in units of four standard deviations.
        features = (data - means) / (4 * stds)

        probs = model.predict_proba(data)
        n_opts = components.sum()
        features = features[:, components]
        probs = probs[:, components]

        opt_sel = np.zeros(len(data), dtype='int')
        for i in range(len(data)):
            # Small constant keeps every kept mode selectable before renormalising.
            pp = probs[i] + 1e-6
            pp = pp / pp.sum()
            opt_sel[i] = np.random.choice(np.arange(n_opts), p=pp)

        idx = np.arange((len(features)))
        features = features[idx, opt_sel].reshape([-1, 1])
        features = np.clip(features, -.99, .99)

        probs_onehot = np.zeros_like(probs)
        probs_onehot[np.arange(len(probs)), opt_sel] = 1

        return [features, probs_onehot]

    @staticmethod
    def _transform_discrete(column_meta, data):
        """One-hot encode one discrete column with its fitted encoder."""
        encoder = column_meta['encoder']
        return encoder.transform(data)

    def transform(self, data):
        """Encode ``data`` column by column and return one float matrix.

        Columns appear in the order they were seen by ``fit``.
        """
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)

        values = []
        for meta in self.meta:
            column_data = data[[meta['name']]].values
            if 'model' in meta:
                values += self._transform_continuous(meta, column_data)
            else:
                values.append(self._transform_discrete(meta, column_data))

        return np.concatenate(values, axis=1).astype(float)

    def _inverse_transform_continuous(self, meta, data, sigma):
        """Decode ``[scalar, mode scores]`` back into values of one continuous column.

        When ``sigma`` is not None the scalar is perturbed with Gaussian noise of
        that standard deviation before decoding.
        """
        model = meta['model']
        components = meta['components']

        u = data[:, 0]
        v = data[:, 1:]

        if sigma is not None:
            u = np.random.normal(u, sigma)

        u = np.clip(u, -1, 1)
        # Dropped components get a very low score so argmax never selects them.
        v_t = np.ones((len(data), self.n_clusters)) * -100
        v_t[:, components] = v
        v = v_t
        means = model.means_.reshape([-1])
        stds = np.sqrt(model.covariances_).reshape([-1])
        p_argmax = np.argmax(v, axis=1)
        std_t = stds[p_argmax]
        mean_t = means[p_argmax]
        column = u * 4 * std_t + mean_t

        return column

    @staticmethod
    def _inverse_transform_discrete(meta, data):
        """Decode a one-hot block back into the original categories."""
        encoder = meta['encoder']
        return encoder.inverse_transform(data)

    def inverse_transform(self, data, sigmas):
        """Decode an encoded matrix back into the original column layout.

        Args:
            data (numpy.ndarray):
                Matrix laid out like the output of ``transform``.
            sigmas (sequence or None):
                Optional noise levels indexed by encoded-column offset; ``None``
                or an empty sequence disables the perturbation.

        Returns:
            pandas.DataFrame if ``fit`` received a DataFrame, otherwise a
            numpy.ndarray.
        """
        # Explicit None/length test: truth-testing a NumPy array with more than
        # one element raises ValueError.
        use_sigmas = sigmas is not None and len(sigmas) > 0

        start = 0
        output = []
        column_names = []
        for meta in self.meta:
            dimensions = meta['output_dimensions']
            columns_data = data[:, start:start + dimensions]

            if 'model' in meta:
                sigma = sigmas[start] if use_sigmas else None
                inverted = self._inverse_transform_continuous(meta, columns_data, sigma)
            else:
                inverted = self._inverse_transform_discrete(meta, columns_data)

            output.append(inverted)
            column_names.append(meta['name'])
            start += dimensions

        output = np.column_stack(output)
        if self.dataframe:
            output = pd.DataFrame(output, columns=column_names)

        return output


In [None]:
# Folder on the mounted Drive; file paths in later cells are built by appending to it.
MAIN_PATH = "/content/drive/MyDrive/bnp_competition/"

In [None]:
# Load the training table, naming its four columns explicitly, and preview the first rows.
train_data = pd.read_csv(MAIN_PATH + "train.csv", names = ["Index1", "Index2", "Index3", "Index4"])
train_data.head()

Unnamed: 0,Index1,Index2,Index3,Index4
0,0.012495,0.011126,0.003252,0.006625
1,0.011439,0.002691,0.001206,0.006947
2,0.000632,0.007277,0.004049,7.4e-05
3,0.017828,0.02821,0.007758,0.007382
4,0.021115,0.019642,0.009238,0.011499


In [None]:
# Fit the mode-specific encoder on the training table and encode it once.
transformer = DataTransformer()
transformer.fit(train_data)
transformed_train = transformer.transform(train_data)

# Both names are used by later cells; keep them pointing at the same array so the
# (randomly sampled) encoding is not computed twice.
res = transformed_train

In [None]:
# Wrap the encoded array in a DataFrame purely for display.
pd.DataFrame(res)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,0.508719,0.0,0.0,0.0,0.0,0.0,1.0,0.534733,0.0,0.0,0.0,0.0,1.0,-0.111512,1.0,0.0,0.0,0.0,0.0,0.150406,0.0,0.0,0.0,0.0,1.0
1,0.434615,0.0,0.0,0.0,0.0,0.0,1.0,-0.140180,0.0,0.0,0.0,0.0,1.0,-0.269221,1.0,0.0,0.0,0.0,0.0,0.175812,0.0,0.0,0.0,0.0,1.0
2,-0.323417,0.0,0.0,0.0,0.0,0.0,1.0,0.226823,0.0,0.0,0.0,0.0,1.0,-0.050096,1.0,0.0,0.0,0.0,0.0,-0.366724,0.0,0.0,0.0,0.0,1.0
3,0.216074,0.0,1.0,0.0,0.0,0.0,0.0,0.047855,0.0,1.0,0.0,0.0,0.0,0.235878,1.0,0.0,0.0,0.0,0.0,-0.182315,1.0,0.0,0.0,0.0,0.0
4,0.360324,0.0,1.0,0.0,0.0,0.0,0.0,0.352918,1.0,0.0,0.0,0.0,0.0,0.349950,1.0,0.0,0.0,0.0,0.0,0.023166,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741,-0.231789,0.0,0.0,0.0,0.0,0.0,1.0,0.351280,0.0,0.0,0.0,0.0,1.0,-0.059507,1.0,0.0,0.0,0.0,0.0,0.030481,0.0,0.0,0.0,0.0,1.0
742,-0.016814,0.0,0.0,0.0,0.0,0.0,1.0,0.320852,1.0,0.0,0.0,0.0,0.0,-0.126531,1.0,0.0,0.0,0.0,0.0,-0.215695,0.0,0.0,0.0,0.0,1.0
743,0.171173,0.0,0.0,0.0,0.0,0.0,1.0,-0.457288,1.0,0.0,0.0,0.0,0.0,0.177615,1.0,0.0,0.0,0.0,0.0,-0.288847,0.0,0.0,0.0,1.0,0.0
744,-0.129551,0.0,0.0,0.0,0.0,0.0,1.0,-0.253075,0.0,0.0,0.0,0.0,1.0,0.225299,1.0,0.0,0.0,0.0,0.0,-0.239970,0.0,0.0,0.0,0.0,1.0


In [None]:
# `res[i]` selects ROW i of the encoded array, so `u` holds rows 1, 3, 5, 7 and
# `v` holds rows 0, 2, 4, 6 (each a full encoded record).
# NOTE(review): if per-column scalar/one-hot blocks were intended, this needs
# column slicing instead — confirm.
u = [res[i] for i in range(8) if i%2 == 1]
v = [res[i] for i in range(8) if i%2 == 0]

In [None]:
# Shape of the last entry collected in `u`.
u[3].shape

(25,)

# Other Experiments

In [None]:
# Print the shape of every entry of `u` whose last dimension is not 5.
for item in u:
  if(item.shape[-1] != 5):
    print(item.shape)

(25,)
(25,)
(25,)
(25,)


In [None]:
# Look at one encoded record; expects `transformed_train` to hold the encoded training array.
transformed_train[5]

NameError: ignored

In [None]:
# Overall shape of the encoded training array held in `transformed_train`.
transformed_train.shape

NameError: ignored

In [None]:
# Decode the encoded array back to the original columns; `None` disables the sigma noise.
tmp = transformer.inverse_transform(transformed_train, None)

In [None]:
class Linear(keras.layers.Layer):
    """Dense affine layer: ``inputs @ w + b``.

    Both weights are created eagerly in the constructor, so the input width
    must be given up front.

    Args:
        input_dim (int): Width of the incoming features.
        units (int): Width of the output.
    """

    def __init__(self, input_dim=32, units=32):
        super().__init__()
        kernel_shape = (input_dim, units)
        bias_shape = (units,)
        self.w = self.add_weight(
            shape=kernel_shape, initializer="random_normal", trainable=True
        )
        self.b = self.add_weight(shape=bias_shape, initializer="zeros", trainable=True)

    def call(self, inputs):
        """Apply the affine map to ``inputs``."""
        projected = tf.matmul(inputs, self.w)
        return projected + self.b

In [None]:
class Residual(keras.layers.Layer):
  """Linear -> BatchNormalization -> ReLU block.

  The output has width ``o``; the input is not concatenated back onto it.

  Args:
      i (int): Input width.
      o (int): Output width.
  """

  def __init__(self, i, o):
    super().__init__()
    self.fc = Linear(i, o)
    self.bn = BatchNormalization()
    self.relu = layers.Activation(activations.relu)

  def call(self, input):
    """Run the three sub-layers in sequence."""
    return self.relu(self.bn(self.fc(input)))

In [None]:
class Generator():
  """Feed-forward generator built from ``Residual`` blocks and a final ``Linear``.

  The built Keras model is stored on ``self.model`` (a constructor cannot
  return it) and instances are callable, forwarding to that model.

  Args:
      embedding_dim (int): Input width expected by the first block.
      gen_dims (iterable of int): Output width of each ``Residual`` block.
      data_dim (int): Width of the generated record.
      latent_dim (int): Width of the noise ``Input``.
          NOTE(review): the first block is sized with ``embedding_dim``, so
          this presumably has to equal ``embedding_dim`` — confirm.
  """

  def __init__(self, embedding_dim, gen_dims, data_dim, latent_dim):
      dim = embedding_dim
      self.seq = Sequential()
      for item in list(gen_dims):
          self.seq.add(Residual(dim, item))
          # The Keras Residual block outputs exactly `item` features (it does
          # not concatenate its input), so that is the next block's input width.
          dim = item

      self.seq.add(Linear(dim, data_dim))
      noise = Input(shape=(latent_dim,))

      res = self.seq(noise)

      # `__init__` must return None; keep the functional model as an attribute.
      self.model = Model(noise, res)

  def __call__(self, inputs, *args, **kwargs):
      """Run the wrapped Keras model on ``inputs``."""
      return self.model(inputs, *args, **kwargs)

In [None]:
class Generator(keras.Model):
  """LSTM generator that emits one output block per step, alternating v / u.

  At every step the LSTM input is ``[noise, projected previous state,
  attention over all hidden states so far]``. Even steps feed ``projector_v``,
  odd steps feed ``projector_u``.

  Args:
      n_units (int): LSTM state width.
      n_z (int): Noise width.
      output_length (int): Number of LSTM steps; indexing assumes it does not
          exceed ``2 * len(n_v)`` and ``2 * len(n_u)``.
      n_x (int): Width of the projected hidden state fed back as input.
      n_u (iterable of int): Output width of each u projector.
      n_v (iterable of int): Output width of each v projector.
  """

  def __init__(self, n_units, n_z, output_length, n_x, n_u, n_v):
    super(Generator, self).__init__()
    # One learnable attention logit per time step.
    self.alpha = tf.Variable(shape = (1,output_length,),
                             initial_value= tf.random.normal(shape=(1,output_length,)))
    self.lstm = LSTM(n_units, activation='relu', return_state=True)
    self.n_z = n_z
    self.n_units = n_units
    self.output_length =  output_length
    self.n_x = n_x
    self.n_v = n_v
    self.n_u = n_u
    self.projector_f = Dense(self.n_x, activation="tanh")
    self.projector_v = [Dense(i, activation='tanh') for i in n_v]
    self.projector_u = [Dense(i, activation='tanh') for i in n_u]

  def give_noise(self):
    """Draw a fresh noise vector of shape ``(1, 1, n_z)``."""
    return tf.random.normal(shape=(1, 1, self.n_z,),)

  def give_attention(self, h,):
    """Softmax-weighted sum of the hidden states collected so far.

    Args:
        h: Array or tensor of shape ``(steps, n_units)``.

    Returns:
        Tensor of shape ``(n_units,)``.
    """
    n_dim = h.shape[-2]
    # Select the first `n_dim` TIME-STEP logits (second axis of alpha) so each
    # hidden state gets one weight, independent of n_units.
    weights = tf.nn.softmax(self.alpha[0, :n_dim])
    h = tf.cast(h, weights.dtype)

    return tf.math.reduce_sum(weights[:, tf.newaxis] * h, axis=-2)

  def call(self, x):
    """Generate the u / v blocks.

    Only ``x[0]`` is consumed, as the feature part of the first LSTM input.

    Returns:
        Tuple ``(output_u, output_v)`` of lists.
    """
    a = tf.zeros(shape=(1, 1, self.n_units,))
    z = self.give_noise()
    # Cast so float64 input (e.g. NumPy defaults) can be concatenated with z.
    x0 = tf.cast(tf.reshape(x[0], shape=(1, 1, -1)), z.dtype)
    step_input = tf.concat([z, x0, a], axis = -1)

    # Hidden states stay tensors so gradients reach the LSTM through attention.
    hidden_states = []
    # Placeholders; entries are overwritten by the steps that reach them.
    output_v = [np.random.randn(i) for i in self.n_v]
    output_u = [np.random.randn(i) for i in self.n_u]

    h = tf.zeros(shape=(1, self.n_units,))
    c = tf.zeros(shape=(1, self.n_units,))
    flag = True

    u_cur, v_cur = 0, 0
    for i in range(self.output_length):
        _, h, c = self.lstm(step_input, initial_state=[h, c])
        hidden_states.append(tf.reshape(h, shape=(-1,)))
        f = self.projector_f(h)
        f = tf.reshape(f, shape=(1, 1, -1))
        a = self.give_attention(tf.stack(hidden_states, axis=0))
        a = tf.reshape(a, shape=(1, 1, -1))
        step_input = tf.concat([z, f, a], axis = -1)

        if(flag):
          output_v[i//2] = self.projector_v[v_cur](f)
          v_cur = v_cur + 1
        else:
          output_u[i//2] = self.projector_u[u_cur](f)
          u_cur = u_cur + 1

        flag = not flag

    return (output_u, output_v)

In [None]:
class DiversityLayer(Layer):
    """Adds a batch-level diversity feature to its input, then Dense + LeakyReLU.

    Args:
        hidden_dim (int): Width of both Dense layers. The residual addition in
            ``call`` requires the input to have a compatible width.
        alpha_leaky (float): Negative slope of the LeakyReLU.
    """

    def __init__(self, hidden_dim, alpha_leaky):
      super(DiversityLayer, self).__init__()
      self.diversity_features_projection = Dense(hidden_dim)
      self.layer = Sequential([Dense(hidden_dim), LeakyReLU(alpha=alpha_leaky)])

    def call(self, x):
      """Return ``layer(x + diversity(x))``."""
      output = x + self._diversity(x)
      return self.layer(output)

    def _diversity(self, X):
      """Per-feature L1 distance from each row of ``X`` to every projected row.

      Row ``i`` of the result is ``sum_j |X[i] - projection(X)[j]|``. Computed
      by broadcasting, so it does not need a statically known batch size.
      """
      output = self.diversity_features_projection(X)
      # (n, 1, d) - (1, n, d) -> (n, n, d); reduce over the second axis.
      pairwise = tf.expand_dims(X, axis=1) - tf.expand_dims(output, axis=0)
      return tf.reduce_sum(tf.abs(pairwise), axis=1)

class Discriminator(keras.Model):
  """Dense + LeakyReLU stem, a stack of DiversityLayers, and a sigmoid head.

  Args:
      number_div_layers (int): How many DiversityLayers to stack.
      hidden_dim (int): Width of the stem and of every DiversityLayer.
      diversity_input_dim: Accepted but not used by this class.
      diversity_output_dim: Accepted but not used by this class.
      alpha_leaky (float): Negative slope of the LeakyReLU activations.
  """

  def __init__(self, number_div_layers, hidden_dim, diversity_input_dim, diversity_output_dim, alpha_leaky=0.03):
    super().__init__()

    stem = [Dense(hidden_dim), LeakyReLU(alpha=alpha_leaky)]
    self.layer1 = Sequential(stem)

    diversity_stack = []
    for _ in range(number_div_layers):
      diversity_stack.append(DiversityLayer(hidden_dim, alpha_leaky))
    self.diversity_layers = Sequential(diversity_stack)

    self.output_projector = Dense(1, activation='sigmoid')

  def call(self, x):
    """Return the probability-like score for each input row."""
    hidden = self.layer1(x)
    hidden = self.diversity_layers(hidden)
    return self.output_projector(hidden)
  

  

In [None]:
def generator_loss(discriminator, generator, u_gen, v_gen, u,):
  """Generator objective: fool the discriminator and stay close to ``u``.

  Args:
      discriminator: Callable scoring a flattened ``[u, v]`` batch.
      generator: Not used here; kept for a signature parallel to
          ``discriminator_loss``.
      u_gen, v_gen: Generated tensors; flattened to ``(batch, -1)``.
      u: Real counterpart of ``u_gen``.
          NOTE(review): presumably already shaped like the flattened
          ``u_gen`` — confirm against the caller.

  Returns:
      Scalar tensor ``-mean(log D(fake)) + KL(u || u_gen)``.
  """
  u_gen = tf.reshape(u_gen, shape=(u_gen.shape[0], -1))
  v_gen = tf.reshape(v_gen, shape=(v_gen.shape[0], -1))

  uv_flat = tf.concat([u_gen, v_gen], axis = -1)

  out_dis = discriminator(uv_flat)

  loss = - tf.math.reduce_mean((tf.math.log(out_dis)))
  # KLDivergence is a loss CLASS: instantiate it, then call it on (y_true, y_pred).
  loss +=  tf.losses.KLDivergence()(u, u_gen)

  return loss

In [None]:
def discriminator_loss(discriminator, generator, x, u, v, u_gen, v_gen):
  """Discriminator objective: ``mean(log D(fake)) - mean(log D(real))``.

  ``generator`` and ``x`` are accepted but not used. Every u/v input is
  flattened to ``(batch, -1)`` and the u and v parts are concatenated before
  scoring.
  """
  def _flatten(t):
    # Keep the leading (batch) axis, merge the rest.
    return tf.reshape(t, shape=(t.shape[0], -1))

  # Score the generated pair first, then the real pair.
  fake_pair = tf.concat([_flatten(u_gen), _flatten(v_gen)], axis = -1)
  fake_score = discriminator(fake_pair)
  fake_term = tf.math.reduce_mean((tf.math.log(fake_score)))

  real_pair = tf.concat([_flatten(u), _flatten(v)], axis = -1)
  real_score = discriminator(real_pair)
  real_term = tf.math.reduce_mean(tf.math.log(real_score))

  return fake_term - real_term

In [None]:
# Separate Adam optimizers (learning rate 1e-4) for the two networks.
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

In [None]:
# Build the two networks with hard-coded sizes.
# NOTE(review): the n_u widths presumably have to match the number of active
# mixture modes found by DataTransformer for each column — confirm.
# The Discriminator accepts diversity_input_dim / diversity_output_dim but does not use them.
generator_model = Generator(n_units=8, n_z = 4, output_length=8, n_x =4, n_u = [6, 7, 7, 4], n_v=[1, 1, 1, 1])
discriminator_model = Discriminator(number_div_layers=2, hidden_dim=4, diversity_input_dim=4, diversity_output_dim=2, alpha_leaky=0.03)

In [None]:
def my_concatenation(u_gen, v_gen):
  """Interleave generated blocks into one flat vector: ``v0, u0, v1, u1, ...``.

  Args:
      u_gen, v_gen: Equal-length sequences of tensors.

  Returns:
      1-D tensor holding every block flattened, in interleaved order.
  """
  res = []
  for u_part, v_part in zip(u_gen, v_gen):
    # Append whole 1-D tensors; `extend` would iterate the tensor and add
    # scalars, which tf.concat cannot join with the 1-D pieces.
    res.append(tf.reshape(v_part, shape=(-1,)))
    res.append(tf.reshape(u_part, shape=(-1,)))

  res = tf.concat(res, axis=-1)
  return res

In [None]:
# This step runs eagerly; it is not wrapped in `tf.function`.

def train_step():
    """Run one generator + discriminator update on freshly drawn conditioning input.

    Uses the module-level ``generator_model``, ``discriminator_model`` and their
    optimizers, and reads the real samples from the module-level ``u`` and ``v``.

    NOTE(review): the generator returns lists of tensors while both loss
    functions read ``.shape`` on their arguments, and ``u`` / ``v`` are plain
    lists too — the inputs presumably need to be stacked into tensors of
    matching width before this can train; confirm the intended layout.
    """
    # Conditioning input handed to the generator and to the discriminator loss.
    x = np.random.normal(0.0, 0.1, (32,1,4))

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
      u_gen, v_gen = generator_model(x)

      gen_loss = generator_loss(discriminator_model, generator_model, u_gen, v_gen, u)
      disc_loss = discriminator_loss(discriminator_model, generator_model, x, u, v, u_gen, v_gen)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator_model.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator_model.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator_model.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator_model.trainable_variables))

In [None]:
def train(epochs):
  """Call ``train_step`` once per epoch and print how long each call took."""
  for epoch_number in range(1, epochs + 1):
    started_at = time.time()

    train_step()

    elapsed = time.time() - started_at
    print ('Time for epoch {} is {} sec'.format(epoch_number, elapsed))

In [None]:
# Run five epochs (one training step each).
train(5)

NameError: ignored

In [None]:
# Scratch tensors for the assignment experiment in the next cell.
tmp = tf.random.uniform(shape=(8,8))
x = tf.random.uniform(shape=(1,8))

In [None]:
# TensorFlow tensors are immutable and reject `tmp[0] = x`; build a new tensor
# whose first row is replaced by `x` instead.
tmp = tf.tensor_scatter_nd_update(tmp, indices=[[0]], updates=x)

NameError: ignored

In [None]:
# n_u / n_v must be iterables of widths (one Dense projector is built per entry);
# with output_length=8 the generator indexes four entries of each.
# NOTE(review): the original call passed the scalars 5 and 1 — repeating them
# four times is presumably what was meant; confirm.
gen_model = Generator(n_units=8, n_z=2, output_length=8, n_x=4,
                      n_u=[5, 5, 5, 5], n_v=[1, 1, 1, 1])

TypeError: ignored

In [None]:
# Smoke-test the generator by invoking its `call` method directly on a random input.
x = tf.random.normal(shape=(1,1,4))
y = gen_model.call(x)

In [None]:
# Positional arguments: number_div_layers=2, hidden_dim=4, diversity_input_dim=4, diversity_output_dim=4.
discriminator = Discriminator(2, 4, 4, 4)

In [None]:
# Smoke-test the discriminator on a single random 4-feature row.
y = tf.random.normal(shape=(1,4))
discriminator(y)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.77307785]], dtype=float32)>

# Conditional TGAN

## Class

In [None]:
import numpy as np


class CTGanSampler(object):
    """Row sampler for encoded training data.

    For every discrete column (a 'softmax' block that does not directly follow
    a 'tanh' block) it stores, per category, the indices of the rows in which
    that category's column is non-zero.

    Args:
        data (numpy.ndarray): Encoded training matrix.
        output_info (list of (int, str)): ``(width, activation)`` per block.
    """

    def __init__(self, data, output_info):
        super(CTGanSampler, self).__init__()
        self.data = data
        self.model = []
        self.n = len(data)

        offset = 0
        after_tanh = False
        for item in output_info:
            width, activation = item[0], item[1]
            if activation == 'tanh':
                offset += width
                after_tanh = True
            elif activation == 'softmax':
                if after_tanh:
                    # Mode indicator of a continuous column: skip it.
                    after_tanh = False
                    offset += width
                    continue

                rows_per_category = [
                    np.nonzero(data[:, offset + j])[0] for j in range(width)
                ]
                self.model.append(rows_per_category)
                offset = offset + width
            else:
                assert 0

        assert offset == data.shape[1]

    def sample(self, n, col, opt):
        """Draw rows of the encoded data.

        With ``col is None`` draws ``n`` rows uniformly with replacement.
        Otherwise draws one row per ``(col, opt)`` pair from the rows where
        discrete column ``col`` takes category ``opt``.
        """
        if col is None:
            chosen = np.random.choice(np.arange(self.n), n)
            return self.data[chosen]

        chosen = [np.random.choice(self.model[c][o]) for c, o in zip(col, opt)]
        return self.data[chosen]


In [None]:
import numpy as np

def anderson_evaluation(real, generated):
    """Mean Anderson-Darling-style distance between real and generated columns.

    For each column, every one of the ``n`` smallest generated values is mapped
    to ``(#real values <= it + 1) / (n + 2)`` and the statistic
    ``-n - (1/n) * sum_i (2i - 1) * (log u_i + log(1 - u_{n-i+1}))`` is computed.

    Args:
        real (array-like): Real samples, shape ``(n, num_markets)``.
        generated (array-like): Generated samples with at least ``n`` rows.

    Returns:
        float: The statistic averaged over all columns.

    Raises:
        IndexError: If ``generated`` has fewer rows than ``real``.
    """
    real = np.asarray(real)
    generated = np.asarray(generated)

    n = len(real)
    num_markets = len(real[0])
    if len(generated) < n:
        raise IndexError("generated must have at least as many rows as real")

    # Weights (2i - 1) for i = 1..n; identical for every column.
    weights = 2 * np.arange(1, n + 1) - 1
    w = np.zeros(num_markets)

    for d in range(num_markets):
        sorted_real = np.sort(real[:, d])
        sorted_generated = np.sort(generated[:, d])[:n]

        # Number of real values <= each generated value, via binary search
        # instead of a full scan per value.
        counts = np.searchsorted(sorted_real, sorted_generated, side='right')
        u = (counts + 1) / (n + 2)

        # u[::-1] pairs u_i with u_{n-i+1}.
        sum_logs = np.sum(weights * (np.log(u) + np.log(1 - u[::-1])))

        w[d] = -1 * (n + sum_logs/n)

    return np.mean(w)

def kendall_evaluation(real, generated):
    """Mean absolute gap between the sorted dominance fractions of two samples.

    For row ``i`` of a sample, the dominance fraction is the share of the other
    ``n - 1`` rows that are strictly smaller in every coordinate. The fractions
    of both samples are sorted and compared position by position.

    Args:
        real (numpy.ndarray): Real samples, shape ``(n, d)``.
        generated (numpy.ndarray): Generated samples with ``n`` rows.

    Returns:
        float: Mean of ``|z_real - z_generated|`` over the sorted fractions.
    """
    n = len(real)
    num_dims = len(real[0])  # not used further; the lookup still fails on empty input

    def _dominated_counts(sample, row):
        # Repeat the reference row n-1 times and compare with all other rows.
        reference = np.tile(sample[row], (n - 1, 1))
        others = np.delete(sample, (row), axis=0)
        # "every coordinate smaller" is the same as max(other - reference) < 0.
        largest_gap = (others - reference).max(axis=1)
        return len(largest_gap[largest_gap < 0])

    z_real = np.zeros(n)
    z_generated = np.zeros(n)
    for i in range(n):
        z_real[i] = _dominated_counts(real, i)
        z_generated[i] = _dominated_counts(generated, i)

    z_real = np.sort(z_real / (n - 1))
    z_generated = np.sort(z_generated / (n - 1))

    return np.mean(np.abs(z_real - z_generated))



In [None]:
import torch
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential


class Discriminator(Module):
    """Packed critic: scores groups of ``pack`` rows at once.

    Each hidden width in ``dis_dims`` adds Linear -> LeakyReLU(0.2) ->
    Dropout(0.5); a final Linear maps to a single score.

    Args:
        input_dim (int): Width of one input row.
        dis_dims (iterable of int): Hidden layer widths.
        pack (int): Number of rows concatenated into one critic input.
    """

    def __init__(self, input_dim, dis_dims, pack=10):
        super().__init__()
        self.pack = pack
        self.packdim = input_dim * pack

        stack = []
        width = self.packdim
        for hidden in list(dis_dims):
            stack.extend((Linear(width, hidden), LeakyReLU(0.2), Dropout(0.5)))
            width = hidden
        stack.append(Linear(width, 1))

        self.seq = Sequential(*stack)

    def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
        """Gradient penalty on random interpolations of real and fake rows.

        One mixing coefficient is drawn per group of ``pac`` rows and shared by
        all rows and features of that group. The penalty is
        ``lambda_ * mean((||grad||_2 - 1) ** 2)`` with the norm taken per group.
        """
        n_features = real_data.size(1)
        n_groups = real_data.size(0) // pac

        alpha = torch.rand(n_groups, 1, 1, device=device)
        alpha = alpha.repeat(1, pac, n_features).view(-1, n_features)

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        disc_interpolates = self(interpolates)

        (gradients,) = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones(disc_interpolates.size(), device=device),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )

        group_norms = gradients.view(-1, pac * n_features).norm(2, dim=1)
        return ((group_norms - 1) ** 2).mean() * lambda_

    def forward(self, input):
        """Score ``input``; its row count must be a multiple of ``pack``."""
        n_rows = input.size()[0]
        assert n_rows % self.pack == 0
        packed = input.view(-1, self.packdim)
        return self.seq(packed)


class Residual(Module):
    """Linear -> BatchNorm1d -> LeakyReLU block whose output is concatenated with its input.

    The result therefore has ``o + i`` features, with the new features first.

    Args:
        i (int): Input width.
        o (int): Width of the newly computed features.
    """

    def __init__(self, i, o):
        super().__init__()
        self.fc = Linear(i, o)
        self.bn = BatchNorm1d(o)
        self.relu = LeakyReLU()

    def forward(self, input):
        """Return ``[activation(bn(fc(input))), input]`` joined on the feature axis."""
        transformed = self.relu(self.bn(self.fc(input)))
        return torch.cat([transformed, input], dim=1)


class Generator(Module):
    """Stack of ``Residual`` blocks followed by a Linear output layer.

    Every Residual block appends its new features to its input, so the running
    width grows by each entry of ``gen_dims``.

    Args:
        embedding_dim (int): Width of the generator input.
        gen_dims (iterable of int): New-feature width of each Residual block.
        data_dim (int): Width of the generated row.
    """

    def __init__(self, embedding_dim, gen_dims, data_dim):
        super().__init__()
        blocks = []
        width = embedding_dim
        for growth in list(gen_dims):
            blocks.append(Residual(width, growth))
            width = width + growth
        blocks.append(Linear(width, data_dim))
        self.seq = Sequential(*blocks)

    def forward(self, input):
        """Map ``input`` to raw (pre-activation) generated rows."""
        return self.seq(input)


In [None]:
import numpy as np


class ConditionalGenerator(object):
    """Samples conditional vectors over the discrete columns of encoded data.

    A discrete column is a 'softmax' block that does not directly follow a
    'tanh' block.

    Attributes set by the constructor:
        model: per discrete column, the arg-max category of every row.
        interval: array of ``(offset in the conditional vector, width)`` rows.
        n_col: number of discrete columns.
        n_opt: total number of categories over all discrete columns.
        p: ``(n_col, widest column)`` matrix of category probabilities,
           zero-padded on the right.

    Args:
        data (numpy.ndarray): Encoded training matrix.
        output_info (list of (int, str)): ``(width, activation)`` per block.
        log_frequency (bool): Use ``log(count + 1)`` instead of raw counts.
    """

    def __init__(self, data, output_info, log_frequency):
        self.model = []
        self.interval = []
        self.n_col = 0
        self.n_opt = 0

        distributions = []
        widest = 0
        cursor = 0
        after_tanh = False
        for item in output_info:
            width, activation = item[0], item[1]
            if activation == 'tanh':
                cursor += width
                after_tanh = True
            elif activation == 'softmax':
                if after_tanh:
                    # Mode indicator of a continuous column: not conditioned on.
                    after_tanh = False
                    cursor += width
                    continue

                block = data[:, cursor:cursor + width]
                self.model.append(np.argmax(block, axis=-1))

                frequency = np.sum(block, axis=0)
                if log_frequency:
                    frequency = np.log(frequency + 1)
                distributions.append((width, frequency / np.sum(frequency)))

                self.interval.append((self.n_opt, width))
                self.n_opt += width
                self.n_col += 1
                widest = max(widest, width)
                cursor += width
            else:
                assert 0

        assert cursor == data.shape[1]

        self.p = np.zeros((self.n_col, widest))
        for row, (width, distribution) in enumerate(distributions):
            self.p[row, :width] = distribution

        self.interval = np.asarray(self.interval)

    def random_choice_prob_index(self, idx):
        """Draw one category per entry of ``idx`` from that column's distribution."""
        probabilities = self.p[idx]
        draws = np.expand_dims(np.random.rand(probabilities.shape[0]), axis=1)
        return (probabilities.cumsum(axis=1) > draws).argmax(axis=1)

    def sample(self, batch):
        """Sample ``batch`` conditional vectors.

        Returns:
            ``None`` when there are no discrete columns, otherwise the tuple
            ``(one-hot condition, column mask, column index, category index)``.
        """
        if self.n_col == 0:
            return None

        rows = np.arange(batch)
        idx = np.random.choice(np.arange(self.n_col), batch)

        mask1 = np.zeros((batch, self.n_col), dtype='float32')
        mask1[rows, idx] = 1

        opt1prime = self.random_choice_prob_index(idx)
        opt1 = self.interval[idx, 0] + opt1prime

        vec1 = np.zeros((batch, self.n_opt), dtype='float32')
        vec1[rows, opt1] = 1

        return vec1, mask1, idx, opt1prime

    def sample_zero(self, batch):
        """Sample conditions following the empirical category frequencies.

        For each row a discrete column is picked uniformly, then a category is
        picked by drawing a random training row of that column.
        """
        if self.n_col == 0:
            return None

        vec = np.zeros((batch, self.n_opt), dtype='float32')
        idx = np.random.choice(np.arange(self.n_col), batch)
        for row, col in enumerate(idx):
            pick = int(np.random.choice(self.model[col]))
            vec[row, pick + self.interval[col, 0]] = 1

        return vec


In [None]:
import numpy as np
import torch
from torch import optim
from torch.nn import functional



class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""

    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How many consecutive non-improving calls to tolerate
                            before ``early_stop`` is set.
                            Default: 7
            verbose (bool): Stored on the instance; no message is printed by this class.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` is the spelling available in every NumPy release
        # (the `np.Inf` alias was removed in NumPy 2.0).
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, val_loss):
        """Record a new loss value and update ``counter`` / ``early_stop``.

        The first call only stores the score. Afterwards a call counts as an
        improvement when ``-val_loss >= best_score + delta``; any other call
        increments the counter, and reaching ``patience`` sets ``early_stop``.
        """
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0


class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step. Defaults to 500.
        patience (int):
            Patience handed to the ``EarlyStopping`` helper during ``fit``.
            Defaults to 25.
    """

    def __init__(self, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256),
                 l2scale=1e-6, batch_size=500, patience=25):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.patience = patience
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction='none'
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(self, train_data, discrete_columns=(), epochs=300, log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """

        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = CTGanSampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data,
            self.transformer.output_info,
            log_frequency
        )

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt,
            self.gen_dim,
            data_dim
        ).to(self.device)

        discriminator = Discriminator(
            data_dim + self.cond_generator.n_opt,
            self.dis_dim
        ).to(self.device)

        optimizerG = optim.Adam(
            self.generator.parameters(), lr=4e-7, betas=(0.5, 0.9),
            weight_decay=self.l2scale
        )
        optimizerD = optim.Adam(discriminator.parameters(), lr=4e-7, betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        #mean = torch.normal(0.01, 0.01, size=(self.batch_size, self.embedding_dim))
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + .01

        train_losses = []
        early_stopping = EarlyStopping(patience=self.patience, verbose=False)

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in range(epochs):
            for id_ in range(steps_per_epoch):
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                pen = discriminator.calc_gradient_penalty(real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))
                train_losses.append(loss_d.item())
                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                #fake_copy = fakeact #tf.identity(fake)
                #fake_copy = fake_copy.detach().numpy()
                
                #real_copy = real #tf.identity(real)
                #real_copy = real_copy.detach().numpy()

                #tmp_inv_gen = self.transformer.inverse_transform(fake_copy, None).values
                #tmp_inv_real = self.transformer.inverse_transform(real_copy, None).values

                
                # print(tmp_inv_gen.shape, tmp_inv_real.shape)
                loss_g = -torch.mean(y_fake) +  cross_entropy # +  anderson_evaluation(tmp_inv_real, tmp_inv_gen)
                train_losses.append(loss_g.item())
                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()
            early_stopping(np.average(train_losses))
            if early_stopping.early_stop:
                print("GAN: Early stopping after epochs {}".format(i))
                break
            train_losses = []

            # print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
            #       (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
            #       flush=True)

    def sample(self, n):
        """Draw ``n`` synthetic rows from the trained generator.

        Noise is produced batch by batch (``self.batch_size`` rows at a time),
        optionally concatenated with a condition vector, pushed through the
        generator and the output activations, and finally mapped back through
        ``self.transformer.inverse_transform``.

        Args:
            n (int):
                Number of rows to sample.

        Returns:
            Whatever ``self.transformer.inverse_transform`` returns for the
            first ``n`` generated rows.
        """
        n_batches = n // self.batch_size + 1
        batches = []
        for _ in range(n_batches):
            zeros = torch.zeros(self.batch_size, self.embedding_dim)
            noise = torch.normal(mean=zeros, std=zeros + 0.01).to(self.device)

            cond = self.cond_generator.sample_zero(self.batch_size)
            if cond is not None:
                # append the condition vector to the latent noise
                cond_tensor = torch.from_numpy(cond).to(self.device)
                noise = torch.cat([noise, cond_tensor], dim=1)

            raw_output = self.generator(noise)
            activated = self._apply_activate(raw_output)
            batches.append(activated.detach().cpu().numpy())

        # more rows than requested may have been generated; keep the first n
        generated = np.concatenate(batches, axis=0)[:n]
        return self.transformer.inverse_transform(generated, None)


In [None]:
import gc
from typing import List

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold



def save_dict_to_file(dic: dict, path: str, save_raw=False) -> None:
    """
    Save dict values into txt file
    :param dic: Dict with values
    :param path: Path to .txt file (overwritten if it already exists)
    :param save_raw: If True, write ``str(dic)`` as a single string; otherwise write
        each key immediately followed by its value, with a blank line after each entry
    :return: None
    """

    # The context manager closes the handle even if str() or write() raises.
    with open(path, "w") as f:
        if save_raw:
            f.write(str(dic))
        else:
            for k, v in dic.items():
                f.write(str(k) + str(v) + "\n\n")


def save_exp_to_file(dic: dict, path: str) -> None:
    """
    Append dict values as one tab-separated row to a txt file
    :param dic: Dict with values; keys form the header, values form the row
    :param path: Path to .txt file (created if missing)
    :return: None
    """

    # Keys are stringified like the values so non-string keys do not break the join.
    header = "\t".join(str(key) for key in dic.keys())
    row = "\t".join(str(val) for val in dic.values())

    # The context manager closes the handle even if a write fails.
    with open(path, "a+") as f:
        # Position 0 right after opening in append mode means the file is empty,
        # so the header line has not been written yet.
        if f.tell() == 0:
            f.write(header + "\n")
        f.write(row + "\n")


def cat_cols_info(
    X_train: pd.DataFrame, X_test: pd.DataFrame, cat_cols: List[str]
) -> dict:
    """
    Summarise categorical columns: cardinality in train/test and how much of
    test consists of categories never seen in train
    :param X_train: Train dataframe
    :param X_test: Test dataframe
    :param cat_cols: List of categorical columns
    :return: Dict mapping each column name to a dict of statistics
    """

    def _summarise(col):
        seen_in_train = set(X_train[col])
        # per-row flag: True when the test value does not occur in train
        unseen_flags = X_test[col].apply(lambda value: value not in seen_in_train)
        return {
            "num_uniq_train": X_train[col].nunique(),
            "num_uniq_test": X_test[col].nunique(),
            "number_of_new_test": len(set(X_test[col]) - seen_in_train),
            "fraction_of_new_test": np.mean(unseen_flags),
        }

    return {col: _summarise(col) for col in cat_cols}


def adversarial_test(left_df, right_df, cat_cols):
    """
    Fit a classifier that tries to tell rows of ``left_df`` (label 0) apart
    from rows of ``right_df`` (label 1)
    :param left_df:  dataframe
    :param right_df: dataframe
    :param cat_cols: List of categorical columns
    :return: trained model
    """
    # shuffle copies of both inputs, then cut them to a common length
    left_part = left_df.copy().sample(frac=1).reset_index(drop=True)
    right_part = right_df.copy().sample(frac=1).reset_index(drop=True)
    left_part = left_part.head(right_part.shape[0])
    right_part = right_part.head(left_part.shape[0])

    # "gt" is the origin label the classifier is trained to predict
    stacked = pd.concat([left_part.assign(gt=0), right_part.assign(gt=1)])
    features = stacked.drop("gt", axis=1)
    labels = stacked["gt"]

    booster_params = {
        "metrics": "AUC",
        "max_depth": 2,
        "max_bin": 100,
        "n_estimators": 500,
        "learning_rate": 0.02,
        "random_state": 42,
    }
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    lgb_model = Model(
        cat_validation="Single",
        encoders_names=("OrdinalEncoder",),
        cat_cols=cat_cols,
        model_validation=folds,
        model_params=booster_params,
    )
    train_score, val_score, _ = lgb_model.fit(features, labels)

    print(
        "ROC AUC adversarial: train %.2f%% val %.2f%%"
        % (train_score * 100.0, val_score * 100.0)
    )
    return lgb_model


def extend_gan_train(x_train, y_train, x_test, cat_cols, gen_x_times=1.2, epochs=300):
    """
    Extends train by generating new data by GAN
    :param x_train:  train dataframe (left unmodified)
    :param y_train: target for train dataframe
    :param x_test: dataframe
    :param cat_cols: List of categorical columns
    :param gen_x_times: Factor for which initial dataframe should be increased
    :param epochs: Number of epoch max to train the GAN
    :return: extended train with target
    :raises ValueError: if gen_x_times is 0
    """

    if gen_x_times == 0:
        raise ValueError("Passed gen_x_times with value 0!")
    # Work on a copy: attaching the target in place would leave a "target"
    # column in the caller's dataframe after this function returns.
    x_train = x_train.copy()
    x_train["target"] = y_train
    x_test_bigger = int(1.1 * x_test.shape[0] / x_train.shape[0])
    ctgan = _CTGANSynthesizer()
    ctgan.fit(x_train, cat_cols, epochs=epochs)
    generated_df = ctgan.sample((x_test_bigger) * x_train.shape[0])
    data_dtype = x_train.dtypes.values

    # cast generated columns back to the dtypes of the training frame (by position)
    for i in range(len(generated_df.columns)):
        generated_df[generated_df.columns[i]] = generated_df[
            generated_df.columns[i]
        ].astype(data_dtype[i])

    # candidate pool = original train rows + generated rows
    generated_df = pd.concat(
        [
            x_train,
            generated_df,
        ]
    ).reset_index(drop=True)

    # columns whose name contains "num" are treated as numerical
    num_cols = [col for col in x_train.columns if "num" in col]

    # keep only candidates inside the 2%-98% quantile range of test
    for num_col in num_cols:
        min_val = x_test[num_col].quantile(0.02)
        max_val = x_test[num_col].quantile(0.98)
        generated_df = generated_df.loc[
            (generated_df[num_col] >= min_val) & (generated_df[num_col] <= max_val)
        ]
    generated_df = generated_df.reset_index(drop=True)
    ad_model = adversarial_test(x_test, generated_df.drop("target", axis=1), cat_cols)

    # rank candidates by the adversarial model's score and keep the top ones
    generated_df["test_similarity"] = ad_model.predict(
        generated_df.drop("target", axis=1), return_shape=False
    )
    generated_df.sort_values("test_similarity", ascending=False, inplace=True)
    generated_df = generated_df.head(int(gen_x_times * x_train.shape[0]))
    x_train = pd.concat(
        [x_train, generated_df.drop("test_similarity", axis=1)], axis=0
    ).reset_index(drop=True)
    del generated_df
    gc.collect()
    return x_train.drop("target", axis=1), x_train["target"]


def extend_from_original(x_train, y_train, x_test, cat_cols, gen_x_times=1.2):
    """
    Extends train by resampling (with replacement) rows of the original train data
    :param x_train:  train dataframe (left unmodified)
    :param y_train: target for train dataframe
    :param x_test: test dataframe
    :param cat_cols: List of categorical columns
    :param gen_x_times: Factor for which initial dataframe should be increased
    :return: extended train with target
    :raises ValueError: if gen_x_times is 0
    """
    if gen_x_times == 0:
        raise ValueError("Passed gen_x_times with value 0!")
    # Work on a copy: attaching the target in place would leave a "target"
    # column in the caller's dataframe after this function returns.
    x_train = x_train.copy()
    x_train["target"] = y_train
    x_test_bigger = int(gen_x_times * x_test.shape[0] / x_train.shape[0])
    generated_df = x_train.sample(frac=x_test_bigger, replace=True, random_state=42)

    # columns whose name contains "num" are treated as numerical
    num_cols = [col for col in x_train.columns if "num" in col]

    # keep only candidates inside the 2%-98% quantile range of test
    for num_col in num_cols:
        min_val = x_test[num_col].quantile(0.02)
        max_val = x_test[num_col].quantile(0.98)
        generated_df = generated_df.loc[
            (generated_df[num_col] >= min_val) & (generated_df[num_col] <= max_val)
        ]

    generated_df = generated_df.reset_index(drop=True)
    ad_model = adversarial_test(x_test, generated_df.drop("target", axis=1), cat_cols)

    # rank candidates by the adversarial model's score and keep the top ones
    generated_df["test_similarity"] = ad_model.predict(
        generated_df.drop("target", axis=1), return_shape=False
    )
    generated_df.sort_values("test_similarity", ascending=False, inplace=True)
    generated_df = generated_df.head(int(gen_x_times * x_train.shape[0]))
    x_train = pd.concat(
        [x_train, generated_df.drop("test_similarity", axis=1)], axis=0
    ).reset_index(drop=True)
    del generated_df
    gc.collect()
    return x_train.drop("target", axis=1), x_train["target"]


In [None]:
import gc
import logging
from abc import ABC, abstractmethod
from typing import Tuple

import pandas as pd

__author__ = "Insaf Ashrapov"
__copyright__ = "Insaf Ashrapov"
__license__ = "Apache 2.0"


class SampleData(ABC):
    """
        Factory method for different sampler strategies. The goal is to generate more train data
        which should be more close to test, in other word we trying to fix uneven distribution.
    """

    @abstractmethod
    def get_object_generator(self):
        """
        Getter for object sampler aka generator, which is not a generator
        """
        raise NotImplementedError

    def generate_data_pipe(
        self,
        train_df: pd.DataFrame,
        target: pd.DataFrame,
        test_df: pd.DataFrame,
        deep_copy: bool = True,
        only_adversarial: bool = False,
        use_adversarial: bool = True,
        only_generated_data: bool = False,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Defines logic for sampling
        @param train_df: Train dataframe which has separate target
        @param target: Input target for the train dataset
        @param test_df: Test dataframe - newly generated train dataframe should be close to it
        @param deep_copy: make copy of input files or not. If not input dataframes will be overridden
        @param only_adversarial: only adversarial fitering to train dataframe will be performed
        @param use_adversarial: perform or not adversarial filtering
        @param only_generated_data: After generation get only newly generated, without concating input train dataframe.
        Only works for SamplerGAN.
        @return: Newly generated train dataframe and its target
        """
        generator = self.get_object_generator()
        if deep_copy:
            logging.info("Preprocessing input data with deep copying input data.")
            if target is None or test_df is None:
                # without a target or test set only the train frame is preprocessed
                new_train = generator.preprocess_data_df(train_df.copy())
                new_target = None
            else:
                new_train, new_target, test_df = generator.preprocess_data(
                    train_df.copy(), target.copy(), test_df
                )
        else:
            logging.info("Preprocessing input data without deep copying input data.")
            new_train, new_target, test_df = generator.preprocess_data(
                train_df, target, test_df
            )
        if only_adversarial and use_adversarial:
            # skip generation entirely and filter the preprocessed train data
            logging.info("Applying adversarial filtering")
            return generator.adversarial_filtering(new_train, new_target, test_df)
        else:
            logging.info("Starting generation step.")
            new_train, new_target = generator.generate_data(
                new_train, new_target, test_df, only_generated_data
            )
            logging.info("Starting postprocessing step.")
            new_train, new_target = generator.postprocess_data(
                new_train, new_target, test_df
            )
            if use_adversarial:
                logging.info("Applying adversarial filtering")
                new_train, new_target = generator.adversarial_filtering(
                    new_train, new_target, test_df
                )
            gc.collect()

            logging.info("Total finishing, returning data")
            return new_train, new_target


class Sampler(ABC):
    """
        Interface for each sampling strategy
    """

    def get_generated_shape(self, input_df):
        """
        Calculates final output shape

        Reads ``self.gen_x_times``, which concrete samplers are expected to set.
        Returns ``int(gen_x_times * number_of_rows)`` and raises ValueError when
        ``gen_x_times`` is not positive.
        """
        if self.gen_x_times <= 0:
            raise ValueError(
                "Passed gen_x_times = {} should be bigger than 0".format(
                    self.gen_x_times
                )
            )
        return int(self.gen_x_times * input_df.shape[0])

    @abstractmethod
    def preprocess_data(self, train, target, test_df):
        """Before we can start data generation we might need some preprocessing, numpy to pandas
        and etc"""
        raise NotImplementedError

    @abstractmethod
    def generate_data(self, train_df, target, test_df, only_generated_data=False):
        """Generate new train data.

        ``only_generated_data`` is part of the interface because the sampling
        pipeline passes it as a fourth positional argument.
        """
        raise NotImplementedError

    @abstractmethod
    def postprocess_data(self, train_df, target, test_df):
        """Filtering data which far beyond from test_df data distribution"""
        raise NotImplementedError

    @abstractmethod
    def adversarial_filtering(self, train_df, target, test_df):
        """Filter train data with an adversarial model; implemented by subclasses."""
        raise NotImplementedError


In [None]:
import logging
import sys


def setup_logging(loglevel):
    """Configure root logging to write timestamped records to stdout.

    Args:
      loglevel (int): minimum loglevel for emitting messages
    """
    basic_config_kwargs = {
        "level": loglevel,
        "stream": sys.stdout,
        "format": "[%(asctime)s] %(levelname)s:%(name)s:%(message)s",
        "datefmt": "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**basic_config_kwargs)


# Name of a temporary target column.
# NOTE(review): presumably used by the samplers to attach the target to a
# feature frame while generating data - usage is not visible here; confirm.
TEMP_TARGET = "_temp_target"


In [None]:
%%capture
!pip install category_encoders

In [None]:

from typing import List

import numpy as np
import pandas as pd
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.woe import WOEEncoder
from sklearn.model_selection import RepeatedStratifiedKFold


def get_single_encoder(encoder_name: str, cat_cols: list):
    """
    Get encoder by its name
    :param encoder_name: Name of desired encoder
    :param cat_cols: Cat columns for encoding
    :return: Categorical encoder
    :raises NotImplementedError: if encoder_name is not one of the supported names
    """
    # Start from None so an unknown name reaches the explicit NotImplementedError
    # below instead of failing on an unbound local variable.
    encoder = None

    if encoder_name == "FrequencyEncoder":
        encoder = FrequencyEncoder(cols=cat_cols)
    elif encoder_name == "WOEEncoder":
        encoder = WOEEncoder(cols=cat_cols)
    elif encoder_name == "TargetEncoder":
        encoder = TargetEncoder(cols=cat_cols)
    elif encoder_name == "SumEncoder":
        encoder = SumEncoder(cols=cat_cols)
    elif encoder_name == "MEstimateEncoder":
        encoder = MEstimateEncoder(cols=cat_cols)
    elif encoder_name == "LeaveOneOutEncoder":
        encoder = LeaveOneOutEncoder(cols=cat_cols)
    elif encoder_name == "HelmertEncoder":
        encoder = HelmertEncoder(cols=cat_cols)
    elif encoder_name == "BackwardDifferenceEncoder":
        encoder = BackwardDifferenceEncoder(cols=cat_cols)
    elif encoder_name == "JamesSteinEncoder":
        encoder = JamesSteinEncoder(cols=cat_cols)
    elif encoder_name == "OrdinalEncoder":
        encoder = OrdinalEncoder(cols=cat_cols)
    elif encoder_name == "CatBoostEncoder":
        encoder = CatBoostEncoder(cols=cat_cols)
    elif encoder_name == "OneHotEncoder":
        encoder = OneHotEncoder(cols=cat_cols)

    if encoder is None:
        raise NotImplementedError("To be implemented")
    return encoder


class DoubleValidationEncoderNumerical:
    """
    Encoder with validation within

    For every configured encoder name, one encoder is fitted per cross-validation
    split; training rows are encoded out-of-fold and new data is encoded with the
    average of all fitted encoders.
    """

    def __init__(self, cols, encoders_names_tuple=()):
        """
        :param cols: Categorical columns
        :param encoders_names_tuple: Tuple of str with encoders
        """
        self.cols, self.num_cols = cols, None
        self.encoders_names_tuple = encoders_names_tuple

        self.n_folds, self.n_repeats = 5, 3
        self.model_validation = RepeatedStratifiedKFold(
            n_splits=self.n_folds, n_repeats=self.n_repeats, random_state=0
        )
        self.encoders_dict = {}

        self.storage = None

    @staticmethod
    def _named_frame(values, encoder_name, index):
        """Wrap encoded values in a frame aligned to ``index`` with encoded_* column names."""
        frame = pd.DataFrame(values, index=index)
        frame.columns = [
            f"encoded_{encoder_name}_{i}" for i in range(frame.shape[1])
        ]
        return frame

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        """
        Fit one encoder per split and return X with categorical columns replaced
        by their out-of-fold encodings.
        :param X: Input dataframe containing ``self.cols``
        :param y: Target aligned positionally with X
        :return: New dataframe without ``self.cols`` and with encoded_* columns appended
        """
        self.num_cols = [col for col in X.columns if col not in self.cols]
        self.storage = []
        # Start from a clean slate so a repeated fit neither fails nor mixes in
        # encoders from a previous fit (transform averages over all stored encoders).
        self.encoders_dict = {}

        for encoder_name in self.encoders_names_tuple:
            fold_encoders = []
            cols_representation = None

            for train_idx, val_idx in self.model_validation.split(X, y):
                encoder = get_single_encoder(encoder_name, self.cols)

                # split indices are positional, hence iloc rather than loc
                X_train = X.iloc[train_idx].reset_index(drop=True)
                X_val = X.iloc[val_idx].reset_index(drop=True)
                y_train = y.iloc[train_idx] if hasattr(y, "iloc") else y[train_idx]
                encoder.fit_transform(X_train, y_train)

                # transform validation part and get all necessary cols
                val_t = encoder.transform(X_val)
                val_t = val_t[
                    [col for col in val_t.columns if col not in self.num_cols]
                ].values

                if cols_representation is None:
                    cols_representation = np.zeros((X.shape[0], val_t.shape[1]))
                fold_encoders.append(encoder)

                # each row is in the validation part once per repeat, so dividing
                # by n_repeats yields the mean over repeats
                cols_representation[val_idx, :] += val_t / self.n_repeats

            self.encoders_dict[encoder_name] = fold_encoders
            self.storage.append(
                self._named_frame(cols_representation, encoder_name, X.index)
            )

        X = pd.concat([X] + self.storage, axis=1)
        return X.drop(self.cols, axis=1)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Encode new data with the average of all encoders fitted in fit_transform.
        :param X: Input dataframe containing ``self.cols``
        :return: New dataframe without ``self.cols`` and with encoded_* columns appended
        """
        self.storage = []
        for encoder_name in self.encoders_names_tuple:
            cols_representation = None

            for encoder in self.encoders_dict[encoder_name]:
                test_tr = encoder.transform(X)
                test_tr = test_tr[
                    [col for col in test_tr.columns if col not in self.num_cols]
                ].values

                if cols_representation is None:
                    cols_representation = np.zeros(test_tr.shape)

                cols_representation = (
                    cols_representation + test_tr / self.n_folds / self.n_repeats
                )

            self.storage.append(
                self._named_frame(cols_representation, encoder_name, X.index)
            )

        X = pd.concat([X] + self.storage, axis=1)
        return X.drop(self.cols, axis=1)


class MultipleEncoder:
    """
    Multiple encoder for categorical columns
    """

    def __init__(self, cols: List[str], encoders_names_tuple=()):
        """
        :param cols: List of categorical columns
        :param encoders_names_tuple: Tuple of categorical encoders names. Possible values in tuple are:
        "FrequencyEncoder", "WOEEncoder", "TargetEncoder", "SumEncoder", "MEstimateEncoder", "LeaveOneOutEncoder",
        "HelmertEncoder", "BackwardDifferenceEncoder", "JamesSteinEncoder", "OrdinalEncoder", "CatBoostEncoder"
        """

        self.cols = cols
        self.num_cols = None
        self.encoders_names_tuple = encoders_names_tuple
        self.encoders_dict = {}

        # list for storing results of transformation from each encoder
        self.storage = None

    def _encoded_frame(self, encoded, encoder_name, index):
        """Keep only the encoded categorical columns, renamed and aligned to ``index``."""
        keep = [col for col in encoded.columns if col not in self.num_cols]
        # Reuse the input index: a fresh RangeIndex would make the axis=1 concat
        # misalign rows whenever X does not carry the default 0..n-1 index.
        frame = pd.DataFrame(encoded[keep].values, index=index)
        frame.columns = [
            f"encoded_{encoder_name}_{i}" for i in range(frame.shape[1])
        ]
        return frame

    def fit_transform(self, X: pd.DataFrame, y: np.array) -> pd.DataFrame:
        """
        Fit every configured encoder on (X, y) and return X with the categorical
        columns replaced by the encoders' outputs.
        :param X: Input dataframe containing ``self.cols``
        :param y: Target for X
        :return: New dataframe without ``self.cols`` and with encoded_* columns appended
        """
        self.num_cols = [col for col in X.columns if col not in self.cols]
        self.storage = []
        for encoder_name in self.encoders_names_tuple:
            encoder = get_single_encoder(encoder_name=encoder_name, cat_cols=self.cols)

            encoded = encoder.fit_transform(X, y)
            self.encoders_dict[encoder_name] = encoder
            self.storage.append(self._encoded_frame(encoded, encoder_name, X.index))

        # concat cat cols representations with initial dataframe
        X = pd.concat([X] + self.storage, axis=1)

        # remove all columns as far as we have their representations
        return X.drop(self.cols, axis=1)

    def transform(self, X) -> pd.DataFrame:
        """
        Encode X with the encoders fitted in fit_transform.
        :param X: Input dataframe containing ``self.cols``
        :return: New dataframe without ``self.cols`` and with encoded_* columns appended
        """
        self.storage = []
        for encoder_name in self.encoders_names_tuple:
            # get representation of cat columns and form a pd.DataFrame for it
            encoded = self.encoders_dict[encoder_name].transform(X)
            self.storage.append(self._encoded_frame(encoded, encoder_name, X.index))

        # concat cat cols representations with initial dataframe
        X = pd.concat([X] + self.storage, axis=1)

        # remove all columns as far as we have their representations
        return X.drop(self.cols, axis=1)


class FrequencyEncoder:
    """
    Replaces each categorical value with the number of times it occurs.

    Values seen during fit are mapped to their count in the fitted data; values
    that appear only in the data being transformed are mapped to their count in
    that data.
    """

    def __init__(self, cols):
        """
        :param cols: Categorical columns to encode
        """
        self.cols = cols
        self.counts_dict = None

    def fit(self, X: pd.DataFrame, y=None):
        """
        Count value occurrences for every configured column.
        :param X: Dataframe containing ``self.cols``
        :param y: Ignored; accepted so the encoder can be called like the other encoders
        :return: self
        """
        counts_dict = {}
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict[col] = dict(zip(values, counts))
        self.counts_dict = counts_dict
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace values in ``self.cols`` with their counts.
        :param X: Dataframe containing ``self.cols``
        :return: Encoded copy of X; the input dataframe is not modified
        """
        # Encode a copy so the caller's dataframe keeps its original categories.
        X = X.copy()
        if len(self.cols) == 0:
            return X

        counts_dict_test = {}
        res = []
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict_test[col] = dict(zip(values, counts))

            # if value is in "train" keys - replace "test" counts with "train" counts
            for k in [
                key
                for key in counts_dict_test[col].keys()
                if key in self.counts_dict[col].keys()
            ]:
                counts_dict_test[col][k] = self.counts_dict[col][k]

            res.append(X[col].map(counts_dict_test[col]).values.reshape(-1, 1))
        res = np.hstack(res)

        X[self.cols] = res
        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """
        Fit on X and return its encoded copy.
        :param X: Dataframe containing ``self.cols``
        :param y: Ignored
        :return: Encoded copy of X
        """
        self.fit(X, y)
        return self.transform(X)


# Demo: encode the same toy frame with three encoders and print each result.
if __name__ == "__main__":
    # toy data: one categorical column and a binary target
    df = pd.DataFrame({})
    df["cat_col"] = [1, 2, 3, 1, 2, 3, 1, 1, 1]
    df["target"] = [0, 1, 0, 1, 0, 1, 0, 1, 0]

    # 1) plain CatBoostEncoder from category_encoders; the frame passed as X
    #    still contains the "target" column
    temp = df.copy()
    enc = CatBoostEncoder(cols=["cat_col"])
    print(enc.fit_transform(temp, temp["target"]))

    # 2) the same encoder wrapped in MultipleEncoder
    temp = df.copy()
    enc = MultipleEncoder(cols=["cat_col"], encoders_names_tuple=("CatBoostEncoder",))
    print(enc.fit_transform(temp, temp["target"]))

    # 3) the same encoder fitted per cross-validation split by
    #    DoubleValidationEncoderNumerical
    temp = df.copy()
    enc = DoubleValidationEncoderNumerical(
        cols=["cat_col"], encoders_names_tuple=("CatBoostEncoder",)
    )
    print(enc.fit_transform(temp, temp["target"]))


    cat_col  target
0  0.444444       0
1  0.444444       1
2  0.444444       0
3  0.222222       1
4  0.722222       0
5  0.222222       1
6  0.481481       0
7  0.361111       1
8  0.488889       0
   target  encoded_CatBoostEncoder_0
0       0                   0.444444
1       1                   0.444444
2       0                   0.444444
3       1                   0.222222
4       0                   0.722222
5       1                   0.222222
6       0                   0.481481
7       1                   0.361111
8       0                   0.488889




   target  encoded_CatBoostEncoder_0
0       0                   0.447619
1       1                   0.428571
2       0                   0.428571
3       1                   0.357143
4       0                   0.452381
5       1                   0.428571
6       0                   0.357143
7       1                   0.357143
8       0                   0.404762


In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold



class AdversarialModel:
    def __init__(
            self,
            cat_validation="Single",
            encoders_names=("OrdinalEncoder",),
            cat_cols=None,
            model_validation=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            model_params=None,
    ):
        '''
        Holds the configuration that ``adversarial_test`` forwards to ``Model``.

        Args:
            cat_validation: categorical type of validation, examples: "None", "Single" and "Double"
            encoders_names: names of categorical encoders, example CatBoostEncoder
            cat_cols: list of categorical columns
            model_validation: cross validation splitter from sklearn.model_selection,
            example StratifiedKFold(5)
            model_params: model training hyperparameters
        '''
        self.cat_validation, self.encoders_names = cat_validation, encoders_names
        self.cat_cols = cat_cols
        self.model_validation, self.model_params = model_validation, model_params

    def adversarial_test(self, left_df, right_df):
        """
        Fit a classifier that tries to tell rows of ``left_df`` (label 0) apart
        from rows of ``right_df`` (label 1). The fitted model is stored in
        ``self.trained_model`` and its scores in ``self.metrics``.
        :param left_df:  dataframe
        :param right_df: dataframe
        :return: None
        """
        # shuffle copies of both inputs, then cut them to a common length
        left_part = left_df.copy().sample(frac=1).reset_index(drop=True)
        right_part = right_df.copy().sample(frac=1).reset_index(drop=True)
        left_part = left_part.head(right_part.shape[0])
        right_part = right_part.head(left_part.shape[0])

        # "gt" is the origin label the classifier is trained to predict
        stacked = pd.concat([left_part.assign(gt=0), right_part.assign(gt=1)])

        lgb_model = Model(
            cat_validation=self.cat_validation,
            encoders_names=self.encoders_names,
            cat_cols=self.cat_cols,
            model_validation=self.model_validation,
            model_params=self.model_params,
        )
        train_score, val_score, avg_num_trees = lgb_model.fit(
            stacked.drop("gt", axis=1), stacked["gt"]
        )
        self.metrics = dict(
            train_score=train_score,
            val_score=val_score,
            avg_num_trees=avg_num_trees,
        )
        self.trained_model = lgb_model


class Model:
    """
    K-fold LightGBM classifier with optional categorical encoding.

    One ``LGBMClassifier`` is trained per fold produced by ``model_validation``.
    The fitted models, the encoders used for each fold, the per-fold ROC AUC
    scores and the best iterations are kept on the instance so that
    ``predict`` can blend the fold models afterwards.
    """

    # Values of ``cat_validation`` for which ``fit`` builds an encoder.
    _SUPPORTED_CAT_VALIDATION = ("None", "Single", "Double")

    def __init__(
            self,
            cat_validation="None",
            encoders_names=None,
            cat_cols=None,
            model_validation=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
            model_params=None,
    ):
        '''
        Class for fit predicting tabular models, mostly - boostings. Several encoders for categorical features are supported

        Args:
            cat_validation: categorical type of validation, examples: "None", "Single" and "Double"
            encoders_names: different categorical encoders from category_encoders library, example CatBoostEncoder
            cat_cols: list of categorical columns
            model_validation: model training cross validation type from sklearn.model_selection, example StratifiedKFold(5)
            model_params: model training hyperparameters; when None a default
                set (AUC metric, 5000 estimators, learning rate 0.04, seed 42) is used
        '''
        self.cat_validation = cat_validation
        self.encoders_names = encoders_names
        self.cat_cols = cat_cols
        self.model_validation = model_validation

        if model_params is None:
            self.model_params = {
                "metrics": "AUC",
                "n_estimators": 5000,
                "learning_rate": 0.04,
                "random_state": 42,
            }
        else:
            self.model_params = model_params

        self.encoders_list = []
        self.models_list = []
        self.scores_list_train = []
        self.scores_list_val = []
        self.models_trees = []

    def fit(self, X: pd.DataFrame, y: pd.Series) -> tuple:
        """
        Fits one model per cross-validation fold with the params given in init.

        Args:
            X: Input training dataframe
            y: Target for X; must support ``.iloc`` (e.g. a pandas Series)

        Returns:
            mean_score_train, mean_score_val, avg_num_trees

        Raises:
            ValueError: if ``cat_cols`` is set and ``cat_validation`` is not one
                of "None", "Single" or "Double".
        """
        # Fail early with a clear message; previously an unsupported mode only
        # surfaced as an unbound ``encoder`` variable inside the fold loop.
        if (
                self.cat_cols is not None
                and self.cat_validation not in self._SUPPORTED_CAT_VALIDATION
        ):
            raise ValueError(
                "Unsupported cat_validation {!r}, expected one of {}".format(
                    self.cat_validation, self._SUPPORTED_CAT_VALIDATION
                )
            )

        # Start from a clean state so that a second call to fit() does not mix
        # models, encoders and scores of different runs.
        self.encoders_list = []
        self.models_list = []
        self.scores_list_train = []
        self.scores_list_val = []
        self.models_trees = []

        # process cat cols: with "None" the encoder is fitted once on the whole frame
        if self.cat_validation == "None":
            encoder = MultipleEncoder(
                cols=self.cat_cols, encoders_names_tuple=self.encoders_names
            )
            X = encoder.fit_transform(X, y)

        for train_idx, val_idx in self.model_validation.split(X, y):
            X_train, X_val = (
                X.iloc[train_idx].reset_index(drop=True),
                X.iloc[val_idx].reset_index(drop=True),
            )
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            if self.cat_cols is not None:
                # "Single"/"Double" fit a fresh encoder on the training part of each fold
                if self.cat_validation == "Single":
                    encoder = MultipleEncoder(
                        cols=self.cat_cols, encoders_names_tuple=self.encoders_names
                    )
                    X_train = encoder.fit_transform(X_train, y_train)
                    X_val = encoder.transform(X_val)
                if self.cat_validation == "Double":
                    encoder = DoubleValidationEncoderNumerical(
                        cols=self.cat_cols, encoders_names_tuple=self.encoders_names
                    )
                    X_train = encoder.fit_transform(X_train, y_train)
                    X_val = encoder.transform(X_val)
                self.encoders_list.append(encoder)

                # check for OrdinalEncoder encoding
                for col in [col for col in X_train.columns if "OrdinalEncoder" in col]:
                    X_train[col] = X_train[col].astype("category")
                    X_val[col] = X_val[col].astype("category")

            # fit model
            # NOTE(review): passing early_stopping_rounds/verbose to fit() depends on
            # the installed LightGBM version - confirm against the pinned release.
            model = LGBMClassifier(**self.model_params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                early_stopping_rounds=50,
                verbose=False,
            )
            self.models_trees.append(model.best_iteration_)
            self.models_list.append(model)

            y_hat = model.predict_proba(X_train)[:, 1]
            score_train = roc_auc_score(y_train, y_hat)
            self.scores_list_train.append(score_train)
            y_hat = model.predict_proba(X_val)[:, 1]
            score_val = roc_auc_score(y_val, y_hat)
            self.scores_list_val.append(score_val)

        mean_score_train = np.mean(self.scores_list_train)
        mean_score_val = np.mean(self.scores_list_val)
        avg_num_trees = int(np.mean(self.models_trees))

        return mean_score_train, mean_score_val, avg_num_trees

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Making inference with trained models for input dataframe

        Every fold model scores the frame, its class-1 probabilities are turned
        into ranks with ``rankdata`` and the ranks of all models are summed.

        Args:
            X: input dataframe for inference

        Returns: Predicted ranks (sum over the fold models), one value per row of X

        """
        y_hat = np.zeros(X.shape[0])
        if self.encoders_list:
            # each model is paired with the encoder that was stored for its fold
            for encoder, model in zip(self.encoders_list, self.models_list):
                X_test = encoder.transform(X.copy())

                # check for OrdinalEncoder encoding
                for col in [col for col in X_test.columns if "OrdinalEncoder" in col]:
                    X_test[col] = X_test[col].astype("category")

                y_hat += rankdata(model.predict_proba(X_test)[:, 1])
        else:
            # no encoders were stored, score the frame as it is
            for model in self.models_list:
                y_hat += rankdata(model.predict_proba(X.copy())[:, 1])
        return y_hat


In [None]:
# -*- coding: utf-8 -*-
"""
todo write description
"""

import gc
import logging
import warnings
from typing import Tuple

import numpy as np
import pandas as pd



warnings.filterwarnings("ignore", category=FutureWarning)

__author__ = "Insaf Ashrapov"
__copyright__ = "Insaf Ashrapov"
__license__ = "Apache 2.0"

__all__ = ["OriginalGenerator", "GANGenerator"]


class OriginalGenerator(SampleData):
    """Creator that hands out ``SamplerOriginal`` objects built from the stored arguments."""

    def __init__(self, *args, **kwargs):
        # Remember the constructor arguments; they are forwarded unchanged later.
        self.args, self.kwargs = args, kwargs

    def get_object_generator(self) -> Sampler:
        """Build a ``SamplerOriginal`` from the positional and keyword arguments given at init."""
        sampler = SamplerOriginal(*self.args, **self.kwargs)
        return sampler


class GANGenerator(SampleData):
    """Creator that hands out ``SamplerGAN`` objects built from the stored arguments."""

    def __init__(self, *args, **kwargs):
        # Remember the constructor arguments; they are forwarded unchanged later.
        self.args, self.kwargs = args, kwargs

    def get_object_generator(self) -> Sampler:
        """Build a ``SamplerGAN`` from the positional and keyword arguments given at init."""
        sampler = SamplerGAN(*self.args, **self.kwargs)
        return sampler


class SamplerOriginal(Sampler):
    """
    Sampler that "generates" data by bootstrapping rows of the train frame and
    then optionally filters the result by test-set quantiles/categories and by
    an adversarial model.
    """

    def __init__(
        self,
        gen_x_times: float = 1.1,
        cat_cols: list = None,
        bot_filter_quantile: float = 0.001,
        top_filter_quantile: float = 0.999,
        is_post_process: bool = True,
        adversaial_model_params: dict = {
            "metrics": "AUC",
            "max_depth": 2,
            "max_bin": 100,
            "n_estimators": 500,
            "learning_rate": 0.02,
            "random_state": 42,
        },
        pregeneration_frac: float = 2,
        only_generated_data: bool = False,
        gan_params: dict = {'batch_size': 500, 'patience': 25, "epochs" : 500,}
    ):
        """

        @param gen_x_times: float = 1.1 - how much data to generate, output might be less because of postprocessing and
        adversarial filtering
        @param cat_cols: list = None - categorical columns
        @param bot_filter_quantile: float = 0.001 - bottom quantile for postprocess filtering
        @param top_filter_quantile: float = 0.999 - top quantile for postprocess filtering
        @param is_post_process: bool = True - perform or not postfiltering, if false bot_filter_quantile
         and top_filter_quantile ignored
        @param adversaial_model_params: dict params for adversarial filtering model, default values for binary task
        (the keyword keeps its historical spelling; the value is stored as self.adversarial_model_params)
        @param pregeneration_frac: float = 2 - for generation step gen_x_times * pregeneration_frac amount of data
        will generated. However in postprocessing (1 + gen_x_times) % of original data will be returned
        @param only_generated_data: bool = False If True after generation get only newly generated, without concating input train dataframe.
        Only works for SamplerGAN.
        @param gan_params: dict params for GAN training
        """
        self.gen_x_times = gen_x_times
        self.cat_cols = cat_cols
        self.is_post_process = is_post_process
        self.bot_filter_quantile = bot_filter_quantile
        self.top_filter_quantile = top_filter_quantile
        # The defaults are dict literals shared by every call; store a copy so
        # that changing one sampler's params cannot leak into other instances.
        self.adversarial_model_params = (
            dict(adversaial_model_params)
            if adversaial_model_params is not None
            else None
        )
        self.pregeneration_frac = pregeneration_frac
        self.only_generated_data = only_generated_data
        self.gan_params = dict(gan_params) if gan_params is not None else None
        self.TEMP_TARGET = "TEMP_TARGET"

    def preprocess_data_df(self, df) -> pd.DataFrame:
        """
        Checks that ``df`` is a pandas DataFrame, logs its shape and returns it unchanged.

        Raises:
            ValueError: if ``df`` is not a pandas DataFrame.
        """
        # Check the type first: reading .shape on an arbitrary object would
        # raise AttributeError before the intended ValueError.
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                "Input dataframe aren't pandas dataframes: df is {}".format(type(df))
            )
        logging.info("Input shape: {}".format(df.shape))
        return df

    def preprocess_data(
        self, train, target, test_df
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Validates the three inputs and remembers the name of the target column.

        ``target`` and ``test_df`` may be None, in which case they are passed
        through untouched; the rest of the class already treats None as
        "no target" / "no test frame".

        Returns:
            train, target, test_df

        Raises:
            ValueError: if an input is not a DataFrame or ``train`` already
                contains the target column or a ``test_similarity`` column.
        """
        train = self.preprocess_data_df(train)
        if target is not None:
            target = self.preprocess_data_df(target)
            # the first column of the target frame gives the temporary target name
            self.TEMP_TARGET = target.columns[0]
        if test_df is not None:
            test_df = self.preprocess_data_df(test_df)
        if self.TEMP_TARGET in train.columns:
            raise ValueError(
                "Input train dataframe already have {} column, consider removing it".format(
                    self.TEMP_TARGET
                )
            )
        if "test_similarity" in train.columns:
            raise ValueError(
                "Input train dataframe already have test_similarity, consider removing it"
            )

        return train, target, test_df

    def generate_data(
        self, train_df, target, test_df, only_generated_data
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Bootstraps ``(1 + pregeneration_frac)`` times the train rows (with
        replacement, fixed seed 42) together with their target values.

        Note that the target is written into ``train_df`` as a temporary column.

        Returns:
            sampled features (without the target column), sampled target column
        """
        if only_generated_data:
            # warnings.warn is the intended call; the builtin ``Warning`` class has no ``warn``
            warnings.warn(
                "For SamplerOriginal setting only_generated_data doesn't change anything, "
                "because generated data sampled from the train!"
            )
        self._validate_data(train_df, target, test_df)
        train_df[self.TEMP_TARGET] = target
        generated_df = train_df.sample(
            frac=(1 + self.pregeneration_frac), replace=True, random_state=42
        )
        generated_df = generated_df.reset_index(drop=True)
        gc.collect()
        logging.info(
            "Generated shape: {} and {}".format(
                generated_df.drop(self.TEMP_TARGET, axis=1).shape,
                generated_df[self.TEMP_TARGET].shape,
            )
        )
        return (
            generated_df.drop(self.TEMP_TARGET, axis=1),
            generated_df[self.TEMP_TARGET],
        )

    def postprocess_data(self, train_df, target, test_df):
        """
        Keeps only the rows of ``train_df`` that look like ``test_df``.

        Non-categorical columns must lie between the bottom and top quantile of
        the same test column; categorical columns must take a value seen in test.
        Skipped when post-processing is disabled or ``test_df`` is None.

        Returns:
            filtered features (without the target column), filtered target column

        Raises:
            ValueError: if fewer than 10 rows survive the filter of a column.
        """
        if not self.is_post_process or test_df is None:
            logging.info("Skipping postprocessing")
            return train_df, target

        self._validate_data(train_df, target, test_df)
        train_df[self.TEMP_TARGET] = target

        for num_col in test_df.columns:
            if self.cat_cols is None or num_col not in self.cat_cols:
                min_val = test_df[num_col].quantile(self.bot_filter_quantile)
                max_val = test_df[num_col].quantile(self.top_filter_quantile)
                filtered_df = train_df.loc[
                    (train_df[num_col] >= min_val) & (train_df[num_col] <= max_val)
                ]
                if filtered_df.shape[0] < 10:
                    raise ValueError(
                        "After post-processing generated data's shape less than 10. For columns {} test "
                        "might be highly skewed. Filter conditions are min_val = {} and max_val = {}.".format(
                            num_col, min_val, max_val
                        )
                    )
                train_df = filtered_df

        if self.cat_cols is not None:
            for cat_col in self.cat_cols:
                filtered_df = train_df[
                    train_df[cat_col].isin(test_df[cat_col].unique())
                ]
                if filtered_df.shape[0] < 10:
                    # report the categorical column that failed, not the last numeric one
                    raise ValueError(
                        "After post-processing generated data's shape less than 10. For columns {} test "
                        "might be highly skewed.".format(cat_col)
                    )
                train_df = filtered_df
        gc.collect()
        logging.info(
            "Generated shapes after postprocessing: {} plus target".format(
                train_df.drop(self.TEMP_TARGET, axis=1).shape
            )
        )
        return (
            train_df.drop(self.TEMP_TARGET, axis=1).reset_index(drop=True),
            train_df[self.TEMP_TARGET].reset_index(drop=True),
        )

    def adversarial_filtering(self, train_df, target, test_df):
        """
        Ranks the train rows by the prediction of an adversarial model trained
        on test vs. train and keeps the highest ranked rows.

        Skipped when ``test_df`` is None. ``train_df`` is modified in place
        (temporary columns are added and the rows are sorted).

        Returns:
            kept features (without helper columns), kept target column
        """
        if test_df is None:
            logging.info("Skipping adversarial filtering, because test_df is None.")
            return train_df, target
        ad_model = AdversarialModel(
            cat_cols=self.cat_cols, model_params=self.adversarial_model_params
        )
        self._validate_data(train_df, target, test_df)
        train_df[self.TEMP_TARGET] = target
        ad_model.adversarial_test(test_df, train_df.drop(self.TEMP_TARGET, axis=1))

        train_df["test_similarity"] = ad_model.trained_model.predict(
            train_df.drop(self.TEMP_TARGET, axis=1)
        )
        train_df.sort_values("test_similarity", ascending=False, inplace=True)
        # NOTE(review): get_generated_shape is not defined in this class (presumably
        # inherited from Sampler) - confirm it returns a per-row multiplier here.
        train_df = train_df.head(self.get_generated_shape(train_df) * train_df.shape[0])
        del ad_model
        gc.collect()
        return (
            train_df.drop(["test_similarity", self.TEMP_TARGET], axis=1).reset_index(
                drop=True
            ),
            train_df[self.TEMP_TARGET].reset_index(drop=True),
        )

    @staticmethod
    def _validate_data(train_df, target, test_df):
        """
        Sanity checks shared by the pipeline steps.

        Raises:
            ValueError: if a test frame is given and train or test has fewer
                than 10 rows, or if a target is given whose length differs
                from the number of train rows.
        """
        if test_df is not None:
            if train_df.shape[0] < 10 or test_df.shape[0] < 10:
                raise ValueError(
                    "Shape of train is {} and test is {}. Both should at least 10! "
                    "Consider disabling adversarial filtering".format(
                        train_df.shape[0], test_df.shape[0]
                    )
                )
        if target is not None:
            if train_df.shape[0] != target.shape[0]:
                raise ValueError(
                    "Something gone wrong: shape of train_df = {} is not equal to target = {} shape".format(
                        train_df.shape[0], target.shape[0]
                    )
                )


class SamplerGAN(SamplerOriginal):
    """Sampler that trains a ``CTGANSynthesizer`` on the train frame and samples new rows from it."""

    def generate_data(
        self, train_df, target, test_df, only_generated_data: bool
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Trains the GAN on ``train_df`` (plus the target column when a target is
        given) and samples new rows.

        Note that a given target is written into ``train_df`` as a temporary column.

        @param train_df: training features
        @param target: target values or None
        @param test_df: test features or None, only used for validation here
        @param only_generated_data: if True return only the sampled rows,
        otherwise the train rows followed by the sampled rows
        @return: features without the target column, and the target column
        (None when the frame has no target column)
        """
        self._validate_data(train_df, target, test_df)
        if target is not None:
            train_df[self.TEMP_TARGET] = target

        # Missing keys fall back to the defaults declared in SamplerOriginal.__init__,
        # so a partial gan_params dict no longer raises KeyError.
        gan_params = self.gan_params or {}
        epochs = gan_params.get("epochs", 500)
        ctgan = CTGANSynthesizer(
            batch_size=gan_params.get("batch_size", 500),
            patience=gan_params.get("patience", 25),
        )
        logging.info("training GAN")
        discrete_columns = [] if self.cat_cols is None else self.cat_cols
        ctgan.fit(train_df, discrete_columns, epochs=epochs)
        logging.info("Finished training GAN")
        generated_df = ctgan.sample(
            self.pregeneration_frac * self.get_generated_shape(train_df)
        )

        # cast the sampled columns back to the train dtypes, matched by position
        for col_name, col_dtype in zip(generated_df.columns, train_df.dtypes.values):
            generated_df[col_name] = generated_df[col_name].astype(col_dtype)
        gc.collect()

        if only_generated_data:
            result_df = generated_df
        else:
            result_df = pd.concat([train_df, generated_df]).reset_index(drop=True)

        # log the shape of the frame that is actually returned
        logging.info(
            "Generated shapes: {} plus target".format(
                _drop_col_if_exist(result_df, self.TEMP_TARGET).shape
            )
        )
        return (
            _drop_col_if_exist(result_df, self.TEMP_TARGET),
            get_columns_if_exists(result_df, self.TEMP_TARGET),
        )


def _sampler(creator: SampleData, in_train, in_target, in_test) -> tuple:
    """
    Runs ``creator.generate_data_pipe`` on the given inputs and logs the result.

    Args:
        creator: object providing ``generate_data_pipe(train, target, test)``
        in_train: train input forwarded to the pipe
        in_target: target input forwarded to the pipe
        in_test: test input forwarded to the pipe

    Returns:
        The two values returned by ``generate_data_pipe``.
    """
    _logger = logging.getLogger(__name__)
    _logger.info("Starting generating data")
    train, test = creator.generate_data_pipe(in_train, in_target, in_test)
    # The first argument of info() is a format string; passing the frames as
    # lazy %s arguments avoids the formatting error of info(train, test).
    _logger.info("%s\n%s", train, test)
    _logger.info("Finished generation\n")
    return train, test


def _drop_col_if_exist(df, col_to_drop) -> pd.DataFrame:
    """
    Drops col_to_drop from input dataframe df if sucj column exists
    """
    if col_to_drop in df.columns:
        return df.drop(col_to_drop, axis=1)
    else:
        return df


def get_columns_if_exists(df, col) -> pd.DataFrame:
    """
    Look up column ``col`` in ``df``.

    Returns ``df[col]`` when the column is present and ``None`` otherwise.
    """
    return df[col] if col in df.columns else None

"""
if __name__ == "__main__":
    setup_logging(logging.DEBUG)
    train = pd.DataFrame(
        np.random.randint(-10, 150, size=(100, 4)), columns=list("ABCD")
    )
    logging.info(train)
    target = pd.DataFrame(np.random.randint(0, 2, size=(100, 1)), columns=list("Y"))
    test = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD"))
    _sampler(OriginalGenerator(gen_x_times=15), train, target, test)
    _sampler(
        GANGenerator(gen_x_times=10, only_generated_data=False,
                     gan_params={"batch_size": 500, "patience": 25, "epochs" : 500,}), train, target, test
    )

    _sampler(OriginalGenerator(gen_x_times=15), train, None, train)
    _sampler(
        GANGenerator(cat_cols=["A"], gen_x_times=20, only_generated_data=True),
        train,
        None,
        train,
    )
"""

'\nif __name__ == "__main__":\n    setup_logging(logging.DEBUG)\n    train = pd.DataFrame(\n        np.random.randint(-10, 150, size=(100, 4)), columns=list("ABCD")\n    )\n    logging.info(train)\n    target = pd.DataFrame(np.random.randint(0, 2, size=(100, 1)), columns=list("Y"))\n    test = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD"))\n    _sampler(OriginalGenerator(gen_x_times=15), train, target, test)\n    _sampler(\n        GANGenerator(gen_x_times=10, only_generated_data=False,\n                     gan_params={"batch_size": 500, "patience": 25, "epochs" : 500,}), train, target, test\n    )\n\n    _sampler(OriginalGenerator(gen_x_times=15), train, None, train)\n    _sampler(\n        GANGenerator(cat_cols=["A"], gen_x_times=20, only_generated_data=True),\n        train,\n        None,\n        train,\n    )\n'

## Model testing

In [None]:
# Build a GAN generator with custom training params and run its data pipe on
# train_data only (None is passed for both the target and the test frame).
gen = GANGenerator(gan_params = {"batch_size": 100,
                                 "patience": 25, "epochs" : 100}).generate_data_pipe(train_data, None,None)

# Score the first element of `gen` against the real data over 100 resampling rounds.
# NOTE(review): calculate_metrics is defined in a later cell of this notebook,
# so that cell has to be executed before this one.
calculate_metrics(train_data, gen[0], n_time=100)


GAN: Early stopping after epochs 47


Unnamed: 0,ander,kendall
real,1.006206,0.010928
random,13.797143,0.125817
model,20.000198,0.089588


In [None]:
def calculate_metrics(real_data, generate_data, size = 411, n_time = 20, mean = 0.01, mu = 0.01):
  """
  Compare generated data with real data and with a random baseline.

  For each of ``n_time`` rounds, ``size`` rows are sampled from the real data
  (twice), from the generated data, and a random baseline is drawn. Each of
  the three is scored against the first real sample with
  ``anderson_evaluation`` and ``kendall_evaluation``; the scores are averaged
  over the rounds.

  Args:
      real_data: dataframe with the real observations
      generate_data: dataframe with the generated observations
      size: number of rows sampled per round (each frame needs at least that many rows)
      n_time: number of resampling rounds
      mean: mean of the normal distribution behind the random baseline
      mu: standard deviation of that normal distribution (passed as ``scale``)

  Returns:
      DataFrame indexed by 'real', 'random' and 'model' with the columns
      'ander' and 'kendall'.
  """
  random_list, real_list, generated_list = [], [], []

  def _score(reference, candidate):
    # one record with both metrics, anderson is evaluated first
    return {'ander': anderson_evaluation(reference, candidate),
            'kendall': kendall_evaluation(reference, candidate)}

  def _mean_scores(records):
    # average the per-round records column by column
    frame = pd.DataFrame(records)
    return {'ander': frame['ander'].mean(), 'kendall': frame['kendall'].mean()}

  for _ in range(n_time):
    real_true = real_data.sample(size).values
    real_test = real_data.sample(size).values
    # The baseline gets as many columns as the real sample; the width used to
    # be hard-coded to 4 and therefore only matched 4-column data.
    random = np.abs(np.random.normal(mean, mu, (size, real_true.shape[1])))
    generated = generate_data.sample(size).values

    #true data
    real_list.append(_score(real_true, real_test))

    #random data
    random_list.append(_score(real_true, random))

    #generate
    generated_list.append(_score(real_true, generated))

  liste = [_mean_scores(real_list), _mean_scores(random_list), _mean_scores(generated_list)]

  return pd.DataFrame(liste, index=['real', 'random', 'model'])


In [None]:
# Same comparison as before, this time with the default number of rounds (n_time = 20).
calculate_metrics(train_data, gen[0])

Unnamed: 0,ander,kendall
real,0.911547,0.012345
random,13.911728,0.125293
model,69.449626,0.084488


In [None]:
# Preview of the kind of values used as the "random" baseline in calculate_metrics:
# absolute values of normal draws with mean 0.01 and standard deviation 0.01.
np.abs(np.random.normal(0.01, 0.01, (4,2)))

array([[0.01772611, 0.00212386],
       [0.00218133, 0.00489999],
       [0.00883648, 0.00984078],
       [0.01642336, 0.025435  ]])