In [None]:
from fastai2.basics import *
from fastai2.callback.all import *
from fastai2.data.all import *
from fastai2.data.core import *
from fastai2.distributed import *
from fastai2.data.transforms import *


In [None]:
from reformer_pytorch import Reformer, ReformerLM
from itertools import product

In [None]:
#import numpy as np
import random
#from tqdm import tqdm
import gc
from sklearn.model_selection import GroupKFold
import warnings
# Silence every Python warning for the rest of the session; both calls install a
# blanket 'ignore' filter. NOTE(review): this also hides pandas chained-assignment
# warnings, so silent no-op assignments will not be reported.
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
# Raise the pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)



In [None]:
# Configuration and main hyperparameters.
SEGMENT_SIZE = 100000 # 500000 -- samples per segment (the data tensors are reshaped to (..., SEGMENT_SIZE))
WINDOW_SIZE = 200     # samples per dataset item / model input window
BS = 3*38             # batch size handed to the DataLoaders
SPLITS = 5            # number of equally sized window groups per segment used for the train/valid split

# Segments must split into whole windows, and the windows into SPLITS equal groups.
assert SEGMENT_SIZE % WINDOW_SIZE == 0
assert (SEGMENT_SIZE // WINDOW_SIZE) % SPLITS == 0
SEED = 321            # NOTE(review): no use of SEED is visible in this section of the notebook
DATA_SUFFIX = '_clean' # suffix selecting the alternative train/test CSVs and appended to the saved model name

# Directory holding every input CSV, relative to the notebook's working directory.
p_input = Path('input')

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU, plus all CUDA devices when available), and exports `PYTHONHASHSEED`.

    Args:
        seed: integer seed applied to all generators.

    NOTE(review): no call to this function is visible in this section of the
    notebook; call it (e.g. with `SEED`) before building datasets and models
    if reproducible runs are wanted.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The notebook trains with PyTorch; TensorFlow is never imported, so the
    # previous `tf.random.set_seed` call raised NameError.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [None]:
# Load the raw competition CSVs together with their DATA_SUFFIX counterparts.
train_dtypes = dict(time=np.float32, signal=np.float32, open_channels=np.int32)
test_dtypes  = dict(time=np.float32, signal=np.float32)

df_train       = pd.read_csv(p_input / 'train.csv', dtype=train_dtypes)
df_test        = pd.read_csv(p_input / 'test.csv',  dtype=test_dtypes)
df_train_drift = pd.read_csv(p_input / f'train{DATA_SUFFIX}.csv', dtype=train_dtypes)
df_test_drift  = pd.read_csv(p_input / f'test{DATA_SUFFIX}.csv',  dtype=test_dtypes)
sub            = pd.read_csv(p_input / 'sample_submission.csv', dtype=dict(time=np.float32))

# 'drift' is the per-sample difference between the raw signal and the DATA_SUFFIX signal.
df_train['drift'] = df_train['signal'].sub(df_train_drift['signal'])
df_test['drift']  = df_test['signal'].sub(df_test_drift['signal'])

In [None]:
# From here on 'signal' holds the DATA_SUFFIX version of the signal; the raw-minus-clean
# difference was stored in 'drift' by the previous cell, so that cell must run first.
df_train['signal'] =  df_train_drift['signal']
df_test['signal']  =   df_test_drift['signal']

In [None]:
# Display the number of training rows next to the segment length.
len(df_train),SEGMENT_SIZE

In [None]:
# Within rows 2,300,000-2,399,999, relabel every `open_channels == 0` sample as 1.
# NOTE(review): the reason for this relabelling is not stated in the notebook - confirm.
# Done as one positional `.iloc` assignment: the previous chained form
# (`df[col][a:b][mask] = 1`) assigns into an intermediate object and is not
# guaranteed by pandas to write through to `df_train`; with warnings silenced
# above, a no-op would go unnoticed.
relabel_rows = 2300000 + np.flatnonzero(df_train['open_channels'].values[2300000:2400000] == 0)
df_train.iloc[relabel_rows, df_train.columns.get_loc('open_channels')] = 1

In [None]:
# Extra training CSVs, grouped under integer keys.
# NOTE(review): the meaning of the keys (1, 3, 5) is not shown here - confirm.
d_xtra_csvs = {
    1: ['outfinaltest10.csv',  'outfinaltest44.csv',],#  'outfinaltest78.csv',],  'outfinaltest10.csv',  'outfinaltest44.csv'],
    3: ['outfinaltest1.csv',   'outfinaltest2.csv',   'outfinaltest3.csv',   'outfinaltest4.csv', 'outfinaltest5.csv'],
    5: ['outfinaltest328.csv', 'outfinaltest534.csv', 'outfinaltest747.csv',]#, 'outfinaltest328.csv', 'outfinaltest534.csv']
}

# Read every extra file once and concatenate in a single call instead of
# re-concatenating the growing frame on each iteration.
xtra_frames = []
for xtra_key, xtra_csvs in d_xtra_csvs.items():
    print(xtra_key, xtra_csvs)
    for xtra_csv in xtra_csvs:
        # The extra files have no header row; columns are named explicitly.
        xtra_frames.append(
            pd.read_csv(p_input / xtra_csv, header=None, names=['time', 'signal', 'open_channels']))
# The previous loop prepended each new file, so the last file read came first;
# reversing keeps that row order (and therefore the segment numbering).
df_train_xtra = pd.concat(xtra_frames[::-1], axis=0)
df_train_xtra['drift']  = 0.
# NOTE: appends to `df_train` in place of the original name, so re-running this
# cell without re-running the loading cells adds the extra rows again.
df_train = pd.concat((df_train,df_train_xtra), axis=0)

In [None]:
# For every SEGMENT_SIZE-row segment of the training frame, build an 11-element
# float vector holding 1. at each open_channels value that occurs in the segment.
train_channels_in_segment = []
for s in range(0, len(df_train), SEGMENT_SIZE):
    present = LongTensor(df_train['open_channels'][s:s+SEGMENT_SIZE].unique())
    channels_hot = torch.zeros(11)
    channels_hot[present] = 1.   # unique values, so each entry is set at most once
    train_channels_in_segment.append(channels_hot)
train_channels_in_segment

In [None]:
# Hand-written 11 x 11 table of 0/1 flags.
# NOTE(review): presumably one row per test sub-batch and one column per
# open_channels value (0-10); the last row is all zeros and the table is not
# referenced anywhere else in the visible cells - confirm intent before relying on it.
test_channels_in_segment = FloatTensor([
    [1,1,0,0,0,0,0,0,0,0,0],
    [0,1,1,1,0,0,0,0,0,0,0],
    [0,0,1,1,1,1,0,0,0,0,0],
    [1,1,0,0,0,0,0,0,0,0,0],
    [1,1,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,1,1,1,1,1,1], # 
    [0,1,1,1,1,1,0,0,0,0,0],
    [0,0,0,0,0,1,1,1,1,1,1],
    [1,1,0,0,0,0,0,0,0,0,0],
    [0,1,1,1,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0], # 
])

In [None]:
# Stack the frame columns into (rows, samples) float tensors:
#   train rows: 0 = signal, 1 = drift, 2 = open_channels;  test rows: 0 = signal, 1 = drift.
train = torch.stack([torch.FloatTensor(df_train[col].values)
                     for col in ('signal', 'drift', 'open_channels')])
test  = torch.stack([torch.FloatTensor(df_test[col].values)
                     for col in ('signal', 'drift')])

# Standardise the signal row with statistics computed over train and test together.
# `signal` keeps the un-normalised concatenation (torch.cat copies).
signal = torch.cat((train[0], test[0]))
signal_mean, signal_std = signal.mean(), signal.std()
train[0] = (train[0] - signal_mean) / signal_std
test[0]  = (test[0]  - signal_mean) / signal_std

# Split the sample axis into (segments, SEGMENT_SIZE).
train = train.view(train.shape[0], -1, SEGMENT_SIZE)
test  = test.view(test.shape[0], -1, SEGMENT_SIZE)

In [None]:
# Plot the un-normalised signal (train followed by test).
plt.plot(signal)

In [None]:
# Matplotlib defaults for the remaining plots: wide figures, high DPI, and path
# chunking for very long lines. These settings only affect figures created
# after this cell has run.
from pylab import rcParams
rcParams['figure.figsize'] = 20, 5
rcParams['figure.dpi'] = 300
rcParams['agg.path.chunksize'] = 10000

In [None]:
# Plot the normalised training signal (row 0 of `train`), all segments flattened.
plt.plot(train[0].flatten())

In [None]:
# Same plot as the previous cell (duplicate cell).
plt.plot(train[0].flatten())

In [None]:
# Plot the training open_channels labels (row 2 of `train`), all segments flattened.
plt.plot(train[2].flatten())

In [None]:
# `train[0,:]` selects the same data as `train[0]`: the normalised training signal again.
plt.plot(train[0,:].flatten())

In [None]:
# Plot the training drift (row 1 of `train`), all segments flattened.
plt.plot(train[1,:].flatten())

In [None]:
# Train/validation split by window position inside each segment: window group
# `split` (of SPLITS equal groups) is held out for validation in every segment.
split = 0
split_size = SEGMENT_SIZE//WINDOW_SIZE//SPLITS
valid_idx = split*split_size + np.arange(split_size)
train_idx = np.setdiff1d(np.arange(SEGMENT_SIZE//WINDOW_SIZE), valid_idx)
# Expand the per-segment window offsets into (segment, window) pairs for every segment.
train_idx, valid_idx = list(product(range(train.shape[1]),train_idx)), list(product(range(train.shape[1]),valid_idx))
# Show only the first and last ten pairs; the previous `train_idx[10:]` printed
# nearly the whole list.
train_idx[:10],train_idx[-10:]

In [None]:
#train_idx = np.arange(SEGMENT_SIZE//WINDOW_SIZE)
#valid_idx = train_idx
#train_idx, valid_idx = list(product(range(train.shape[1]),train_idx)), list(product(range(train.shape[1]),valid_idx))


In [None]:
# Spot check: take one (segment, window) pair from the training index and plot
# that window's open_channels labels (row 2 of `train`).
s,o=train_idx[400*9]
plt.plot(train[2,s,o*WINDOW_SIZE:(o+1)*WINDOW_SIZE])
s,o

In [None]:
def get_xy(t):
    """Placeholder: prints a fixed message, ignores `t`, and returns None."""
    message = "Hey t"
    print(message)
    return None
class IonDataset(torch.utils.data.Dataset):
    """Window-level dataset over a `(rows, segments, samples)` tensor.

    Each item is one `WINDOW_SIZE`-sample window of one segment, addressed by a
    `(segment, window)` pair from `idx`.

    Args:
        data: tensor whose row 0 is the signal; when it has exactly three rows,
            rows 1 and 2 are used as the drift and open-channels targets.
        idx: sequence of `(segment, window)` pairs; defaults to every window of
            every segment.
        jitter: when true, shift the window start by a random offset of up to
            half a window towards neighbouring windows that are also in `idx`.
        p_flip: probability of reversing the window (and its targets) in time.
        channels_in_segment: optional per-segment channel masks, indexed by the
            segment number. Defaults to the module-level
            `train_channels_in_segment`, looked up at item-access time.
            NOTE(review): the default is also used for the test dataset, where
            segment numbers do not refer to training segments - pass
            test-specific masks here if the mask should match the test data.

    Returns (per item):
        `((signal, channel_mask), (drift, open_channels))` when targets exist,
        otherwise `((signal, channel_mask),)`. Windows keep a leading dimension
        of size 1.
    """
    def __init__(self, data,idx=None,jitter=False,p_flip=0.,channels_in_segment=None):
        super().__init__()
        if idx is None:
            idx = list(product(range(data.shape[1]),np.arange(SEGMENT_SIZE//WINDOW_SIZE)))
        self.data,self.idx,self.jitter,self.p_flip = data, idx, jitter, p_flip
        # Hash set of the pairs so the neighbour checks in __getitem__ are O(1)
        # instead of a linear scan of `idx` for every sample.
        self._idx_set = {tuple(pair) for pair in idx}
        self.channels_in_segment = channels_in_segment
        self.n_inp = 1  # presumably read by fastai as the number of inputs per item - confirm
        self.has_y = data.shape[0] == 3

    def __len__(self): return len(self.idx)

    def __getitem__(self, idx):
        s,o=self.idx[idx]
        jitter = 0
        if self.jitter:
            # Only shift towards neighbours that belong to this dataset, so a
            # jittered window never reaches into windows held out of `idx`.
            lo = -WINDOW_SIZE//2 if (s,(o-1)) in self._idx_set else 0
            hi =  WINDOW_SIZE//2 if (s,(o+1)) in self._idx_set else 0
            # torch.randint requires low < high; a window without neighbours stays put.
            if lo < hi: jitter = torch.randint(lo,hi,(1,)).item()
        ss = slice(jitter+o*WINDOW_SIZE,jitter+(o+1)*WINDOW_SIZE)
        x =  self.data[0,s:s+1,ss]
        flip = (torch.rand(1) < self.p_flip).item()
        if flip: x=torch.flip(x,dims=(1,))
        masks = train_channels_in_segment if self.channels_in_segment is None else self.channels_in_segment
        x = (x,masks[s])
        if self.has_y:
            y_drift,y_open_channels = (self.data[1,s:s+1,ss], self.data[2,s:s+1,ss].long())
            # Targets are reversed together with the signal so they stay aligned.
            if flip: y_drift,y_open_channels=(torch.flip(y_drift,dims=(1,)),torch.flip(y_open_channels,dims=(1,)))
            return (x,(y_drift,y_open_channels))
        return (x,)
    
# Datasets: augmentation (jitter + random time reversal) is applied to the training set only.
train_ds = IonDataset(train, train_idx, jitter=True, p_flip=0.5)
valid_ds = IonDataset(train, valid_idx)
test_ds  = IonDataset(test)

# Loaders: only the training loader shuffles.
train_dl = DataLoader(train_ds, BS, shuffle=True,  num_workers=8, pin_memory=True)
valid_dl = DataLoader(valid_ds, BS, shuffle=False, num_workers=4, pin_memory=True)
test_dl  = DataLoader(test_ds,  BS, shuffle=False, num_workers=8, pin_memory=True)

# Show one training item as a sanity check.
train_ds[2]

In [None]:
# Take the signal window of the first training item and add a leading batch dimension.
x =train_ds[0][0][0].unsqueeze(0)
x.shape

In [None]:
# Scratch run of the lead/lag windowing that Classiformer.forward performs, on the
# single sample `x` from the previous cell, with a window width of 256.
# NOTE: this cell overwrites `x`, so it is not re-runnable without re-running the
# previous cell first.
w = 256
print(x.shape)
# Number of tiled copies needed so that every position has `w` samples to read.
r = math.ceil(w/WINDOW_SIZE) + 1
x_lead  = x.repeat(1,1,r)[...,:WINDOW_SIZE+w-1]
x_lag   = x.repeat(1,1,r).flip((2))[...,:WINDOW_SIZE+w-1]
# unfold(2, w, 1): one length-w sliding window per position along dim 2.
lead,lag=x_lead.unfold(2,w,1), x_lag.unfold(2,w,1).flip((2))
# Concatenate both windows along the last dimension and drop dim 1.
x=torch.cat((lead,lag),dim=3).squeeze(1)#.permute(0,2,1)
x.shape

In [None]:
# Display the flattened test tensor next to its reversed copy.
test.flatten(),test.flatten().flip(dims=(0,))

In [None]:
# Plot the signal window of training item 5; jitter and flipping are random,
# so each access can return a different view.
plt.plot(train_ds[5][0][0].squeeze())

In [None]:
# Same item plotted again (duplicate cell); the random augmentation may change the result.
plt.plot(train_ds[5][0][0].squeeze())

In [None]:
# Bundle the three loaders for the Learner, targeting fastai's default device.
dls = DataLoaders(train_dl, valid_dl, test_dl, device=default_device())


In [None]:
class FixedPositionalEmbedding(nn.Module):
    """Non-trainable sinusoidal position encoding.

    `forward(x)` ignores the values of `x` and returns a `(1, x.shape[1], dim)`
    tensor (for even `dim`) whose first half holds sines and second half
    cosines of position times inverse frequency. The second constructor
    argument is accepted and ignored.
    """
    def __init__(self, dim, _):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        # Registered as a buffer so it follows the module across devices/dtypes.
        self.register_buffer('inv_freq', 1. / (10000 ** exponents))

    def forward(self, x):
        positions = torch.arange(x.shape[1], device=x.device).type_as(self.inv_freq)
        # Outer product of positions and inverse frequencies.
        angles = positions[:, None] * self.inv_freq[None, :]
        table = torch.cat((angles.sin(), angles.cos()), dim=-1)
        return table.unsqueeze(0)
    
class AbsolutePositionalEmbedding(nn.Module):
    """Learned position embedding table with `max_seq_len` rows of size `dim`.

    Weights start uniformly in [-0.01, 0.01]. `forward(x)` returns the first
    `x.shape[1]` rows, shape `(x.shape[1], dim)`; the values of `x` are unused.
    """
    def __init__(self, dim, max_seq_len):
        super().__init__()
        table = nn.Embedding(max_seq_len, dim)
        table.weight.data.uniform_(-0.01, 0.01)
        self.emb = table

    def forward(self, x):
        seq_len = x.shape[1]
        positions = torch.arange(seq_len, device=x.device)
        return self.emb(positions)
    
class DummyDecoder(Module):
    """Pass-through stand-in for a transformer decoder.

    Ignores `tgt` and every mask argument and returns `memory` after dropout.

    Args:
        dropout: dropout probability applied to `memory`.
    """
    def __init__(self,dropout:float=0):
        self.dropout = nn.Dropout(dropout)

    def forward(self,tgt, memory, tgt_mask=None, memory_mask=None, 
                tgt_key_padding_mask=None, memory_key_padding_mask=None, **kwargs):
        # **kwargs absorbs additional keyword arguments (e.g. the `*_is_causal`
        # flags that newer `nn.Transformer` versions pass to a custom decoder).
        return self.dropout(memory)

    
class DummyEncoder(Module):
    """Pass-through stand-in for a transformer encoder.

    Ignores the mask arguments and returns `src` after dropout.

    Args:
        dropout: dropout probability applied to `src`.
    """
    def __init__(self,dropout:float=0):
        self.dropout = nn.Dropout(dropout)

    def forward(self,src, mask=None, src_key_padding_mask=None, **kwargs):
        # **kwargs absorbs additional keyword arguments (e.g. the `is_causal`
        # flag that newer `nn.Transformer` versions pass to a custom encoder).
        return self.dropout(src)

    
class Classiformer(Module):
    """Per-sample drift / open-channel predictor built on `nn.Transformer`.

    The encoder is replaced by a pass-through (`DummyEncoder`), so only the
    `depth` decoder layers do any work. Each time step is represented by a
    window of following samples and a window of preceding samples (both
    wrapping around the ends), concatenated to `2 * (dim // 2)` features.
    NOTE(review): this equals the transformer width only for even `dim`.

    Args:
        dim: transformer model width (`d_model`).
        depth: number of decoder layers.
        heads: number of attention heads.
        lsh_dropout: dropout probability passed to the transformer.
        bucket_size: unused; kept so existing constructor calls keep working.
    """
    def __init__(self, dim, depth, heads, lsh_dropout, bucket_size):
        self.dim = dim

        self.transformer = nn.Transformer(
            num_encoder_layers=0,num_decoder_layers=depth,
            nhead=heads,d_model=dim,dim_feedforward=dim*8,dropout=lsh_dropout, 
            custom_encoder=DummyEncoder(dropout=0.))

        # `pos_emb` and `input_to_dim` are not used in `forward`; they are kept
        # so the module's parameters and state dict are unchanged.
        self.pos_emb = FixedPositionalEmbedding(dim, WINDOW_SIZE)
        self.input_to_dim  = nn.Linear(1, dim)
        self.drift         = nn.Linear(dim, 1)
        self.open_channels = nn.Linear(dim, 11)

    def _lead_lag_windows(self, x):
        "Turn a (B, 1, S) signal into per-position lead+lag windows with `2 * (dim // 2)` features."
        w = self.dim//2
        # Tile the signal often enough that every position has `w` samples to read.
        r = math.ceil(w/WINDOW_SIZE) + 1
        tiled = x.repeat(1,1,r)
        span = WINDOW_SIZE+w-1
        lead = tiled[...,:span].unfold(2,w,1)
        # Same windows taken on the time-reversed signal, then put back in forward position order.
        lag  = tiled.flip(2)[...,:span].unfold(2,w,1).flip(2)
        return torch.cat((lead,lag),dim=3).squeeze(1)

    def forward(self, x):
        """Return `(drift, open_channels)` predictions for a `(signal, mask_channels)` input.

        `signal` is expected as B x 1 x S; `mask_channels` is multiplied into the
        open-channel outputs after `unsqueeze(1)` (presumably a B x 11 mask - confirm).
        """
        x,mask_channels = x
        x = self._lead_lag_windows(x)
        x = x.permute(1,0,2)
        x = self.transformer(x,x) # S,N,E => T,N,E
                                  # S is the source sequence length, 
                                  # T is the target sequence length, 
                                  # N is the batch size, 
                                  # E is the feature number
        x = x.permute(1,0,2)
        drift         = self.drift(x)
        # Multiplying by the mask zeroes the outputs of masked classes (it does not push them to -inf).
        open_channels = self.open_channels(x) * mask_channels.unsqueeze(1)
        return drift, open_channels
    

class AtomTorchTransformer(Module):
    """Unfinished stub: the constructor only records `d_model` and resolves a
    local `d_head`; no layers and no `forward` are defined."""
    def __init__(self, n_layers, n_heads, d_model, d_inner, embed_p:float=0,
                 encoder_dropout:float=0, decoder_dropout:float=0,
                 d_head=None, deep_decoder=False, dense_out=False, **kwargs):
        self.d_model = d_model
        # Default head size: model width split evenly across the heads.
        default_d_head = d_model//n_heads
        d_head = ifnone(d_head, default_d_head)
        
        

In [None]:
# Model hyperparameters; they are also embedded in the saved model name later on.
dim = 512
bucket_size = 50
depth = 16
heads = 16
lsh_dropout = 0.

# ReformerLM variant of the model.
# NOTE: `model` is reassigned to a Classiformer in the next cell, so this
# instance is not the one that gets trained when the cells run in order.
model = ReformerLM(
    num_tokens = 11,
    dim = dim,
    depth = depth,
    max_seq_len = WINDOW_SIZE,
    heads = heads,
    lsh_dropout = lsh_dropout,
    bucket_size=bucket_size,
    causal = False,
    use_full_attn = False,
    fixed_position_emb = False,
    n_hashes = 4,
)
# Replace the token embedding with a linear layer taking one input feature per position.
model.token_emb = nn.Linear(1,dim)

In [None]:
# Replace the ReformerLM built above with the nn.Transformer-based Classiformer and show it.
model = Classiformer(dim, depth, heads, lsh_dropout, bucket_size)
model

In [None]:
def tversky_loss(logits, true, alpha, beta, eps=1e-7):
    """Computes the Tversky loss [1] for per-position class predictions.

    Args:
        logits: a tensor of shape [B, S, C] with the raw model outputs
            (C classes for each of S positions).
        true: an integer tensor of shape [B, 1, S] with the target class index
            of every position.
        alpha: controls the penalty for false positives.
        beta: controls the penalty for false negatives.
        eps: added to the denominator for numerical stability.
    Returns:
        A scalar tensor: one minus the mean per-class Tversky index. A class
        that is neither present in `true` nor predicted contributes an index
        of 0 to that mean.
    Notes:
        alpha = beta = 0.5 => dice coeff
        alpha = beta = 1 => tanimoto coeff
        alpha + beta = 1 => F beta coeff
    References:
        [1]: https://arxiv.org/abs/1706.05721
    """
    # Rearrange to the [B, C, S, 1] / [B, 1, S, 1] layout the formula below works on.
    logits = logits.permute(0,2,1).unsqueeze(-1)
    true = true.unsqueeze(-1)
    num_classes = logits.shape[1]
    if num_classes == 1:
        # Binary case: one logit per position, expanded to (positive, negative) probabilities.
        # The identity matrix lives on the target's device so indexing with it also works on GPU.
        true_1_hot = torch.eye(num_classes + 1, device=true.device)[true.squeeze(1)]
        true_1_hot = true_1_hot.permute(0, 3, 1, 2).float()
        true_1_hot_f = true_1_hot[:, 0:1, :, :]
        true_1_hot_s = true_1_hot[:, 1:2, :, :]
        true_1_hot = torch.cat([true_1_hot_s, true_1_hot_f], dim=1)
        pos_prob = torch.sigmoid(logits)
        neg_prob = 1 - pos_prob
        probas = torch.cat([pos_prob, neg_prob], dim=1)
    else:
        true_1_hot = torch.eye(num_classes, device=true.device)[true.squeeze(1)]
        true_1_hot = true_1_hot.permute(0, 3, 1, 2).float()
        probas = F.softmax(logits, dim=1)
    true_1_hot = true_1_hot.type(logits.type())
    # Sum over everything except the class dimension.
    dims = (0,) + tuple(range(2, true.ndimension()))
    intersection = torch.sum(probas * true_1_hot, dims)
    fps = torch.sum(probas * (1 - true_1_hot), dims)
    fns = torch.sum((1 - probas) * true_1_hot, dims)
    num = intersection
    denom = intersection + (alpha * fps) + (beta * fns)
    tversky_index = (num / (denom + eps)).mean()
    return (1 - tversky_index)

class LabelSmoothingCE(Module):
    """Label-smoothed cross entropy for per-position predictions.

    `output` is B x S x C raw scores and `target` is B x 1 x S class indices.
    The result mixes the uniform-target term (weight `eps`, divided by the
    number of classes) with the ordinary negative log-likelihood (weight `1 - eps`).
    """
    def __init__(self, eps:float=0.1, reduction='mean'):
        self.eps = eps
        self.reduction = reduction

    def forward(self, output, target):
        n_classes = output.size()[-1]
        log_preds = F.log_softmax(output.permute(0,2,1), dim=1)   # B C S
        target = target.squeeze(1)                                # B S
        if self.reduction == 'sum':
            smooth = -log_preds.sum()
        else:
            # Sum over classes, leaving one value per position.
            smooth = -log_preds.sum(dim=1)
            if self.reduction == 'mean': smooth = smooth.mean()
        nll = F.nll_loss(log_preds, target, reduction=self.reduction)
        return smooth*self.eps/n_classes + (1-self.eps) * nll
    
class AwareLabelSmoothingCE(Module):
    """Label-smoothed cross entropy that smooths only over classes present in each sample.

    For every sample, the smoothing mass is spread uniformly over the classes
    that occur anywhere in that sample's target window instead of over all classes.

    Args:
        eps: weight of the smoothing term; the NLL term gets `1 - eps`.
        reduction: 'mean' or 'none'. 'sum' is not supported.

    `forward` takes `output` as B x S x C raw scores and `target` as B x 1 x S
    class indices.

    Raises:
        NotImplementedError: when `reduction == 'sum'` (previously an
            `assert False` reached only after the loss had been computed, and
            removed entirely when running Python with `-O`).
    """
    def __init__(self, eps:float=0.1, reduction='mean'): self.eps,self.reduction = eps,reduction

    def forward(self, output, target):
        if self.reduction=='sum':
            raise NotImplementedError("AwareLabelSmoothingCE does not support reduction='sum'")
        c = output.size()[-1]
        output = output.permute(0,2,1) # => B C S
        # One-hot over the class dimension, sized from the model output rather than a fixed 11.
        t_one_hot = torch.zeros(target.shape[0],c,target.shape[2],device=target.device).scatter_(1,target, 1.)
        t_one_hot = t_one_hot.sum(dim=(2,)) > 0. # B C, true for classes present in each sample
        t_c_per_batch =  t_one_hot.sum(dim=(1,)).float() # B, number of classes present in each sample
        target = target.squeeze(1)     # => B S
        log_preds = F.log_softmax(output, dim=1) # B C S
        # Average -log p over the present classes only: (B C S) * (B C 1) / (B 1 1), summed over C.
        loss = (-log_preds*t_one_hot.unsqueeze(-1)/t_c_per_batch.view(-1,1,1)).sum(dim=1)
        if self.reduction=='mean':  loss = loss.mean()
        return loss*self.eps + (1-self.eps) * F.nll_loss(log_preds, target, reduction=self.reduction)

class DriftChannelsLoss(Module):
    """Weighted sum of the drift regression loss and two open-channel classification losses.

    Args:
        drift_weight: weight of the MSE loss on the drift output.
        softf1_weight: weight of the Tversky (alpha = beta = 0.5) loss on the
            open-channel output.
        ce_weight: weight of the label-smoothed cross entropy on the
            open-channel output.

    The defaults (0, 0, 1) reproduce the previously hard-coded weighting.
    All three terms are always evaluated, including those weighted by zero.
    """
    def __init__(self, drift_weight=0.0, softf1_weight=0., ce_weight=1.):
        self.drift_weight,self.softf1_weight,self.ce_weight = drift_weight,softf1_weight,ce_weight
        self.drift_loss = MSELossFlat()
        self.open_channels_loss_softf1 = partial(tversky_loss, alpha=0.5,beta=0.5) 
        self.open_channels_loss_ce     = LabelSmoothingCE() #CrossEntropyLossFlat() #LabelSmoothingCE()# CrossEntropyLossFlat()
    def __call__(self, input, target, **kwargs):
        "`input` and `target` are `(drift, open_channels)` pairs, as produced by the model and the dataset."
        i_drift,i_open_channels = input
        t_drift,t_open_channels = target
        return self.drift_weight*self.drift_loss(i_drift, t_drift) + \
            self.softf1_weight*self.open_channels_loss_softf1(i_open_channels, t_open_channels) + \
            self.ce_weight*self.open_channels_loss_ce(i_open_channels, t_open_channels) 

In [None]:
import sklearn.metrics as skm

# Cell
class OpenChannelsAccumMetric(Metric):
    """Accumulates open-channel predictions and targets, then scores them with `func`.

    Reads element 1 of both `learn.y` and `learn.pred` (the open-channel part of
    the `(drift, open_channels)` pairs).

    Args:
        func: scoring function called on the concatenated predictions and targets.
        dim_argmax: dimension to argmax the predictions over; `None` disables it.
        sigmoid: apply a sigmoid to the predictions.
        thresh: threshold turning predictions into booleans; `None` disables it.
        to_np: convert both tensors to NumPy arrays before calling `func`.
        invert_arg: call `func(targs, preds)` instead of `func(preds, targs)`.
        flatten: flatten predictions and targets before storing them.
        **kwargs: forwarded to `func`.
    """
    def __init__(self, func, dim_argmax=None, sigmoid=False, thresh=None, to_np=False, invert_arg=False,
                 flatten=True, **kwargs):
        store_attr(self,'func,dim_argmax,sigmoid,thresh,flatten')
        self.to_np,self.invert_args,self.kwargs = to_np,invert_arg,kwargs

    def reset(self): self.targs,self.preds = [],[]

    def accumulate(self, learn):
        t,p = learn.y[1],learn.pred[1]
        # Compare against None: dimension 0 and a threshold of 0 are valid
        # settings and must not be treated as "disabled".
        pred = p.argmax(dim=self.dim_argmax) if self.dim_argmax is not None else p
        if self.sigmoid: pred = torch.sigmoid(pred)
        if self.thresh is not None:  pred = (pred >= self.thresh)
        targ = t
        pred,targ = to_detach(pred),to_detach(targ)
        if self.flatten: pred,targ = flatten_check(pred,targ)
        self.preds.append(pred)
        self.targs.append(targ)

    @property
    def value(self):
        # Nothing accumulated yet: return None.
        if len(self.preds) == 0: return
        preds,targs = torch.cat(self.preds),torch.cat(self.targs)
        if self.to_np: preds,targs = preds.numpy(),targs.numpy()
        return self.func(targs, preds, **self.kwargs) if self.invert_args else self.func(preds, targs, **self.kwargs)

    @property
    def name(self):  return self.func.func.__name__ if hasattr(self.func, 'func') else  self.func.__name__

# Cell
def skm_to__open_channels_fastai(func, is_class=True, thresh=None, axis=-1, sigmoid=None, **kwargs):
    "Wrap an sklearn.metrics `func` as an `OpenChannelsAccumMetric`"
    # Classification without a threshold: take the argmax over `axis`.
    if is_class and thresh is None: dim_argmax = axis
    else:                           dim_argmax = None
    # Unless stated explicitly, apply a sigmoid exactly when thresholding class scores.
    if sigmoid is None: sigmoid = (is_class and thresh is not None)
    return OpenChannelsAccumMetric(func, dim_argmax=dim_argmax, sigmoid=sigmoid, thresh=thresh,
                                   to_np=True, invert_arg=True, **kwargs)

def F1Score(axis=-1, labels=None, pos_label=1, average='binary', sample_weight=None):
    "F1 score metric on the open-channel predictions, backed by `skm.f1_score`"
    f1_kwargs = dict(labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight)
    return skm_to__open_channels_fastai(skm.f1_score, axis=axis, **f1_kwargs)

def accuracy(inp, targ, axis=-1):
    "Accuracy of the open-channel part (element 1) of `inp` against element 1 of `targ`"
    scores, labels = inp[1], targ[1]
    # Hard class decisions over `axis`, flattened to line up with the labels.
    decided, labels = flatten_check(scores.argmax(dim=axis), labels)
    hits = (decided == labels).float()
    return hits.mean()

In [None]:
# Drop any previous Learner and release cached GPU memory before building a new one.
learn = None
gc.collect()
torch.cuda.empty_cache()
# Learner with the combined drift/open-channel loss; metrics are the macro F1
# over labels 0-10 and plain accuracy, both on the open-channel output.
learn = Learner(dls,model,loss_func=DriftChannelsLoss(),opt_func=Adam,
                metrics=[F1Score(labels=list(range(11)),average='macro'), accuracy])
# Train data-parallel in mixed precision.
learn.to_parallel().to_fp16()
learn.summary()

In [None]:
#learn.load('ref_256_4_4_50_0.1_64_400_cv0.9293_clean')

In [None]:
#lr_min, lr_steep=learn.lr_find()


In [None]:
# Train for 10 epochs with the one-cycle schedule and a peak learning rate of 5e-4.
learn.fit_one_cycle(10,lr_max=5e-4)#,pct_start=0.3)

epoch,train_loss,valid_loss,f1_score,accuracy,time
0,1.223913,1.280377,0.395723,0.568312,01:32
1,0.855124,1.015538,0.512022,0.73881,01:28


In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

In [None]:
# Take the last three entries of the recorder's log for the final epoch.
# NOTE(review): with the columns printed above (..., f1_score, accuracy, time) these
# would be F1, accuracy and epoch time, so `cv` is presumably the validation F1 - confirm.
cv,_,time = learn.recorder.log[-3:];cv,_,time

In [None]:
# Encode the hyperparameters and the `cv` score in the checkpoint name, then save the model.
modelname = f'ref_{dim}_{depth}_{heads}_{bucket_size}_{lsh_dropout}_{BS}_{WINDOW_SIZE}_cv{cv:0.04f}{DATA_SUFFIX}'
learn.save(modelname);modelname

In [None]:
# Display the (possibly wrapped) model held by the Learner.
learn.model

In [None]:
#learn.model=learn.model.module#.module.module.module

In [None]:
# Test-time augmentation: run the test set once time-reversed and once as-is,
# and sum the raw open-channel outputs of both passes per sample.
learn.model.eval()
test_preds = torch.zeros(test[0].numel(),11,dtype=torch.float)
with torch.no_grad():
    for flip in [True,False]:
        pos = 0   # write offset into test_preds; the loader is not shuffled
        for batch in progress_bar(test_dl):
            # Each batch holds one element: the (signal, channel-mask) pair the model expects.
            # Only the signal tensor is reversed; the previous code called torch.flip on the pair itself.
            signal, mask_channels = batch[0]
            if flip: signal = torch.flip(signal, dims=(2,))
            preds = learn.model((signal, mask_channels))
            open_channels = preds[1]
            # Undo the reversal so the outputs line up with the original sample order.
            if flip: open_channels = torch.flip(open_channels, dims=(1,))
            open_channels = open_channels.reshape(-1,11).float().cpu()
            # Advance by the rows actually produced, so a short final batch is handled
            # without assuming every batch contains BS windows.
            test_preds[pos:pos+len(open_channels)] += open_channels
            pos += len(open_channels)

In [None]:
# Final class per test sample: argmax over the 11 accumulated class scores.
open_channels = test_preds.argmax(dim=1) # torch.clamp(test_preds.round(),0,10).int().squeeze()#.item() #
open_channels.shape

In [None]:
# Plot the predicted open_channels over the whole test set.
plt.plot(open_channels)

In [None]:
# Display the prediction tensor.
open_channels

In [None]:
# Same plot as two cells above (duplicate cell).
plt.plot(open_channels)

In [None]:
# Plot the normalised test signal for visual comparison with the predictions.
plt.plot(test[0].flatten())

In [None]:
# The submission file is named after the saved model.
csv_fname = f'{modelname}.csv';csv_fname

In [None]:
# Build the submission: reuse the 'time' column of the sample submission, read as
# strings so the values are written back exactly as they appear in that file.
submission_csv_path = p_input / 'sample_submission.csv'
ss = pd.read_csv(submission_csv_path, dtype={'time': str})
test_preds_all = test_preds  # alias of the accumulated scores; not used again in the visible cells
test_pred_frame = pd.DataFrame({'time': ss['time'].astype(str), 'open_channels': open_channels})
test_pred_frame.to_csv(csv_fname, index=False)

In [None]:
!kaggle competitions submit -c 'liverpool-ion-switching' -f {csv_fname} -m 'trans 1 feat jitter flip=0.5 20 epochs flip_tta'