---

# Load the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
classesDF  = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
unlabled_classesDF  = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv')
# Remove all nodes that have unknown class
classesDF = classesDF[classesDF['class'] != "unknown"]
# Get all nodes that have unknown class
unlabled_classesDF = unlabled_classesDF[unlabled_classesDF['class'] == "unknown"]

# If class = 2, then map to 0, else map to 1
classesDF['class'] = classesDF['class'].map({'2': 0, '1': 1})
unlabled_classesDF['class'] = unlabled_classesDF['class'].map({'unknown': -1})

classesDF.head(5)

Unnamed: 0,txId,class
3,232438397,0
9,232029206,0
10,232344069,0
11,27553029,0
16,3881097,0


In [3]:
unlabled_classesDF.head(5)

Unnamed: 0,txId,class
0,230425980,-1
1,5530458,-1
2,232022460,-1
4,230460314,-1
5,230459870,-1


In [4]:
featuresDF = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_features.csv')
unlabled_featuresDF = pd.read_csv('data/elliptic_bitcoin_dataset/elliptic_txs_features.csv')
# Change column names --> Column 1 is txId, Column 2 is timestep and the rest are unknown features
featuresDF.columns = ['txId', 'timestep'] + ['f' + str(i) for i in range(165)]
unlabled_featuresDF.columns = ['txId', 'timestep'] + ['f' + str(i) for i in range(165)]

# Remove all edges that do not appear in classesDF
featuresDF = featuresDF[featuresDF['txId'].isin(classesDF['txId'])]
unlabled_featuresDF = unlabled_featuresDF[unlabled_featuresDF['txId'].isin(unlabled_classesDF['txId'])]

featuresDF.head(5)

Unnamed: 0,txId,timestep,f0,f1,f2,f3,f4,f5,f6,f7,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
2,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
8,232029206,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792
9,232344069,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.137933,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
10,27553029,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.141519,...,-0.539735,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
15,3881097,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,-0.16364,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984


In [5]:
unlabled_featuresDF.head(5)

Unnamed: 0,txId,timestep,f0,f1,f2,f3,f4,f5,f6,f7,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
0,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117
4,230459870,1,0.96104,-0.081127,-1.201369,1.303743,0.333276,1.480381,-0.061584,-0.163577,...,-0.504702,-0.422589,-0.22679,-0.117629,0.018279,0.277775,0.413931,1.149556,-0.696053,-0.69554
5,230333930,1,-0.171264,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.161887,...,-0.569626,-0.607306,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792


In [6]:
unlabled_classesDF

Unnamed: 0,txId,class
0,230425980,-1
1,5530458,-1
2,232022460,-1
4,230460314,-1
5,230459870,-1
...,...,...
203762,157581340,-1
203764,173077460,-1
203765,158577750,-1
203767,158654197,-1


In [7]:
# Append the class to the featuresDF based on txID
featuresDF = featuresDF.merge(classesDF, on='txId')
unlabled_featuresDF = unlabled_featuresDF.merge(unlabled_classesDF, on='txId')
# Move features 'class' to first column
cols = list(featuresDF.columns)
cols = cols[:1] + [cols[-1]] + cols[1:-1]
featuresDF = featuresDF[cols]

featuresDF.head(5)

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
0,232438397,0,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
1,232029206,0,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,...,-0.577099,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792
2,232344069,0,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
3,27553029,0,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.539735,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
4,3881097,0,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984


In [8]:
unlabled_featuresDF

Unnamed: 0,txId,timestep,f0,f1,f2,f3,f4,f5,f6,f7,...,f156,f157,f158,f159,f160,f161,f162,f163,f164,class
0,5530458,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,-1
1,232022460,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,-1
2,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117,-1
3,230459870,1,0.961040,-0.081127,-1.201369,1.303743,0.333276,1.480381,-0.061584,-0.163577,...,-0.422589,-0.226790,-0.117629,0.018279,0.277775,0.413931,1.149556,-0.696053,-0.695540,-1
4,230333930,1,-0.171264,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.161887,...,-0.607306,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157199,157581340,49,-0.172974,-0.156732,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163636,...,0.647874,0.241128,0.241406,7.165536,1.085202,-0.131155,5.157442,-0.120613,-0.119792,-1
157200,173077460,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.135803,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,-1
157201,158577750,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.156418,...,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984,-1
157202,158654197,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163501,...,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,-1


In [9]:
# Move features 'class' to first column
unlabled_cols = list(unlabled_featuresDF.columns)
unlabled_cols = unlabled_cols[:1] + [unlabled_cols[-1]] + unlabled_cols[1:-1]
unlabled_featuresDF = unlabled_featuresDF[cols]

unlabled_featuresDF.head(5)

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
0,5530458,-1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,232022460,-1,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
2,230460314,-1,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117
3,230459870,-1,1,0.96104,-0.081127,-1.201369,1.303743,0.333276,1.480381,-0.061584,...,-0.504702,-0.422589,-0.22679,-0.117629,0.018279,0.277775,0.413931,1.149556,-0.696053,-0.69554
4,230333930,-1,1,-0.171264,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.569626,-0.607306,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792


In [10]:
featuresDF[10:25]

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
10,232470704,0,1,-0.1729,-0.184668,-1.201369,0.028105,-0.043875,0.054722,-0.061584,...,-0.569626,-0.582077,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
11,232033533,0,1,-0.156114,-0.184668,-1.201369,-0.046932,-0.043875,-0.02914,-0.061584,...,-0.577099,-0.600999,0.241128,0.241406,-0.098889,-0.08749,-0.084674,-0.140597,1.5197,1.521399
12,230473487,0,1,-0.17229,-0.184668,-1.201369,-0.046932,-0.043875,-0.02914,-0.061584,...,-0.539735,-0.569462,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
13,7089694,0,1,-0.157608,-0.181768,0.463609,0.328255,-0.043875,0.390171,-0.061584,...,0.087993,0.036052,1.46133,1.461369,0.018279,1.642711,4.052141,3.77909,1.5197,1.521399
14,231179595,0,1,-0.172729,-0.184668,-1.201369,-0.046932,-0.024025,-0.02914,-0.061584,...,-0.577099,-0.600999,0.241128,0.241406,-0.098889,-0.08749,-0.093204,-0.126239,1.299939,1.301521
15,231177927,0,1,-0.171669,-0.184668,-1.201369,0.028105,-0.043875,0.054722,-0.061584,...,-0.577099,-0.600999,0.241128,0.241406,0.135448,-0.068266,-0.131155,-0.011377,-0.120613,-0.119792
16,2758467,0,1,0.092328,1.239009,-0.646376,8.207194,-0.063725,8.608671,1.155601,...,-0.577099,-0.613614,0.241128,0.241406,0.252616,-0.049041,-0.131155,0.07477,-0.120613,-0.119792
17,232437171,0,1,-0.167243,-0.115037,-0.091383,0.403293,-0.043875,0.474034,-0.061584,...,-0.517316,-0.537925,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
18,3878886,0,1,-0.172886,-0.184668,-1.201369,-0.046932,-0.043875,-0.02914,-0.061584,...,-0.569626,-0.594691,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
19,231182296,0,1,-0.17231,-0.184668,-1.201369,-0.046932,-0.024025,-0.02914,-0.061584,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.040092,1.299939,1.301521


---

# Create the dataset class

In [11]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import datasets, transforms
import torcheval #https://pytorch.org/torcheval/stable/
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
from sklearn.metrics import ConfusionMatrixDisplay#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html
from torcheval.metrics.functional import multiclass_f1_score
#from sklearn.model_selection import train_test_split

In [12]:
class Data(Dataset):
    def __init__(self, X_train,y_train):
        
        self.labels = y_train
        self.features = X_train
        
        #self.vectors = self.data[self.features]
        #self.vectors = torch.tensor(self.data[self.features].values, dtype=torch.float32)

    def __len__(self):
        #print(self.data.shape)
        #print(self.labels)
        #print(self.features.shape)
        return len(self.features)

    def __getitem__(self, idx):
        
        X = torch.tensor(self.features[idx]).to(torch.float32).to(device)#.values
        y = torch.tensor(self.labels[idx]).to(torch.float32).to(device)#.values
        
        
        return X, y

In [130]:
total_length = len(featuresDF)
percentage_train = 0.7
percentage_test = 0.3
cut_train = int(total_length*percentage_train)
X_train = np.array(featuresDF[:cut_train].drop('class', axis=1))
y_train = np.array(featuresDF[:cut_train]['class'])
X_test = np.array(featuresDF[cut_train:].drop('class', axis=1))
y_test = np.array(featuresDF[cut_train:]['class'])
# Create an instance of the custom dataset
train_dataset = Data(X_train,y_train)
test_dataset = Data(X_test, y_test)
#[print(i.shape) for i in train_dataset]
#Instance of the custom validation dataset
#test_dataset = Data(os.path.join(dataset_root, "val"), transform=transform)

# Define the batch size for the DataLoader
batch_size = 3000

# Create the DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Create the DataLoader for validation
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [131]:
#If this cell fails you need to change the runtime of your colab notebook to GPU
# Go to Runtime -> Change Runtime Type and select GPU
assert torch.cuda.is_available(), "GPU is not enabled"

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [132]:
UX_train = np.array(unlabled_featuresDF.drop('class', axis=1))
Uy_train = np.array(unlabled_featuresDF['class'])
print(UX_train.shape)
# Create an instance of the custom dataset
unlabled_train_dataset = Data(UX_train,Uy_train)

#Instance of the custom validation dataset
#test_dataset = Data(os.path.join(dataset_root, "val"), transform=transform)

# Define the batch size for the DataLoader
batch_size = 3000

# Create the DataLoader
unlabled_train_loader = DataLoader(unlabled_train_dataset, batch_size=batch_size, shuffle=True)



(157204, 167)


In [133]:
UX_train[6]

array([ 2.32013274e+08,  1.00000000e+00, -1.23126716e-01, -1.84667551e-01,
       -1.20136880e+00, -1.21969600e-01, -4.38745479e-02, -1.13002009e-01,
       -6.15837941e-02, -1.12635219e-01, -1.19164648e-01, -4.97069644e-02,
       -1.14995790e-01, -2.87412859e-02, -3.53905526e-02, -4.29552993e-02,
       -1.32816149e-02,  6.28865779e-02, -1.28448427e-01, -1.33662990e-01,
       -1.17240089e-01,  8.87057872e-01,  8.84556525e-01, -1.39710213e-01,
       -1.48898494e-01, -8.01472697e-02, -1.55643640e-01, -1.07630095e-02,
       -1.21074518e-02, -1.39712010e-01, -1.48893792e-01, -8.01467358e-02,
       -1.55643608e-01, -1.06685611e-02, -1.20051821e-02, -2.46688307e-02,
       -3.12723905e-02, -2.30451564e-02, -2.62146552e-02,  1.42781371e-03,
        1.48264379e-03, -2.27215446e-01, -2.39368369e-01, -7.52555315e-02,
       -2.34951518e-01,  3.74680288e-02,  4.34442213e-02, -2.27203324e-01,
       -2.43236093e-01, -9.78946779e-02, -2.35896423e-01,  3.65766504e-02,
        4.23451348e-02, -

In [134]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.encoder = nn.Sequential(
            nn.Linear(in_features=167, out_features=50),
            #nn.Dropout(p=0.3),
            nn.Sigmoid(),
            nn.LayerNorm(50),
            nn.Dropout(p=0.2),
            nn.Linear(in_features = 50, out_features  = 1),
            nn.Sigmoid()
            #12x12x16
        )


    def forward(self, x):
        x = self.encoder(x)
        return x

In [135]:
def model_train(model, loader, optimizer, criterion, reshape=False):
    loss = 0
    model.train()

    for batch_features, labels in loader:
        # load it to the active device
        batch_features = batch_features.to(device)
        labels = labels.to(device)

        # reset the gradients back to zero
        optimizer.zero_grad()

        # compute predictions
        outputs = model(batch_features)

        # compute training loss
        train_loss = criterion(outputs, labels)

        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer.step()

        # add the batch training loss to epoch loss
        loss += train_loss.item()

    # compute the epoch training loss
    loss = loss / len(loader)
    print("epoch : {}/{}, Train loss = {:.6f}".format(epoch + 1, epochs, loss))

def model_test(model, test_loader, criterion, reshape=False):
    loss = 0
    model.eval()
    tot_pred_labels = torch.tensor([]).to(device)
    tot_true_labels = torch.tensor([]).to(device)

    for features, labels in test_loader:
        features = features.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(features)
        print("outputs", outputs)
        outputs = outputs.squeeze()
        
        #outputs = torch.nn.functional.sigmoid(outputs)
        #pred_labels = torch.argmax(outputs, dim = 1)
        
        pred_labels = torch.round(outputs)
        #print("outputs",outputs, outputs.shape)
        #print("labels",pred_labels)
        test_acc = torch.sum(pred_labels == labels)/len(pred_labels)
        print("TEST ACCURACY:", test_acc)
        true_labels = labels
        tot_pred_labels = torch.cat((tot_pred_labels, pred_labels), dim = 0)
        tot_true_labels = torch.cat((tot_true_labels, true_labels), dim = 0)
        # compute training loss
        
        test_loss = criterion(outputs,labels)

        # add the batch training loss to epoch loss
        loss += test_loss.item()

    # compute the epoch test loss
    
    loss = loss / len(test_loader)

    # display the epoch training loss
    print("epoch : {}/{}, Test loss = {:.6f}".format(epoch + 1, epochs, loss))
    return tot_pred_labels,tot_true_labels

In [136]:
def add_samples(model, unlabled_loader,threshold):
    added_samples = []
    non_added_samples = []
    true_added_labels = torch.tensor([]).to(device)
    pred_added_labels = torch.tensor([]).to(device)
    tot_pred_labels = torch.tensor([]).to(device)
    tot_true_labels = torch.tensor([]).to(device)
    #new_data = labled_data

    for img, lab in unlabled_loader:
        img = img.to(device)
        lab = lab.to(device)
        with torch.no_grad():
            self_outputs = model(img)
            
            self_outputs = self_outputs.squeeze()
            

        #soft_outputs = torch.nn.functional.sigmoid(self_outputs)
        
        #probs,pred_labels = torch.max(soft_outputs, dim=0)
        #print("probs:",probs.shape, probs)
        for idx,prob in enumerate(self_outputs):
            if prob.item() > threshold:

                added_samples.append((img[idx], torch.round(prob).item()))
                #print("added_samp:", added_samples[0][0],added_samples[0][1],added_samples)
                true_added_labels = torch.cat((true_added_labels, lab[idx].reshape(1)), dim = 0)
                pred_added_labels = torch.cat((pred_added_labels, torch.round(prob).reshape(1)), dim = 0)
                
            elif prob.item() < (1-threshold):
                added_samples.append((img[idx], torch.round(prob).item()))
                #print("added_samp:", added_samples[0][0],added_samples[0][1],added_samples)
                true_added_labels = torch.cat((true_added_labels, lab[idx].reshape(1)), dim = 0)
                pred_added_labels = torch.cat((pred_added_labels, torch.round(prob).reshape(1)), dim = 0)
            else:
                non_added_samples.append((img[idx], -1))

        true_labels = lab
        tot_pred_labels = torch.cat((tot_pred_labels, torch.round(prob).unsqueeze(0)), dim = 0)
        tot_true_labels = torch.cat((tot_true_labels, true_labels), dim = 0)


    num_correct_preds = torch.sum(torch.eq(true_added_labels,pred_added_labels)).item()

    print(f'number of items added to the labelled data: {len(added_samples)}')
    print(f'correctly classified items added to the labelled data: {num_correct_preds}')
    print(f'incorrectly classified items added to the labelled data: {len(added_samples) - num_correct_preds}')
    
    return added_samples, non_added_samples #new_data

In [137]:
def self_train(model, X, y,UX_train,Uy_train,threshold, optimizer, criterion, reshape=False):
    loss = 0
    model.train()
    train_dataset = Data(X,y)
    labled_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    if len(UX_train) != 0:
        unlabled_dataset = Data(UX_train, Uy_train)
        unlabled_loader = torch.utils.data.DataLoader(unlabled_dataset, batch_size=batch_size, shuffle=True)
    
    print(f'labled loader length: {len(labled_loader)}')
    
    for batch_features,labels in labled_loader:
        
        # load it to the active device
        
        batch_features = batch_features.to(device)
        labels = labels.to(device)

        # reset the gradients back to zero
        optimizer.zero_grad()
        # compute reconstructions
        outputs = model(batch_features)
        outputs = outputs.squeeze()
        #print(outputs)
        #outputs = torch.nn.functional.sigmoid(outputs.squeeze())
        
        #print(outputs.dtype, labels.dtype)
        # compute training loss
        #print("outputs:",outputs.shape)
        labels = labels.to(torch.float32)
        train_loss = criterion(outputs, labels)
        pred_labels = torch.round(outputs)
        train_acc = torch.sum(pred_labels == labels)/len(pred_labels)
        print("TRAIN ACCURACY:", train_acc)
        # compute accumulated gradients
        train_loss.backward()

        # perform parameter update based on current gradients
        optimizer.step()

        # add the batch training loss to epoch loss
        loss += train_loss.item()
        
    # compute the epoch training loss
    loss = loss / len(labled_loader)
    print("epoch : {}/{}, Train loss = {:.6f}".format(epoch + 1, epochs, loss))
    
    if len(UX_train) != 0:
        print("ux_train")
        add_data, non_add_data = add_samples(model, unlabled_loader,threshold)
        if len(add_data) != 0:
            X_unlabled= [np.array(XU.cpu()) for XU,yU in add_data]
            y_unlabled = [np.array(yU) for XU,yU in add_data]
            #new_data = labled_data + add_data
            new_X = np.concatenate((X, np.array(X_unlabled)), axis = 0)
            new_y = np.concatenate((y,np.array(y_unlabled)), axis = 0)
        
            X_non_add= [np.array(XU_non.cpu()) for XU_non,yU_non in non_add_data]
            y_non_add = [np.array(yU_non) for XU_non,yU_non in non_add_data]
        
            return new_X, new_y, X_non_add, y_non_add
        else:
            return X, y,UX_train, Uy_train
    else: 
        return X, y, [],[]

In [143]:
X_train = np.array(featuresDF[:cut_train].drop('class', axis=1))
y_train = np.array(featuresDF[:cut_train]['class'])
X_test = np.array(featuresDF[cut_train::].drop('class', axis=1))
y_test = np.array(featuresDF[cut_train::]['class'])
# Create an instance of the custom dataset
train_dataset = Data(X_train,y_train)

In [144]:
UX_train = np.array(unlabled_featuresDF.drop('class', axis=1))
Uy_train = np.array(unlabled_featuresDF['class'])
print(UX_train.shape)
print(np.unique(Uy_train))
# Create an instance of the custom dataset
unlabled_train_dataset = Data(UX_train,Uy_train)

#Instance of the custom validation dataset
#test_dataset = Data(os.path.join(dataset_root, "val"), transform=transform)

# Define the batch size for the DataLoader
batch_size = 3000

# Create the DataLoader
unlabled_train_loader = DataLoader(unlabled_train_dataset, batch_size=batch_size, shuffle=True)

(157204, 167)
[-1]


In [145]:
test_dataset = Data(X_test, y_test)
# Create the DataLoader for validation
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [146]:
model = CNN().to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

criterion = nn.BCELoss()

In [147]:
epochs = 10000
threshold = 0.99
for epoch in range(epochs):
    
    X_train, y_train, UX_train, Uy_train = self_train(model, X_train, y_train,UX_train,Uy_train ,threshold, optimizer, criterion, reshape=True)
    tot_pred_labels,tot_true_labels = model_test(model, test_loader, criterion, reshape=True)

num_correct_preds = torch.sum(torch.eq(tot_true_labels,tot_pred_labels)).item()
accuracy = num_correct_preds/len(tot_true_labels)
f1_score = multiclass_f1_score(tot_pred_labels,tot_true_labels)
confusion = confusion_matrix(tot_true_labels.cpu(),tot_pred_labels.cpu() )
avg_score = (accuracy + f1_score)/2
#best_score = actualize_best_results(best_score,(avg_score.item(), threshold))

print(f'Accuracy : {accuracy} threshold: {threshold}' )
print(f'F1 score: {f1_score.cpu().item()} threshold {threshold}')
print("\n")

labled loader length: 11
TRAIN ACCURACY: tensor(0.8207, device='cuda:0')
TRAIN ACCURACY: tensor(0.8267, device='cuda:0')
TRAIN ACCURACY: tensor(0.8323, device='cuda:0')
TRAIN ACCURACY: tensor(0.8360, device='cuda:0')
TRAIN ACCURACY: tensor(0.8420, device='cuda:0')
TRAIN ACCURACY: tensor(0.8403, device='cuda:0')
TRAIN ACCURACY: tensor(0.8300, device='cuda:0')
TRAIN ACCURACY: tensor(0.8323, device='cuda:0')
TRAIN ACCURACY: tensor(0.8420, device='cuda:0')
TRAIN ACCURACY: tensor(0.8447, device='cuda:0')
TRAIN ACCURACY: tensor(0.8524, device='cuda:0')
epoch : 1/10000, Train loss = 0.558505
ux_train
number of items added to the labelled data: 0
correctly classified items added to the labelled data: 0
incorrectly classified items added to the labelled data: 0
outputs tensor([[0.3854],
        [0.3854],
        [0.3854],
        ...,
        [0.3854],
        [0.3854],
        [0.3854]], device='cuda:0')
TEST ACCURACY: tensor(0.9427, device='cuda:0')
outputs tensor([[0.3854],
        [0.3854],

KeyboardInterrupt: 

epochs=5
for epoch in range(epochs):
    model_train(model, train_loader, optimizer, criterion, reshape=True)
    tot_pred_labels,tot_true_labels = model_test(model, val_loader, criterion, reshape=True)

In [None]:
torch.round(torch.tensor([0.55, 0.7,0.2]))

### Load the data into the dataset class

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

licitSamples = featuresDF[featuresDF['class'] == 0]
illicitSamples = featuresDF[featuresDF['class'] == 1]

# Obtain the needed samples to balance the dataset
AE_data = licitSamples.sample(n=len(licitSamples) - len(illicitSamples), random_state=42)

# Normalize the data
scaler = StandardScaler()
scaler.fit(AE_data.iloc[:, 2:])
AE_data.iloc[:, 2:] = scaler.transform(AE_data.iloc[:, 2:])

# Store licitSamples in a csv file
#AE_data.to_csv('../../../data/noPCA_AEData2.csv', index=False)

trainAE, valAE = train_test_split(AE_data, test_size=0.15, random_state=42)

train_dataset = Data(trainAE)
val_dataset = Data(valAE)

print(f'Train size: {len(train_dataset)}')
print(f'Validation size: {len(val_dataset)}')

### Load the datasets into Dataloaders

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=True)

---

# Create the model class

In [None]:
import torch.nn as nn

# Create autoencoder for input vectors of size 65

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(166, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 86),
            nn.LeakyReLU(),
            nn.Linear(86, 48),
            nn.LeakyReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(48, 86),
            nn.LeakyReLU(),
            nn.Linear(86, 128),
            nn.LeakyReLU(),
            nn.Linear(128, 166),
            nn.LeakyReLU() 
        )
    
    def forward(self, in_features):
        encoded_features = self.encoder(in_features)
        out = self.decoder(encoded_features)
        return out

---

# Create the train and validate functions

In [None]:
import matplotlib.pyplot as plt
# from tqdm import tqdm

### Create train function

In [None]:
def train(model, criterion, optimizer, train_loader, device):
    model.to(device)
    model.train()
    train_loss = 0

    for batch_features in train_loader:
        batch_features = batch_features.view(-1, 166).to(device)

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_features)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    return train_loss

### Create validate function

In [None]:
def validate(model, criterion, val_loader, device):
    model.to(device)
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_features in val_loader:
            batch_features = batch_features.view(-1, 166).to(device)
            outputs = model(batch_features)
            loss = criterion(outputs, batch_features)
            val_loss += loss.item()*10
    return val_loss

### Create function to do both

In [None]:
def train_and_val(model, criterion, optimizer, scheduler, train_loader, val_loader, epochs, device='cuda', name='AE_loss'):
    losses = []
    for epoch in range(1, epochs+1):
        train_loss = train(model, criterion, optimizer, train_loader, device)
        val_loss = validate(model, criterion, val_loader, device)
        scheduler.step()
        if epoch%100==0:
            print(f'Epoch: {epoch}, Training Loss: {train_loss}, Validation Loss: {val_loss}\n')
            losses.append([train_loss, val_loss])
    # plot the evolving loss
    plt.figure(figsize=(12, 8))
    plt.plot([i[0] for i in losses], label='Training Loss')
    plt.plot([i[1] for i in losses], label='Validation Loss')
    plt.legend()
    plt.savefig('models/' + str(name) + '.png')
    plt.show()
    return losses

---

# Train small sample of model

In [None]:
model = Autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
losses = train_and_val(model, criterion, optimizer, scheduler, train_loader, val_loader, epochs=3500, device=device, name='noPCA_Autoencoder_secondRUN')

# Save the model
torch.save(model.state_dict(), 'models/NoPCA_Autoencoder_secondRUN.pth')