In [1]:
#Relevant packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchtext
import torchtext.vocab as vocab
import os
from datetime import date
today = date.today()

from misc import get_split_indices
from misc import export_results
from data_preprocessing import data_loader
from build_vocabulary import make_vocabulary
from create_dataset import NCBIDataset
from bert_builder import BERT
from trainer import BertTrainer

# ---------------------------------------------------------------------------
# Run configuration: paths, compute device and hyperparameters.
# ---------------------------------------------------------------------------

# Input data locations
local_data_dir = 'c:\\Users\\erika\\Desktop\\Exjobb\\data'  # local workstation
saga_data_dir = "/home/aeerik/data/raw/"                    # saga

# Output directory; handed to the trainer and used for the exported files
save_directory = 'c:\\Users\\erika\\Desktop\\Exjobb\\savefiles'

# Compute device: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data selection
data_path = local_data_dir  # CHANGE THIS to switch between the data locations
include_pheno = True
threshold_year = 1970
limit_data = True           # when True, only the first `reduced_samples` rows are kept
reduced_samples = 1000      # TODO: remove this later

# Sequence handling / masking
max_length = [88, 51, 37]   # passed to both the dataset and the model
mask_prob = 0.15

# Model size
embedding_dim = 32
enc_dim_inp = enc_dim_out = 32
attention_heads = 8
num_encoders = 4
drop_prob = 0.2

# Training schedule
epochs = 2
batch_size = 32
lr = 1e-3
stop_patience = 10

# Whether the trained model is exported after training
export_model = True

####################################################

# End-to-end run: report the device, load the data, build the vocabulary and
# datasets, create the model, train it, then export the model and the results.

# Report which device is used; on GPU, also clear PyTorch's CUDA cache.
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    torch.cuda.empty_cache()
else:
    print("Using CPU")

# Create the output directory up front so that a missing folder is detected
# (or fixed) before training starts rather than when exporting at the end.
os.makedirs(save_directory, exist_ok=True)

print(f"\n Retrieving data from: {data_path}")
print("Loading data...")
NCBI = data_loader(include_pheno, threshold_year, data_path)
print(f"Data correctly loaded, {len(NCBI)} samples found")

# The vocabulary is built from the full data set, i.e. before any subsampling
# below, so it is not restricted to the reduced sample.
print("Creating vocabulary...")
vocabulary = make_vocabulary(NCBI, include_pheno)
print("Vocabulary created with number of elements:", len(vocabulary))

if limit_data:
    print(f"Reducing samples to {reduced_samples}")
    NCBI = NCBI.head(reduced_samples)

# NOTE(review): 0.2 is presumably the validation fraction - confirm against
# get_split_indices.
train_indices, val_indices = get_split_indices(len(NCBI), 0.2)
train_set = NCBIDataset(NCBI.iloc[train_indices], vocabulary, max_length, mask_prob, include_pheno)
val_set = NCBIDataset(NCBI.iloc[val_indices], vocabulary, max_length, mask_prob, include_pheno)
print(f"Datasets has been created with {len(train_set)} samples in the training set and {len(val_set)} samples in the validation set")

print("Creating model...")
model = BERT(len(vocabulary), max_length, enc_dim_inp, enc_dim_out, attention_heads, num_encoders, drop_prob)
print("Model successfully loaded")
print("---------------------------------------------------------")
print("Starting training...")
trainer = BertTrainer(model, train_set, val_set, epochs, batch_size, lr, device, stop_patience, save_directory)
# Calling the trainer instance runs the training and returns the results.
results = trainer()
print("---------------------------------------------------------")

# Output files are prefixed with today's date and joined to the save directory
# with os.path.join so the separator matches the platform.
if export_model:
    print("Exporting model...")
    export_model_label = f"{today}model.pkl"
    trainer._save_model(os.path.join(save_directory, export_model_label))
print("Exporting results...")
export_results_label = f"{today}run.pkl"
export_results(results, os.path.join(save_directory, export_results_label))



ImportError: cannot import name 'make_vocabulary' from 'build_vocabulary' (c:\Users\erika\Desktop\Exjobb\repo\base\build_vocabulary.py)