In [1]:
from functions import *
import pickle

logger = get_logger('log_preprocess')

seed_everything(seed=42)

In [2]:
def run_tpbg_on_dataset(database_name):

    try:
    
        database = load_data(database_name=database_name)
        data_preprocessed = SimplePreprocessing().transform(database.data)
        vectorizer = TfidfVectorizer()
        data_vectorized_fit = vectorizer.fit_transform(data_preprocessed)
        y = database.target
        y_train_real = y.copy()
        
        y[database.is_train==0] = -1
        
        def eval(self):        
            self.create_transduction()    
            y_predicted = self.transduction_[database.is_train==0]    
            y_real = y_train_real[database.is_train==0]    
            print(classification_report(y_predicted, y_real, digits=4))
            
        K = len(database.target_names)
        tpbg = TPBG(K, alpha=0.05, beta=0.0001, local_max_itr=30,
                         global_max_itr=5, local_threshold=1e-6, global_threshold=1e-6,
                         save_interval=-1, 
                         feature_names=vectorizer.get_feature_names_out(), 
                         target_name=database.target_names, 
                         silence=True, eval_func=eval)   
        tpbg.fit(data_vectorized_fit, y)
        eval(tpbg)
        logger.info(f'Loaded TPBG for {database_name} with K={K}.')
        
        # doc2vec embeddings
        sentences = [re.findall("[a-z\-]+", s.lower()) for s in data_preprocessed]
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
        doc2vec_model = Doc2Vec(documents, vector_size=10, window=10, min_count=1, workers=4)
        doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
        document_features = doc2vec_model.dv.vectors
        logger.info(f'Computed 400 document features with doc2vec.')

        tpbg.document_features = document_features
        tpbg.is_train = database.is_train
        tpbg.n_class = len(database.target_names)

        # Final embeddings
        tpbg_train = TPBG(K)
        tpbg_train.log_A = tpbg.log_A[database.is_train==1]
        tpbg_train.log_B = tpbg.log_B
        tpbg_train.document_features = tpbg.document_features[database.is_train==1]
        tpbg_train.Xc = tpbg.X[database.is_train==1]
        tpbg_train.y = y_train_real[database.is_train==1]
        tpbg_train.n_class = len(database.target_names)

        tpbg_test = TPBG(K)
        tpbg_test.log_A = tpbg.log_A[database.is_train==0]
        tpbg_test.log_B = tpbg.log_B
        tpbg_test.document_features = tpbg.document_features[database.is_train==0]
        tpbg_test.Xc = tpbg.X[database.is_train==0]
        tpbg_test.y = y_train_real[database.is_train==0]
        tpbg_test.n_class = len(database.target_names)
        
        return tpbg, tpbg_train, tpbg_test

    except Exception as e:
        logger.info(f'Error fitting TPBG for {database_name}: \n {e}')

In [5]:
import os
database_name = 'x'

base_path = f'./pickle_objects/preprocess/tpbg/{database_name}/'
os.makedirs(os.path.dirname(base_path), exist_ok=True)

In [18]:
databse_list = ['20ng', 'bbc', 'classic4', 'nsf', 'cstr', 'dmoz_computers', 'dmoz_health', 'dmoz_science', 'dmoz_sports', 're8']
Kz_list = [None]
docst_list = [None, 'merge', 'replace']
hidden_channels_list = [10, 20, 100, 200]
num_layers_list = [3]
p_dropout_list = [0.0, 0.2, 0.6]
loss_function_list = ['ce', 'fl']
gnn_version_list = [1, 2, 3]
num_epochs = [1500]

iter_params = itertools.product(databse_list, 
                           Kz_list, 
                           docst_list, 
                           hidden_channels_list,
                           num_layers_list, 
                           p_dropout_list,
                           loss_function_list, 
                           gnn_version_list,
                           num_epochs)

#logger.info('Running experiments on datasets with heterographs and GNNs.')

for params in iter_params:
    print(params)

for params in iter_params:
    database_name = params[0]
    K = params[1]
    hidden_channels = params[2]
    num_layers = params[3]
    p_dropout = params[4]
    docf = params[5]
    docst = params[6]
    loss_function = params[7]
    version = params[8]
    num_epochs = params[9]

    print(database_name, 
    hidden_channels, 
    num_layers, 
    p_dropout, 
    docf, 
    docst,
    loss_function, 
    version,
    num_epochs)



('20ng', None, None, 10, 3, 0.0, 'ce', 1, 1500)
('20ng', None, None, 10, 3, 0.0, 'ce', 2, 1500)
('20ng', None, None, 10, 3, 0.0, 'ce', 3, 1500)
('20ng', None, None, 10, 3, 0.0, 'fl', 1, 1500)
('20ng', None, None, 10, 3, 0.0, 'fl', 2, 1500)
('20ng', None, None, 10, 3, 0.0, 'fl', 3, 1500)
('20ng', None, None, 10, 3, 0.2, 'ce', 1, 1500)
('20ng', None, None, 10, 3, 0.2, 'ce', 2, 1500)
('20ng', None, None, 10, 3, 0.2, 'ce', 3, 1500)
('20ng', None, None, 10, 3, 0.2, 'fl', 1, 1500)
('20ng', None, None, 10, 3, 0.2, 'fl', 2, 1500)
('20ng', None, None, 10, 3, 0.2, 'fl', 3, 1500)
('20ng', None, None, 10, 3, 0.6, 'ce', 1, 1500)
('20ng', None, None, 10, 3, 0.6, 'ce', 2, 1500)
('20ng', None, None, 10, 3, 0.6, 'ce', 3, 1500)
('20ng', None, None, 10, 3, 0.6, 'fl', 1, 1500)
('20ng', None, None, 10, 3, 0.6, 'fl', 2, 1500)
('20ng', None, None, 10, 3, 0.6, 'fl', 3, 1500)
('20ng', None, None, 20, 3, 0.0, 'ce', 1, 1500)
('20ng', None, None, 20, 3, 0.0, 'ce', 2, 1500)
('20ng', None, None, 20, 3, 0.0, 'ce', 3

In [14]:
for value in x:
    print(value[0])

20ng
20ng
20ng
20ng
20ng
20ng
bbc
bbc
bbc
bbc
bbc
bbc
classic4
classic4
classic4
classic4
classic4
classic4
nsf
nsf
nsf
nsf
nsf
nsf
cstr
cstr
cstr
cstr
cstr
cstr
dmoz_computers
dmoz_computers
dmoz_computers
dmoz_computers
dmoz_computers
dmoz_computers
dmoz_health
dmoz_health
dmoz_health
dmoz_health
dmoz_health
dmoz_health
dmoz_science
dmoz_science
dmoz_science
dmoz_science
dmoz_science
dmoz_science
dmoz_sports
dmoz_sports
dmoz_sports
dmoz_sports
dmoz_sports
dmoz_sports
re8
re8
re8
re8
re8
re8


In [3]:
databases = ['20ng', 'bbc', 'classic4', 'nsf', 'cstr', 'dmoz_computers', 'dmoz_health', 'dmoz_science', 'dmoz_sports', 're8']

for database_name in databases:
    print(f'TESTING FOR {database_name}')
    tpbg, tpbg_train, tpbg_test = run_tpbg_on_dataset(database_name)

    heterodata_pbg_train = get_heterograph_pbg_features(tpbg_train, doc_features=None)
    heterodata_pbg_test = get_heterograph_pbg_features(tpbg_test, doc_features=None)

    df_experiment = run_heterognn_splitted(database_name=database_name+'t',
           description='', 
           heterodata_train=heterodata_pbg_train,
           heterodata_test=heterodata_pbg_test,
           hidden_channels=100,
           num_layers=2,
           p_dropout=0.2,
           num_epochs=1000, 
           aggr='sum',
           version=1, 
           loss_function='ce', 
           verbose=False)
    print(df_experiment)
    print('*************************************')

[INFO][2023-05-25 20:46:29 - Mod: functions - Func: load_data - Line: 599]: Loaded data for nsf. 5 classes: ['oceanography', 'politic', 'data', 'theory', 'ecology']. Number of documents: 3326.


TESTING FOR nsf


initialing.[]:   :   0%|          | 0/4476 [00:00<?, ?it/s]

[INFO][2023-05-25 20:47:02 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for nsf with K=5.


              precision    recall  f1-score   support

           0     0.8359    0.9194    0.8757       360
           1     0.8963    0.9391    0.9172       230
           2     0.8696    0.7292    0.7932       192
           3     0.9322    0.7783    0.8483       212
           4     0.8371    0.8843    0.8600       337

    accuracy                         0.8640      1331
   macro avg     0.8742    0.8501    0.8589      1331
weighted avg     0.8668    0.8640    0.8626      1331



[INFO][2023-05-25 20:47:05 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:47:05 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:47:05 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1995, num. target nodes: 4476, num. classes: 5
[INFO][2023-05-25 20:47:05 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:47:05 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1331, num. target nodes: 4476, num. classes: 5
 53%|█████████████████████▎                  | 532/1000 [00:21<00:19, 24.27it/s][INFO][2023-05-25 20:47:27 - Mod: functions - Func: run_heterognn_splitted - Line: 1044]: Early stopping at epoch 533.
 53%|█████████████████████▎                  |

(0.6285914778709412, 0.8859966696690007, 0.8858001502629602, 32)
*************************************
TESTING FOR cstr


initialing.[]:   :   0%|          | 0/4426 [00:00<?, ?it/s]

[INFO][2023-05-25 20:47:34 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for cstr with K=4.


              precision    recall  f1-score   support

           0     0.4510    0.8846    0.5974        26
           1     0.5500    0.8148    0.6567        27
           2     0.8947    0.6296    0.7391        27
           3     0.8000    0.2000    0.3200        40

    accuracy                         0.5833       120
   macro avg     0.6739    0.6323    0.5783       120
weighted avg     0.6894    0.5833    0.5502       120



[INFO][2023-05-25 20:47:34 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:47:34 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:47:34 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 179, num. target nodes: 4426, num. classes: 4
[INFO][2023-05-25 20:47:34 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:47:34 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 120, num. target nodes: 4426, num. classes: 4
 53%|█████████████████████▎                  | 532/1000 [00:16<00:14, 31.73it/s][INFO][2023-05-25 20:47:51 - Mod: functions - Func: run_heterognn_splitted - Line: 1044]: Early stopping at epoch 533.
 53%|█████████████████████▎                  | 5

(0.8563113808631897, 0.746734964993392, 0.75, 32)
*************************************
TESTING FOR dmoz_computers


initialing.[]:   :   0%|          | 0/6058 [00:00<?, ?it/s]

[INFO][2023-05-25 20:48:24 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for dmoz_computers with K=5.


              precision    recall  f1-score   support

           0     0.8200    0.8159    0.8180       201
           1     0.9300    0.8651    0.8964       215
           2     0.9250    0.8447    0.8831       219
           3     0.5700    0.7170    0.6351       159
           4     0.7650    0.7427    0.7537       206

    accuracy                         0.8020      1000
   macro avg     0.8020    0.7971    0.7972      1000
weighted avg     0.8156    0.8020    0.8068      1000



[INFO][2023-05-25 20:48:26 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:48:26 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:48:26 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1500, num. target nodes: 6058, num. classes: 5
[INFO][2023-05-25 20:48:26 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:48:26 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1000, num. target nodes: 6058, num. classes: 5
 74%|█████████████████████████████▋          | 741/1000 [00:34<00:11, 21.94it/s][INFO][2023-05-25 20:49:00 - Mod: functions - Func: run_heterognn_splitted - Line: 1044]: Early stopping at epoch 741.
 74%|█████████████████████████████▋          |

(0.7568603754043579, 0.8060213274892131, 0.808, 240)
*************************************
TESTING FOR dmoz_health


initialing.[]:   :   0%|          | 0/6248 [00:00<?, ?it/s]

[INFO][2023-05-25 20:49:29 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for dmoz_health with K=5.


              precision    recall  f1-score   support

           0     0.8450    0.8802    0.8622       192
           1     0.8700    0.8286    0.8488       210
           2     0.8600    0.9247    0.8912       186
           3     0.8550    0.8507    0.8529       201
           4     0.7850    0.7441    0.7640       211

    accuracy                         0.8430      1000
   macro avg     0.8430    0.8457    0.8438      1000
weighted avg     0.8424    0.8430    0.8422      1000



[INFO][2023-05-25 20:49:31 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:49:31 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:49:31 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1500, num. target nodes: 6248, num. classes: 5
[INFO][2023-05-25 20:49:31 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:49:31 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1000, num. target nodes: 6248, num. classes: 5
100%|███████████████████████████████████████| 1000/1000 [00:48<00:00, 20.64it/s]
[INFO][2023-05-25 20:50:20 - Mod: functions - Func: run_heterognn_splitted - Line: 1047]: Optimal sol. database dmoz_healtht: 
GNN ver. 1, lf. ce 
epoch: 747/1000, 

(0.5500752329826355, 0.858656374632808, 0.858, 747)
*************************************
TESTING FOR dmoz_science


initialing.[]:   :   0%|          | 0/7413 [00:00<?, ?it/s]

[INFO][2023-05-25 20:50:52 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for dmoz_science with K=5.


              precision    recall  f1-score   support

           0     0.7850    0.7371    0.7603       213
           1     0.8950    0.8483    0.8710       211
           2     0.7050    0.8198    0.7581       172
           3     0.8950    0.7397    0.8100       242
           4     0.7050    0.8704    0.7790       162

    accuracy                         0.7970      1000
   macro avg     0.7970    0.8030    0.7957      1000
weighted avg     0.8081    0.7970    0.7983      1000



[INFO][2023-05-25 20:50:54 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:50:54 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:50:54 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1500, num. target nodes: 7413, num. classes: 5
[INFO][2023-05-25 20:50:54 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:50:54 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1000, num. target nodes: 7413, num. classes: 5
100%|███████████████████████████████████████| 1000/1000 [00:55<00:00, 17.98it/s]
[INFO][2023-05-25 20:51:49 - Mod: functions - Func: run_heterognn_splitted - Line: 1047]: Optimal sol. database dmoz_sciencet: 
GNN ver. 1, lf. ce 
epoch: 778/1000,

(0.9145424365997314, 0.8084426922498174, 0.81, 778)
*************************************
TESTING FOR dmoz_sports


initialing.[]:   :   0%|          | 0/5123 [00:00<?, ?it/s]

[INFO][2023-05-25 20:52:21 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for dmoz_sports with K=5.


              precision    recall  f1-score   support

           0     0.8750    0.8537    0.8642       205
           1     0.7400    0.9193    0.8199       161
           2     0.8650    0.8782    0.8715       197
           3     0.8850    0.8510    0.8676       208
           4     0.9100    0.7948    0.8485       229

    accuracy                         0.8550      1000
   macro avg     0.8550    0.8594    0.8544      1000
weighted avg     0.8614    0.8550    0.8556      1000



[INFO][2023-05-25 20:52:23 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:52:23 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:52:23 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1500, num. target nodes: 5123, num. classes: 5
[INFO][2023-05-25 20:52:23 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:52:23 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 1000, num. target nodes: 5123, num. classes: 5
 58%|███████████████████████                 | 578/1000 [00:24<00:17, 24.47it/s][INFO][2023-05-25 20:52:48 - Mod: functions - Func: run_heterognn_splitted - Line: 1044]: Early stopping at epoch 578.
 58%|███████████████████████                 |

(0.4352548122406006, 0.8737927365595427, 0.874, 77)
*************************************
TESTING FOR re8


[INFO][2023-05-25 20:52:48 - Mod: functions - Func: load_data - Line: 599]: Loaded data for re8. 8 classes: ['acq', 'trade', 'earn', 'crude', 'interest', 'money', 'ship', 'grain']. Number of documents: 7674.


initialing.[]:   :   0%|          | 0/19397 [00:00<?, ?it/s]

[INFO][2023-05-25 20:55:04 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 29]: Loaded TPBG for re8 with K=8.


              precision    recall  f1-score   support

           0     0.7514    0.9609    0.8433       717
           1     0.9389    0.6989    0.8013       176
           2     0.7234    1.0000    0.8395      1135
           3     0.8933    0.8171    0.8535       164
           4     0.7870    0.3664    0.5000       232
           5     0.8462    0.5562    0.6712       178
           6     0.9828    0.4161    0.5846       137
           7     0.7000    0.0423    0.0798       331

    accuracy                         0.7609      3070
   macro avg     0.8279    0.6072    0.6467      3070
weighted avg     0.7723    0.7609    0.7103      3070



[INFO][2023-05-25 20:55:13 - Mod: 18719587 - Func: run_tpbg_on_dataset - Line: 37]: Computed 400 document features with doc2vec.
[INFO][2023-05-25 20:55:14 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:55:14 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 4604, num. target nodes: 19397, num. classes: 8
[INFO][2023-05-25 20:55:14 - Mod: functions - Func: get_heterograph_pbg_features - Line: 869]: Creating source-target edges.
[INFO][2023-05-25 20:55:14 - Mod: functions - Func: get_heterograph_pbg_features - Line: 895]: Generated bipartite graph: num. source nodes: 3070, num. target nodes: 19397, num. classes: 8
 56%|██████████████████████▍                 | 561/1000 [02:24<02:04,  3.52it/s][INFO][2023-05-25 20:57:39 - Mod: functions - Func: run_heterognn_splitted - Line: 1044]: Early stopping at epoch 561.
 56%|██████████████████████▍                

(1.0257148742675781, 0.8029578967792362, 0.7964169381107492, 60)
*************************************


In [None]:
#tpbg, tpbg_train, tpbg_test = run_tpbg_on_dataset('bbc_news')

dataframe = pd.read_csv('data/bbc_news.csv', sep=',')
dataframe_train, dataframe_test = train_test_split(dataframe, test_size=0.4, random_state=1234, stratify=dataframe[['category']])
dataframe_train['is_train'] = 1.0
dataframe_test['is_train'] = 0.0

target_names = dataframe["category"].unique().tolist()
mapping_categories = {}
for idx, element in enumerate(target_names):
    mapping_categories[element] = idx
data = []
target = []
is_train = []
filenames = None
DESCR = '''All rights, including copyright, in the content of the original articles are owned by the BBC. Consists of 2225 documents from the BBC news website corresponding to stories in five topical areas from 2004-2005. Class Labels: 5 (business, entertainment, politics, sport, tech).'''
for index, row in dataframe.iterrows():
    data.append(row['text'])
    target.append(mapping_categories[row['category']])
    is_train.append(row['is_train'])

target = np.array(target)
is_train = np.array(is_train)
database = Database(target_names, data, target, is_train, filenames, DESCR)

In [None]:
heterodata_pbg_train = get_heterograph_pbg_features(tpbg_train, doc_features=None)
heterodata_pbg_test = get_heterograph_pbg_features(tpbg_test, doc_features=None)

df_experiment = run_heterognn_splitted(database_name='20ng-t',
       description='', 
       heterodata_train=heterodata_pbg_train,
       heterodata_test=heterodata_pbg_test,
       hidden_channels=100,
       num_layers=2,
       p_dropout=0.2,
       num_epochs=1000, 
       aggr='sum',
       version=1, 
       loss_function='fl', 
       verbose=True)

In [None]:
for i in range(100):
    print(f'i: {i}, train_y: {tpbg_train.y[i]}, train_loga:{tpbg_train.log_A[i]}, test_y: {tpbg_test.y[i]}, test_loga:{tpbg_test.log_A[i]}')


In [None]:
database = load_data(database_name='20ng')
data_preprocessed = SimplePreprocessing().transform(database.data)
vectorizer = TfidfVectorizer()
data_vectorized_fit = vectorizer.fit_transform(data_preprocessed)
y = database.target
y_train_real = y.copy()

unlabeled_set = database.target[np.where(database.is_train==0)]
y[database.is_train==0] = -1

In [None]:
heterodata_pbg_train

In [None]:
heterodata_pbg_test.__dict__

In [None]:
def run_heterognn_splitted(database_name,
                           description, 
                           heterodata_train,
                           heterodata_test,
                           hidden_channels,
                           num_layers,
                           p_dropout,
                           num_epochs, 
                           aggr,
                           version, 
                           loss_function, 
                           verbose=False):

    try:

        import warnings
        import timeit 

        warnings.simplefilter(action='ignore', category=FutureWarning)

        time_start = timeit.default_timer()

        model = HeteroGNN(metadata=heterodata_train.metadata(), 
                          hidden_channels=hidden_channels, 
                          out_channels=heterodata_train['source']['num_classes'],
                          num_layers=num_layers,
                          p_dropout=p_dropout,
                          aggr=aggr, 
                          version=version)
        min_loss = np.inf
        max_acc = 0
        patience = int(num_epochs/2)
        epoch_convergence = 0

        output_list = []
        df = pd.DataFrame(columns=['database_name', 'description', 'hidden_channels', 'num_layers', 'p_dropout', 'loss_function', 'version', 'loss_train', 'micro_train', 'acc_train', 'loss_test', 'micro_test', 'acc_test', 'epoch', 'epoch_convergence', 'elapsed_time'])

        model_name = f"model_{database_name}_{description}_hid_{hidden_channels}_layers_{num_layers}_pdrop_{p_dropout}".replace("=", "_").replace(" ", "_")

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)

        for epoch in trange(num_epochs):
            loss_train, micro_train, acc_train = train_model(model=model, train_dataset=heterodata_train, optimizer=optimizer, loss_function=loss_function)
            loss_test, micro_test, acc_test = test_model(model, heterodata_test, loss_function=loss_function)

            if loss_train <= min_loss and acc_test >= max_acc:
                min_loss = loss_train
                max_acc = acc_test
                best_model = model
                best_loss_test = loss_test
                best_micro_test = micro_test
                best_acc_test = acc_test
                epoch_convergence = epoch

            time_end = timeit.default_timer()
            elapsed_time = round((time_end - time_start) * 10 ** 0, 3)
            output_list = [database_name, description, hidden_channels, num_layers, p_dropout, loss_function, version, loss_train, micro_train, acc_train, loss_test, micro_test, acc_test, epoch, epoch_convergence, elapsed_time]
            row = pd.Series(output_list, index=df.columns)
            df = df.append(row,ignore_index=True) 

            if verbose:
                logger.info(f'\nLoss (train): {loss_train:.4f}, Loss (test): {loss_test:.4f}, F1 (train): {micro_train:.4f}, F1 (test): {micro_test:.4f}')

            if (epoch >= patience or acc_train > 0.99) and (epoch - epoch_convergence) > 500:
                logger.info(f'Early stopping at epoch {epoch}.')
                break

        logger.info(f'Optimal sol. database {database_name}: \nGNN ver. {version}, lf. {loss_function} \nepoch: {epoch_convergence}/{num_epochs}, loss (test): {best_loss_test:.4f}, f1 (test): {best_micro_test:.4f}, acc (test): {best_acc_test:.4f}')
        
        df.to_csv(f'./csv_objects/training/{model_name}.csv', sep=';', decimal=',', index=False)
        
        with open(f'./pickle_objects/models/{model_name}.pickle', 'wb') as f:
            pickle.dump(best_model, f, pickle.HIGHEST_PROTOCOL)
        return best_loss_test, best_micro_test, best_acc_test, epoch_convergence
    
    except Exception as e:
        logger.error(f'Error training model on heterodata: \n {e}')




In [None]:
heterodata_pbg_train = get_heterograph_pbg_features(tpbg_train, doc_features=None)
heterodata_pbg_test = get_heterograph_pbg_features(tpbg_test, doc_features=None)

df_experiment = run_heterognn_splitted(database_name='20ng-t',
       description='', 
       heterodata_train=heterodata_pbg_train,
       heterodata_test=heterodata_pbg_test,
       hidden_channels=100,
       num_layers=2,
       p_dropout=0.2,
       num_epochs=1000, 
       aggr='sum',
       version=1, 
       loss_function='fl', 
       verbose=True)


In [None]:
def run_upbg_on_dataset(database_name, K, disable_tqdm=True):

    try:
    
        database = load_data(database_name=database_name)
        data_preprocessed = SimplePreprocessing().transform(database.data)
        vectorizer = TfidfVectorizer()
        data_vectorized_fit = vectorizer.fit_transform(data_preprocessed)
        y = database.target
                    
        upbg = PBG(K, alpha=0.005, beta=0.001, local_max_itr=50, global_max_itr=10,
            local_threshold=1e-6, global_threshold=1e-6,
            feature_names=vectorizer.get_feature_names_out())
        
        upbg.fit(data_vectorized_fit)
        logger.info(f'Loaded UPBG for {database_name} with K={K}.')
        
        # doc2vec embeddings
        sentences = [re.findall("[a-z\-]+",s.lower()) for s in data_preprocessed]
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
        doc2vec_model = Doc2Vec(documents, vector_size=400, window=10, min_count=1, workers=4)
        doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
        document_features = doc2vec_model.dv.vectors
        logger.info(f'Computed 400 document features with doc2vec.')

        upbg.document_features = document_features
        upbg.is_train = database.is_train
        upbg.n_class = len(database.target_names)
        upbg.y = y

        # Final embeddings
        upbg_train = UPBG(K)
        upbg_train.log_A = upbg.log_A[database.is_train==1]
        upbg_train.log_B = upbg.log_B
        upbg_train.document_features = upbg.document_features[database.is_train==1]
        upbg_train.Xc = upbg.X[database.is_train==1]
        upbg_train.y = database.target[database.is_train==1]
        upbg_train.n_class = len(database.target_names)

        upbg_test = UPBG(K)
        upbg_test.log_A = upbg.log_A[database.is_train==0]
        upbg_test.log_B = upbg.log_B
        upbg_test.document_features = upbg.document_features[database.is_train==0]
        upbg_test.Xc = upbg.X[database.is_train==0]
        upbg_test.y = database.target[database.is_train==0]
        upbg_test.n_class = len(database.target_names)
        
        return upbg, upbg_train, upbg_test

    except Exception as e:
        logger.info(f'Error fitting UPBG for {database_name}: \n {e}') 

#tpbg, tpbg_train, tpbg_test = run_tpbg_on_dataset('20ng')



#run_upbg_on_dataset('20ng', 100)

upbg, upbg_train, upbg_test = run_upbg_on_dataset('20ng', 100)

In [None]:
def run_heterognn_splitted(database_name,
                           description, 
                           heterodata_train,
                           heterodata_test,
                           hidden_channels,
                           num_layers,
                           p_dropout,
                           num_epochs, 
                           aggr,
                           version, 
                           loss_function, 
                           verbose=False):

    try:

        import warnings
        import timeit 

        warnings.simplefilter(action='ignore', category=FutureWarning)

        time_start = timeit.default_timer()

        model = HeteroGNN(metadata=heterodata_train.metadata(), 
                          hidden_channels=hidden_channels, 
                          out_channels=heterodata_train['source']['num_classes'],
                          num_layers=num_layers,
                          p_dropout=p_dropout,
                          aggr=aggr, 
                          version=version)
        min_loss = np.inf
        max_acc = 0
        patience = int(num_epochs/2)
        epoch_convergence = 0

        output_list = []
        df = pd.DataFrame(columns=['database_name', 'description', 'hidden_channels', 'num_layers', 'p_dropout', 'loss_function', 'version', 'loss_train', 'micro_train', 'acc_train', 'loss_test', 'micro_test', 'acc_test', 'epoch', 'epoch_convergence', 'elapsed_time'])

        model_name = f"model_{database_name}_{description}_hid_{hidden_channels}_layers_{num_layers}_pdrop_{p_dropout}".replace("=", "_").replace(" ", "_")

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)

        for epoch in trange(num_epochs):
            loss_train, micro_train, acc_train = train_model(model=model, train_dataset=heterodata_train, optimizer=optimizer, loss_function=loss_function)
            loss_test, micro_test, acc_test = test_model(model, heterodata_test, loss_function=loss_function)

            if loss_train <= min_loss and acc_test >= max_acc:
                min_loss = loss_train
                max_acc = acc_test
                best_model = model
                best_loss_test = loss_test
                best_micro_test = micro_test
                best_acc_test = acc_test
                epoch_convergence = epoch

            time_end = timeit.default_timer()
            elapsed_time = round((time_end - time_start) * 10 ** 0, 3)
            output_list = [database_name, description, hidden_channels, num_layers, p_dropout, loss_function, version, loss_train, micro_train, acc_train, loss_test, micro_test, acc_test, epoch, epoch_convergence, elapsed_time]
            row = pd.Series(output_list, index=df.columns)
            df = df.append(row,ignore_index=True) 

            if verbose:
                logger.info(f'\nLoss (train): {loss_train:.4f}, Loss (test): {loss_test:.4f}, F1 (train): {micro_train:.4f}, F1 (test): {micro_test:.4f}')

            if (epoch >= patience or acc_train > 0.99) and (epoch - epoch_convergence) > 500:
                logger.info(f'Early stopping at epoch {epoch}.')
                break

        logger.info(f'Optimal sol. database {database_name}: \nGNN ver. {version}, lf. {loss_function} \nepoch: {epoch_convergence}/{num_epochs}, loss (test): {best_loss_test:.4f}, f1 (test): {best_micro_test:.4f}, acc (test): {best_acc_test:.4f}')
        
        df.to_csv(f'./csv_objects/training/{model_name}.csv', sep=';', decimal=',', index=False)
        
        with open(f'./pickle_objects/models/{model_name}.pickle', 'wb') as f:
            pickle.dump(best_model, f, pickle.HIGHEST_PROTOCOL)
        return best_loss_test, best_micro_test, best_acc_test, epoch_convergence
    
    except Exception as e:
        logger.error(f'Error training model on heterodata: \n {e}')




In [None]:
upbg.__dict__.keys()


In [None]:
upbg_rain.__dict__

In [None]:
# heterodata_pbg_train = get_heterograph_pbg_features(upbg_train, doc_features=None)
# heterodata_pbg_test = get_heterograph_pbg_features(upbg_test, doc_features=None)

df_experiment = run_heterognn_splitted(database_name='20ng-t',
       description='', 
       heterodata_train=heterodata_pbg_train,
       heterodata_test=heterodata_pbg_train,
       hidden_channels=200,
       num_layers=2,
       p_dropout=0.2,
       num_epochs=1000, 
       aggr='sum',
       version=1, 
       loss_function='fl', 
       verbose=True)

In [None]:
heterodata_pbg_train

In [None]:
df_experiment

In [None]:

heterodata_pbg_train = get_heterograph_pbg_features(tpbg_train, doc_features='replace')
heterodata_pbg_test = get_heterograph_pbg_features(tpbg_test, doc_features='replace')

df_experiment = run_heterognn_splitted(database_name='20ng-t2',
       description='', 
       heterodata_train=heterodata_pbg_train,
       heterodata_test=heterodata_pbg_test,
       hidden_channels=100,
       num_layers=2,
       p_dropout=0.2,
       num_epochs=200, 
       aggr='sum',
       version=1, 
       loss_function='ce', 
       verbose=False)

print(df_experiment)

In [None]:
df_experiment = run_heterognn_splitted(database_name='20ng-t2',
       description='', 
       heterodata_train=heterodata_pbg_train,
       heterodata_test=heterodata_pbg_test,
       hidden_channels=400,
       num_layers=3,
       p_dropout=0.2,
       num_epochs=2000, 
       aggr='sum',
       version=1, 
       loss_function='ce', 
       verbose=True)

print(df_experiment)