In [None]:
if 'google.colab' in str(get_ipython()):
    from google.colab import userdata
    access_token = userdata.get('DEFORMER_TOKEN')
    !pip install git+https://$access_token@github.com/ay94/deformer-extractor.git@v1.0.3

Collecting git+https://****@github.com/ay94/deformer-extractor.git@v1.0.2
  Cloning https://****@github.com/ay94/deformer-extractor.git (to revision v1.0.2) to /tmp/pip-req-build-hnj0vskj
  Running command git clone --filter=blob:none --quiet 'https://****@github.com/ay94/deformer-extractor.git' /tmp/pip-req-build-hnj0vskj
  Resolved https://****@github.com/ay94/deformer-extractor.git to commit 8eca06f0fd20fb58884bbddb5fc5f19b1b0260cd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting appnope (from experiment_utils==1.0.2)
  Downloading appnope-0.1.4-py2.py3-none-any.whl.metadata (908 bytes)
Collecting arabert (from experiment_utils==1.0.2)
  Downloading arabert-1.0.1-py3-none-any.whl.metadata (16 kB)
Collecting asttokens (from experiment_utils==1.0.2)
  Downloading asttokens-2.4.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting comm (from experiment_utils==1.0.2)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting emoji (from experiment_utils==1.0.2)


In [None]:
from experiment_utils import env_setup
from experiment_utils.utils import  FileHandler
from experiment_utils.pipelines import  ExperimentInitializer

In [None]:
# Initialise the environment; the returned base folder is the root that the
# experiment paths below are joined onto.
base_folder = env_setup.init(
    drive_folder='My Drive',
    drive_mount='drive',
)

Mounted at /content/drive


In [None]:
# Root folder that holds every experiment, relative to the base folder.
experiment_base_folder = (
    base_folder / 'Final Year Experiments/Thesis-Experiments/Experiments'
)

# Load the experiment registry (experiments.yaml) through a file handler
# rooted at that folder.
experiments_fh = FileHandler(experiment_base_folder)
experiments = experiments_fh.load_yaml('experiments.yaml')

In [None]:
# Key of the experiment variant to set up; it is looked up in the registry
# loaded from experiments.yaml.
variant = "ANERCorp_CamelLab_arabertv02"
variant_data = experiments.get(variant)
if variant_data is None:
    # Fail here with a clear message instead of the opaque
    # "'NoneType' object is not subscriptable" raised when the variant's
    # fields are read further down.
    raise KeyError(
        f"Variant {variant!r} is missing or empty in experiments.yaml; "
        f"available variants: {list(experiments)}"
    )


# Top-level experiment description: fixed names, the selected variant with
# the fields copied from its registry entry, and the config file names.
experiment_config = {
    "experiment_name": "BaseLineExperiment",
    "corpora_path": "ExperimentData",
    "variant": variant,
    # Copied verbatim from the variant's entry, in this order.
    **{
        field: variant_data[field]
        for field in ("dataset_name", "model_name", "model_path")
    },
    "results_config": "results_config.yaml",
    "extraction_config": "extraction_config.yaml",
    "fine_tuning_config": "fine_tuning_config.yaml",
}


# Settings for the two stages of the run: "fine_tuning" (training args,
# model options, evaluation) and "extraction" (tokenization, UMAP,
# clustering). The variant's model path is reused for the model, the
# tokenizer and the preprocessor.
extraction_config = dict(
    fine_tuning=dict(
        args=dict(
            train_batch_size=16,
            test_batch_size=8,
            shuffle=True,
            num_workers=2,
            epochs=4,
            splits=4,
            learning_rate=5e-5,
            warmup_ratio=0.1,
            max_grad_norm=1.0,
            accumulation_steps=1,
            logging_step=50,
        ),
        model=dict(
            model_path=variant_data['model_path'],
            dropout_rate=0.1,
            enable_attentions=False,
            enable_hidden_states=False,
            initialize_output_layer=False,
        ),
        evaluation=dict(scheme=None, mode=None),
    ),
    extraction=dict(
        tokenization=dict(
            tokenizer_path=variant_data['model_path'],
            preprocessor_path=variant_data['model_path'],
            max_seq_len=256,
            strategy=dict(type="core", index=0, schema=None),
        ),
        umap=dict(
            n_neighbors=15,
            min_dist=0.1,
            metric="cosine",
            random_state=1,
            verbose=True,
            normalize_embeddings=False,
        ),
        clustering=dict(
            init_method="k-means++",
            n_init=10,
            random_state=1,
            n_clusters=[3, 4, 9],
            # Integer keys cannot be passed as keywords, so this mapping
            # stays a literal. Its keys mirror the n_clusters list above.
            n_clusters_map={
                3: "boundary_clusters",
                4: "entity_clusters",
                9: "token_clusters",
            },
            silhouette_metric="cosine",
            norm="l2",
        ),
    ),
)

# Where each extraction artefact is written. Every artefact entry has the
# same shape: {"folder", "filename", "format"}, with the filename equal to
# the entry's own key, so the entries are generated from a compact
# (name, folder, format) table. Insertion order matches the table order.
results_config = {
    "results_dir": "extractions",
    **{
        artefact: {"folder": folder, "filename": artefact, "format": fmt}
        for artefact, folder, fmt in (
            ("analysis_data", "analysis", "json"),
            ("entity_report", "results", "json"),
            ("token_report", "results", "json"),
            ("results", "results", "json"),
            ("kmeans_results", "results", "json"),
            ("entity_confusion_data", "results", "json"),
            ("attention_weights_similarity_matrix", "matrices", "npy"),
            ("attention_weights_similarity_heatmap", "matrices", "json"),
            ("attention_similarity_matrix", "matrices", "npy"),
            ("attention_similarity_heatmap", "matrices", "json"),
            # NOTE(review): json here, unlike the other *_matrix entries
            # (npy) — confirm this is intended.
            ("centroids_avg_similarity_matrix", "matrices", "json"),
            ("train_data", "analysis", "json"),
        )
    },
}

# Output layout for fine-tuning: a base save directory, the metrics file,
# and the two model artefacts (state dict and binary).
fine_tuning_config = dict(
    save_dir="fine_tuning",  # Base directory for saving outputs
    metrics=dict(
        folder="metrics",
        filename="evaluation_metrics",
        format="json",
    ),
    model=dict(
        state_dict=dict(filename="model_state_dict", format="pth"),
        binary=dict(filename="model_binary", format="bin"),
    ),
)

In [None]:
# Set up the experiment folder from the base path and the four config
# dictionaries (experiment, extraction, results, fine-tuning).
manager = ExperimentInitializer(
    experiment_base_folder,
    experiment_config,
    extraction_config,
    results_config,
    fine_tuning_config,
)
manager.setup_experiment()

In [None]:
# Flush pending writes to Google Drive and unmount it once setup is done.
# Guarded with the same Colab check as the install cell at the top of the
# notebook, so running elsewhere does not fail on the google.colab import.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.flush_and_unmount()