# MolFinDS – Experiments

## 1. Imports

In [None]:
import os
import sys

import numpy as np
import torch
import nglview
from Bio.PDB import *

sys.path.insert(0, '..')

from data.preprocessing_helpers import load_data

## 2. Settings / Configurations

In [None]:
# Filesystem locations used throughout the notebook (relative paths).
DATA_PATH = '../data/'
RAW_PATH = os.path.join(DATA_PATH, 'raw/')


# Per-experiment on/off switches; only experiments flagged True have
# their data loaded.
experiments_to_run = {
    'pocket_classification': True,
    'binding_site_prediction': True,
    'search': True,
}

## 3. Preprocessing

### 3.1. Data Loading

This subsection depends on the `experiments_to_run` dictionary defined in Section 2. For each enabled experiment, it reads the list of structures to load (a newline-separated text file, `full_list.txt`, in the corresponding `data/<experiment>` directory) and runs a script that loads them into `data/raw/<experiment>`. The loaded data is then passed on to the next subsection, which will process it into formats that can be fed into machine learning models.

In [None]:
# Load the raw structures for every enabled experiment, skipping any
# experiment whose raw directory already holds at least as many entries
# as there are lines in its 'full_list.txt'.
for experiment, enabled in experiments_to_run.items():
    if not enabled:
        continue

    # Input list and output directory for this experiment.
    path = os.path.join(DATA_PATH, experiment)
    in_file = os.path.join(path, 'full_list.txt')
    out_dir = os.path.join(RAW_PATH, experiment)

    # Check if data has been loaded already, to avoid unnecessary loading.
    with open(in_file) as f:
        num_ids = len(f.readlines())

    # A missing output directory means nothing has been loaded yet; treat it
    # as empty instead of letting os.listdir raise FileNotFoundError.
    num_loaded = len(os.listdir(out_dir)) if os.path.isdir(out_dir) else 0

    if num_loaded >= num_ids:
        print(num_loaded)
        print(out_dir)
        print('The data for this experiment ' + experiment + ' has been loaded already. Moving on...')
        continue

    # Otherwise load the data.
    print(in_file, out_dir)
    load_data(path, in_file, out_dir)

### 3.2. Exploratory Data Analysis

In [None]:
# Visualize a molecule: parse one raw structure file and inspect an atom.
parser = PDBParser()
pocket_dir = os.path.join(RAW_PATH, 'pocket_classification')
# os.listdir returns entries in arbitrary order; sort them so that the
# "first file" (and everything derived from it) is the same on every run.
pdb_filenames = sorted(os.listdir(pocket_dir))
pdb_files = [os.path.join(pocket_dir, pdb_filename) for pdb_filename in pdb_filenames]
print('first file: ' + pdb_files[0])

# Parse the sample file; 'x' is just the id given to the parsed structure.
structure = parser.get_structure('x', pdb_files[0])
# Quick sanity check: print the vector of the atom at index 5.
print(list(structure.get_atoms())[5].get_vector())

In [None]:
# Build a mesh for one sample protein, using the first entry of pdb_files.
# NOTE(review): the original note says the mesh is built with graphein; the
# helper's implementation is not visible from this notebook -- confirm.
from data.preprocessing_helpers import build_mesh_from_pdb_file

sample_pdb_file = pdb_files[0]
mesh = build_mesh_from_pdb_file(sample_pdb_file)

## 4. Initialize Models

## 5. Train Models

## 6. Test Models

## 7. Results