In [1]:
# Import all libraries

import numpy as np
from sklearn.cluster import DBSCAN
import pandas as pd
from barmccandidate import BARMCCandidate
from utilities import HB_truth, getCoordinates, find_key_by_value, printConvoys, extractInterestingMoleculesPerTimeframe, filterDF
from hyperparameters import Hyperparameters
import time
import copy

In [4]:
class BARMCMiner(object):
    """BARMC miner: tracks hydrogen-bonded atom clusters across timeframes.

    For every timeframe (column) the miner keeps only the atoms belonging to
    the molecules reported by ``extractInterestingMoleculesPerTimeframe``,
    clusters them with ``clf`` and then either extends the candidates carried
    over from earlier timeframes or starts new ones. A cluster only counts
    when ``HBCheck`` finds an (hn, n, o) triple in it accepted by ``HB_truth``.

    Attributes:
        clf: Clustering estimator exposing ``fit_predict(values, y=...)``.
        k (int):  Min number of consecutive timestamps to be considered a BARMC
        m (int):  Min number of elements to be considered a BARMC
        t1 (int): Max allowed time gap
        t2 (float): Percentage of total time gaps in a relaxed BARMC
        data: Per-timeframe container; ``data[column]`` is handed to
            ``getCoordinates`` and ``extractInterestingMoleculesPerTimeframe``.
        threshold: Forwarded to ``extractInterestingMoleculesPerTimeframe``.
        df: DataFrame with a ``subst_id`` column; its index supplies the atom
            indices used throughout.
        atomType: Lookup handed to ``find_key_by_value`` to classify an atom
            index as ``"n"``, ``"hn"`` or ``"o"``.
        nitrogens_and_oxygens_indices: Forwarded to
            ``extractInterestingMoleculesPerTimeframe``.
    """
    def __init__(self, clf, k, m, t1, t2, data, threshold, df, atomType, nitrogens_and_oxygens_indices):
        self.clf = clf
        self.k = k
        self.m = m
        self.t1 = t1
        self.t2 = t2
        self.data = data
        self.threshold = threshold
        self.df = df
        self.atomType = atomType
        self.nitrogens_and_oxygens_indices = nitrogens_and_oxygens_indices

    def _groupAtomsByType(self, indices):
        """Bucket atom indices into "n"/"hn"/"o" lists; unclassified atoms are dropped."""
        atomsIHave = {
            "n": [],
            "hn": [],
            "o": []
        }
        for index in indices:
            kind = find_key_by_value(index, self.atomType)
            if kind is not None:
                atomsIHave[kind].append(index)
        return atomsIHave

    def HBCheck(self, atomsIHave, column):
        """Search the given atoms for one (hn, n, o) triple accepted by ``HB_truth``.

        Args:
            atomsIHave: dict with keys ``"hn"``, ``"n"`` and ``"o"``, each a
                list of atom indices.
            column: Timeframe index into ``self.data``.

        Returns:
            ``(True, [hn, n, o])`` for the first accepted triple (iterating
            hn, then n, then o), otherwise ``(False, None)``.
        """
        hn_atoms = atomsIHave["hn"]
        n_atoms = atomsIHave["n"]
        o_atoms = atomsIHave["o"]
        # A triple needs one atom of every kind; bail out before any lookup.
        if not (hn_atoms and n_atoms and o_atoms):
            return False, None

        frame = self.data[column]
        # Look every coordinate up once instead of once per inner iteration.
        n_coords = [(n, getCoordinates(frame, n)) for n in n_atoms]
        o_coords = [(o, getCoordinates(frame, o)) for o in o_atoms]
        for hn in hn_atoms:
            coord_hn = getCoordinates(frame, hn)
            for n, coord_n in n_coords:
                for o, coord_o in o_coords:
                    if HB_truth(coord_hn, coord_n, coord_o):
                        return True, [hn, n, o]

        return False, None

    def createClusters(self, X, y, column, indicesOfFilteredData):
        """Cluster the selected atoms of one timeframe.

        Args:
            X: Sequence of rows; ``row[column]`` is the feature value (wrapped
                in a list when it is not already a list or set).
            y: Passed through to ``clf.fit_predict``.
            column: Timeframe index.
            indicesOfFilteredData: Row positions of ``X`` to keep.

        Returns:
            ``(1, 0, 0, 0)`` when fewer than ``m`` rows remain, otherwise
            ``(0, labels, set(labels), {label: empty BARMCCandidate})``.
        """
        values = [row[column] if isinstance(row[column], (list, set)) else [row[column]] for row in X]
        values = np.array(values)[indicesOfFilteredData]
        if len(values) < self.m:
            return 1, 0, 0, 0
        clusters = self.clf.fit_predict(values, y=y)
        unique_clusters = set(clusters)
        # BARMCCandidate is the candidate class this file imports.
        clusters_indices = {
            cluster: BARMCCandidate(indices=set(), is_assigned=False, start_time=None, end_time=None)
            for cluster in unique_clusters
        }

        return 0, clusters, unique_clusters, clusters_indices

    def checkBAGs(self, atoms, candidate):
        """Return True when ``atoms`` and ``candidate`` share the same first three entries.

        ``atoms`` may be ``None`` (no hydrogen bond found), which yields False.
        """
        if atoms is None:
            return False
        return (atoms[0] == candidate[0] and atoms[1] == candidate[1] and atoms[2] == candidate[2])

    def fit_predict(self, X, y=None):
        """Mine BARMCs over all timeframes.

        Args:
            X: Sequence of rows, one per atom; ``len(X[0])`` is the number of
                timeframes and ``row[column]`` the value clustered per frame.
            y: Passed through to the clustering estimator.

        Returns:
            Set of deep-copied candidates whose ``totalLength`` is at least
            ``k`` and whose ``totalGaps`` is at most ``t2``.
        """
        BAG_candidates = set()
        columns = len(X[0])
        BAGs = set()

        for column in range(columns):

            results = extractInterestingMoleculesPerTimeframe(self.data[column], self.nitrogens_and_oxygens_indices, self.threshold)

            # Nothing to cluster in this frame. Decide this *before* touching
            # indicesOfFilteredData, which only exists for non-empty frames.
            # NOTE(review): a skipped frame leaves the candidates' gap counters
            # untouched - confirm that is the intended semantics.
            if len(results) == 0:
                continue
            # Molecule ids appearing in either of the first two result columns.
            reducedMolecules = np.unique(np.array(results)[:, :2])
            if len(reducedMolecules) == 0:
                continue

            # Positions of the retained atoms in df; position i in the
            # clustering input maps back to atom indicesOfFilteredData[i].
            indicesOfFilteredData = np.sort(self.df[self.df["subst_id"].isin(reducedMolecules)].reset_index()["index"].values)

            enough_objects, clusters, unique_clusters, clusters_indices = self.createClusters(X, y, column, indicesOfFilteredData)

            if enough_objects == 1:
                continue

            # NOTE(review): every label is treated as a cluster; with DBSCAN
            # that includes the noise label -1 - confirm this is wanted.
            for index, cluster_assignment in enumerate(clusters):
                clusters_indices[cluster_assignment].indices.add(indicesOfFilteredData[index])

            # update existing convoys
            current_BAG_candidates = set()
            for BAG_cand in BAG_candidates:
                BAG_cand.is_assigned = False
                for cluster in unique_clusters:
                    cluster_candidate_intersection = clusters_indices[cluster].indices & BAG_cand.indices
                    # Cheap size test first; the hydrogen-bond search is the costly part.
                    if len(cluster_candidate_intersection) < self.m:
                        continue

                    atomsIHave = self._groupAtomsByType(cluster_candidate_intersection)
                    HB, HBAtoms = self.HBCheck(atomsIHave, column)
                    if HB and self.checkBAGs(HBAtoms, BAG_cand.atoms):
                        BAG_cand.atoms = HBAtoms
                        BAG_cand.indices = cluster_candidate_intersection
                        BAG_cand.end_time = column
                        BAG_cand.gap = 0
                        clusters_indices[cluster].is_assigned = True
                        BAG_cand.is_assigned = True
                        break

                if not BAG_cand.is_assigned:
                    # gap = current run of misses, gaps = misses over the whole lifetime
                    BAG_cand.gap += 1
                    BAG_cand.gaps += 1

                if BAG_cand.is_assigned or BAG_cand.gap <= self.t1:
                    current_BAG_candidates.add(copy.deepcopy(BAG_cand))
                elif BAG_cand.totalLength >= self.k:
                    # The candidate just died: the t1 + 1 trailing misses that
                    # killed it are not part of it, so take them off again.
                    BAG_cand.gaps = BAG_cand.gaps - (self.t1 + 1)
                    if BAG_cand.totalGaps <= self.t2:
                        BAGs.add(copy.deepcopy(BAG_cand))

            # create new candidates
            for cluster in unique_clusters:
                cluster_data = clusters_indices[cluster]

                if cluster_data.is_assigned: #cluster is not a new cluster
                    continue
                atomsIHave = self._groupAtomsByType(cluster_data.indices)
                HB, HBAtoms = self.HBCheck(atomsIHave, column)
                if not HB: #if HB not found
                    continue
                cluster_data.start_time = column
                cluster_data.end_time = column
                cluster_data.gap = 0
                cluster_data.gaps = 0
                cluster_data.atoms = HBAtoms
                current_BAG_candidates.add(copy.deepcopy(cluster_data))

            BAG_candidates = current_BAG_candidates

        # Flush the candidates still alive at the end. Done after the loop so
        # they are reported even when the final timeframe itself was skipped.
        for BAG_cand in BAG_candidates:
            if BAG_cand.totalLength >= self.k and BAG_cand.totalGaps <= self.t2:
                BAGs.add(copy.deepcopy(BAG_cand))
        return BAGs

In [1]:
# Experiment driver: run BARMCMiner over a grid of k, m and data sizes,
# printing the wall-clock time and the convoys found for each combination.

# Read the array from disk once; every run below works on a slice of it.
fullData = np.load(r"..\data\dataFile.npy")

fileLocation = r"..\data\allAtomsWithRings.csv" #location for information of atoms
df = pd.read_csv(fileLocation)


nitrogens_and_oxygens_indices = filterDF(df)

# The atom-type lookup depends only on df, so build it once up front.
# get indices of atoms with atom_type 'n' / 'hn' and subst_id between 1-4
n_atoms = df[(df['atom_type'] == 'n') & (df['subst_id'] >= 1) & (df['subst_id'] <= 4)].index.tolist()
hn_atoms = df[(df['atom_type'] == 'hn') & (df['subst_id'] >= 1) & (df['subst_id'] <= 4)].index.tolist()

# get indices of atoms with atom_type 'o' or 'os' and subst_id between 5-14
o_atoms = df[((df['atom_type'] == 'o') | (df['atom_type'] == 'os')) & (df['subst_id'] >= 5) & (df['subst_id'] <= 14)].index.tolist()

atomType = {
    "n": n_atoms,
    "hn": hn_atoms,
    "o": o_atoms
}

dataSize = [50, 100]
Ks = np.arange(0, 10, 1)
Ms = [25, 50]

for varyk in Ks:
    for varyM in Ms:
        for size in dataSize:
            # Min elements for convoy = m
            # Min consecutive timesteps = k
            hyperParameters = Hyperparameters(m = varyM, k = varyk, epsilon = 3.5, t1 = 2,  t2 = 0.25, threshold=4)

            # NOTE(review): the label shows varyk + 1 while the miner receives
            # k = varyk (so the first run uses k = 0) - confirm which is intended.
            print(f"K {varyk + 1}, M {varyM}, S {size}")

            # Keep only the first `size` entries along axis 0.
            data = fullData[0:size]

            # One entry per index along axis 1, holding the first three
            # components of every retained axis-0 entry.
            transposedData = [data[:, x, 0:3].tolist() for x in range(data.shape[1])]

            totalTime = 0
            t1 = time.time()  # wall-clock start; unrelated to the t1 hyperparameter
            clustering_clf = DBSCAN(eps=hyperParameters.epsilon)

            clf = BARMCMiner(clustering_clf, k=hyperParameters.k, m=hyperParameters.m, t1=hyperParameters.t1, t2 = hyperParameters.t2, data=data, threshold=hyperParameters.threshold, df=df, atomType=atomType, nitrogens_and_oxygens_indices=nitrogens_and_oxygens_indices)

            convoys = clf.fit_predict(transposedData)

            sorted_list = sorted(convoys, key=lambda x: x.start_time)
            totalTime += time.time() - t1

            print(totalTime)
            printConvoys(sorted_list)