# **_Molecular Framework Results Merging_**


## Environment Setup and Package Imports:
        !conda activate env-framework
        Note: create this environment beforehand from the provided file framework.yml (e.g. `conda env create -f framework.yml`).

In [None]:
import sys
import time
import glob, os

import pandas as pd
from tqdm import tqdm 
import pickle
import numpy as np

#RDKit:
from rdkit.Chem import AllChem
from rdkit.Chem import rdchem
from rdkit import Chem
from rdkit.Chem.rdmolops import *
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolsToGridImage

### 1. Collect all the Pickle Files in the Results Folder:

In [None]:
# Target directory (contains all 'filename-sorted-without-ROMol.pickle' files).
directory = os.path.join(os.getcwd(), "results")

# NOTE(security): unpickling can execute arbitrary code stored in the file.
# Only merge pickle files that were produced by this pipeline itself.

# Sorted so the merge order does not depend on the arbitrary order in which
# os.listdir() happens to return the directory entries.
pickle_names = sorted(
    name for name in os.listdir(directory) if name.endswith(".pickle")
)

# Fail here with a clear message instead of leaving `df` undefined and
# hitting a NameError in a later cell.
if not pickle_names:
    raise FileNotFoundError("No .pickle files found in: " + directory)

# Load every partial result once. The paths are built from `directory` so the
# listing and the reads always refer to the same folder.
frames = [pd.read_pickle(os.path.join(directory, name)) for name in pickle_names]

# Number of files merged and total number of rows read across all of them.
iteration = len(frames)
row_count = sum(len(frame) for frame in frames)

# A single concat replaces the repeated DataFrame.append() calls: append() was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
# ignore_index=True matches the original call, whose second positional
# argument (2) was interpreted as a truthy ignore_index.
df = pd.concat(frames, ignore_index=True)

print("Total enteries that have been merged: \t" + str(row_count))


### 2. Group the Molecules by Identical Framework:

In [None]:
# Collapse rows that share the same framework into a single row whose
# Frequency is the sum of the frequencies of the merged rows.
df_result = df.groupby('Framework', as_index=False)['Frequency'].sum()

In [None]:
# Count the grouped rows once and report the result.
# NOTE(review): the printed value is one less than the number of rows in
# df_result; presumably one group (e.g. an empty framework string) is not
# meant to be counted -- confirm that this offset is intended.
framework_rows = len(df_result)
print("The number of frameworks obtained is:", framework_rows - 1)

### 3. Sort the Data According to the Frequency:

In [None]:
# The table already carries a Frequency column, so order the frameworks
# from the most frequent to the least frequent.
df_result = df_result.sort_values('Frequency', ascending=False)

In [None]:
# Sorting keeps the old row labels; renumber the rows 0..n-1 and discard the
# previous labels instead of keeping them as an extra column.
df_result = df_result.reset_index(drop=True)

# Bare expression: shows the table as the output of this notebook cell.
df_result

In [None]:
# Persist the merged, frequency-sorted table in the current working directory.
merged_path = "results_merged.pickle"
with open(merged_path, mode="wb") as pickle_file:
    pickle.dump(df_result, pickle_file)

### 4. Display the Frameworks:

In [None]:
# For databases that are too large to display in full, keep only the
# 10000 most frequent frameworks (the table is sorted by Frequency above).
# .copy() detaches the subset from the full table: head() returns a slice,
# and adding the ROMol column to a slice afterwards triggers pandas'
# SettingWithCopyWarning. An explicit copy makes the later write unambiguous.
df_result = df_result.head(10000).copy()

In [None]:
# Add ROMol to each framework:
# The SMILES strings are read from the "Framework" column and df_result is
# modified in place (the call's return value is not used).
# NOTE(review): the new column is presumably named "ROMol" (the RDKit
# default) -- confirm against the installed RDKit version.
PandasTools.AddMoleculeColumnToFrame(df_result, smilesCol = "Framework")
# Preview the first three rows as this cell's output.
df_result.head(3)

# If the ROMols already have been added before, to display them again:
# PandasTools.RenderImagesInAllDataFrames(images=True)

In [None]:
# Write the results (all, or the top 10000 most frequent frameworks) to an
# .html file that can be opened in a browser.

# Render first, so that a failure in to_html() does not leave an empty or
# truncated output file behind.
h = df_result.to_html()

# The context manager closes the file even if the write fails. The explicit
# encoding keeps the output independent of the platform's default encoding.
with open('top-10000-results-with-ROMol.html', 'w', encoding='utf-8') as fmolport:
    fmolport.write(h)