# Conversion and Organisation of Structure Files
- RNA Ligand Database
- Author: Akshay Shirsath

CONTENT
1. Converting mol files to sdf
2. Reorganising sdf & mol files


In [None]:
import os
from tqdm import tqdm
import numpy as np

## Converting .mol and .mol2 files to .sdf files

### Google Drive path for the sdf and mol compounds

In [None]:
# Source directory holding the mixed .mol/.mol2/.pdb/.sdf structure files.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be
# mounted at /content/drive before this notebook is run; confirm.
path = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/all"

### Getting the number of files

In [None]:
# Count how many files of each type (.mol, .mol2, .pdb, .sdf, ...) are in `path`.

# os.path.splitext keeps only the final suffix, so names with extra dots
# (e.g. "ligand.v2.sdf") are classified by their real extension, and names
# without any dot yield "" instead of raising IndexError.
extensions = [os.path.splitext(file)[1].lstrip('.') for file in os.listdir(path)]

# np.unique returns the distinct extensions (sorted) and how often each occurs.
files_type, files_count = np.unique(extensions, return_counts=True)

print("\tType\t|\tCount\n")
print(f"\t--------|------------\n")
# One row per extension actually present, rather than assuming exactly four.
for file_type, file_count in zip(files_type, files_count):
    print(f"\t{file_type}\t|\t{file_count}\n")

	Type	|	Count

	--------|------------

	mol	|	41

	mol2	|	80

	pdb	|	3

	sdf	|	1074



### Segregating files into different directories based on their extensions

#### .pdb file

In [None]:
## Listing the first four files in the source directory
## (-U asks ls not to sort, so entries appear in directory order)

!ls /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/all -U | head -4

Cibacronblue.sdf
Structure2D_CID_25863.sdf
Conformer3D_CID_25863.sdf
Structure2D_CID_7099.sdf


In [None]:
## Bash/Linux command for copying all .pdb files into the archive directory ##
## (cp copies rather than moves: the originals stay in `all/`) ##

! cp -r /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/all/*.pdb /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/Final_Structure_Files/archived/all_pdb/

#### .mol file

In [None]:
## Bash/Linux command for copying all .mol files into the archive directory ##
## (cp copies rather than moves: the originals stay in `all/`) ##
! cp -r /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/all/*.mol /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/Final_Structure_Files/archived/all_mol/

#### .mol2

In [None]:
## Bash/Linux command for copying all .mol2 files into the archive directory ##
## (cp copies rather than moves: the originals stay in `all/`) ##
! cp -r /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/all/*.mol2 /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/Final_Structure_Files/archived/all_mol2/

#### .sdf

In [None]:
## Bash/Linux command for copying all .sdf files into the archive directory ##
## (cp copies rather than moves: the originals stay in `all/`) ##
! cp -r /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/all/*.sdf /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/Final_Structure_Files/archived/all_sdfs/

#### Summary

In [None]:
# Compare the per-extension file counts in the original directory with the
# number of files that ended up in each per-extension archive directory.
path_mol = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/Final_Structure_Files/archived/all_mol"
path_pdb = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/Final_Structure_Files/archived/all_pdb"
path_mol2 = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/Final_Structure_Files/archived/all_mol2"
path_sdfs = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/Final_Structure_Files/archived/all_sdfs"

move_location_sdf = len(os.listdir(path_sdfs))
move_location_pdb = len(os.listdir(path_pdb))
move_location_mol = len(os.listdir(path_mol))
move_location_mol2 = len(os.listdir(path_mol2))

# Look the old counts up by extension name instead of by position in
# `files_type`: positional pairing silently mislabels rows whenever the source
# directory contains a different set (or number) of extensions.
old_counts = {str(ext): int(count) for ext, count in zip(files_type, files_count)}
new_counts = {
    "mol": move_location_mol,
    "mol2": move_location_mol2,
    "pdb": move_location_pdb,
    "sdf": move_location_sdf,
}

print("\tFILES in| OLD DIRECTORY\t| NEW DIRECTORY\n")
print(f"\t--------|---------------|------------\n")
for ext, new_count in new_counts.items():
    # An extension missing from the source directory is reported as 0.
    print(f"\t{ext}\t|\t{old_counts.get(ext, 0)}\t|\t{new_count}\n")

	FILES in| OLD DIRECTORY	| NEW DIRECTORY

	--------|---------------|------------

	mol	|	41	|	41

	mol2	|	80	|	80

	pdb	|	3	|	3

	sdf	|	1074	|	1074



## File conversion with the Open Babel Python bindings

In [None]:
# Download the Miniconda3 (py37, 4.8.2) installer, mark it executable and run it
# with /usr/local as the install prefix.
# NOTE(review): -b / -f are presumably the installer's batch-mode and
# "prefix may already exist" flags — confirm against the installer's --help.
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
import sys
# Add the Miniconda site-packages directory to the import search path so that
# packages installed there can be imported from this kernel.
# NOTE(review): the path hard-codes python3.7 — confirm it matches the kernel's
# Python version.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
!conda install -c conda-forge openbabel

In [53]:
from openbabel import openbabel
import os
from tqdm import tqdm

In [50]:
def Mol2ToSDF(mol2_file, directory_path, store_path):
    """Convert one structure file (.mol, .mol2, ...) to .sdf with Open Babel.

    The input format is taken from the file's own extension, so the same
    function can be used for both the .mol and the .mol2 directories.

    Parameters
    ----------
    mol2_file : str
        File name (not a full path) of the input structure inside
        ``directory_path``.
    directory_path : str
        Directory that contains ``mol2_file``.
    store_path : str
        Directory in which ``<stem>.sdf`` is written.

    Returns
    -------
    None
        Failures are reported with a printed message instead of an exception
        so that a batch loop over many files keeps going.
    """
    # splitext only strips the final suffix, so stems that contain dots
    # (e.g. "lig.v2.mol") are not truncated to "lig".
    file_name, extension = os.path.splitext(mol2_file)

    # Use the reader that matches the file; fall back to the historical
    # "mol2" default when the name has no extension.
    input_format = extension.lstrip(".").lower() or "mol2"

    complete_path = f"{directory_path}/{mol2_file}"

    obConversion = openbabel.OBConversion()
    if not obConversion.SetInAndOutFormats(input_format, "sdf"):
        print(f"ERROR in Conversion!. Unsupported format '{input_format}'. Check {file_name} file.")
        return

    mol = openbabel.OBMol()

    # Do not write an output file for a molecule that could not be read.
    if not obConversion.ReadFile(mol, complete_path):
        print(f"ERROR in Conversion!. Could not read {complete_path}. Check {file_name} file.")
        return

    status = obConversion.WriteFile(mol, f'{store_path}/{file_name}.sdf')

    if not status:
        print(f"ERROR in Conversion!. Check {file_name} file.")

In [49]:
# Directory that receives the .sdf files written by the conversion loops below.
# NOTE(review): a later cell re-assigns store_directory_path to
# ".../all_converted_mol_to_sdf_structure_files" before counting the results,
# so on a fresh top-to-bottom run the files are written here but counted
# there — confirm which directory is intended.
store_directory_path = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/Final_Structure_Files/all_converted_structure_files"

In [47]:
# Report how many entries the archived .mol directory contains.
mol_file_names = os.listdir(path_mol)
print(f"Number of Mol Files: {len(mol_file_names)}")

Number of Mol Files: 41


In [52]:
# Convert every entry of the archived .mol directory, with a progress bar.
mol_file_names = os.listdir(path_mol)
for mol_file_name in tqdm(mol_file_names):
    Mol2ToSDF(
        mol2_file=mol_file_name,
        directory_path=path_mol,
        store_path=store_directory_path,
    )

100%|██████████| 41/41 [00:00<00:00, 54.26it/s]


In [54]:
# Convert every entry of the archived .mol2 directory, with a progress bar.
mol2_file_names = os.listdir(path_mol2)
for mol2_file_name in tqdm(mol2_file_names):
    Mol2ToSDF(
        mol2_file=mol2_file_name,
        directory_path=path_mol2,
        store_path=store_directory_path,
    )

100%|██████████| 80/80 [00:02<00:00, 34.26it/s]


In [57]:
# Copy the archived original .sdf files into all_previous_sdf_structure_files/.
! cp /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/Final_Structure_Files/archived/all_sdfs/*.sdf /content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand\ Database/Final_Structure_Files/all_previous_sdf_structure_files/

In [58]:
# Re-point store_directory_path at the directory whose contents are counted next.
# NOTE(review): this differs from the directory the conversion loops above
# wrote to (".../all_converted_structure_files") — confirm the converted files
# really live here, e.g. after a rename on Drive.
store_directory_path = r"/content/drive/MyDrive/current_work/work_with_aniket/RNA-Ligand Database/Final_Structure_Files/all_converted_mol_to_sdf_structure_files"

In [59]:
# Number of entries in the converted-files directory; 121 would match the
# 41 .mol + 80 .mol2 inputs reported earlier in this notebook.
len(os.listdir(store_directory_path))

121