In [1]:
import pandas as pd

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# folder referenced in the next cell is reachable (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
FOLDERNAME = '/content/drive/MyDrive/Polymer-TL-Datasets'
assert FOLDERNAME is not None, "[!] Enter the foldername."

import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd /$FOLDERNAME/

%ls /$FOLDERNAME

/content/drive/MyDrive/Polymer-TL-Datasets
mtl_khazana.csv  vipea.csv


In [5]:
# import packages
import numpy as np
# Print NumPy arrays in full instead of the summarized "..." form.
# NOTE: `sys` is not imported in this cell; it comes from the `import sys`
# in the earlier path-setup cell, so that cell must run first.
np.set_printoptions(threshold=sys.maxsize)

# RDKit entry point used below for SMILES parsing and Morgan fingerprints
from rdkit.Chem import AllChem

# define function that transforms SMILES strings into ECFPs
def ECFP_from_smiles(smiles,
                     R = 3,
                     L = 2**10,
                     use_features = False,
                     use_chirality = False):
    """
    Compute the Morgan/ECFP bit-vector fingerprint of a SMILES string.

    Inputs:

    - smiles ... SMILES string of input compound
    - R ... maximum radius of circular substructures
    - L ... fingerprint-length
    - use_features ... if false then use standard DAYLIGHT atom features, if true then use pharmacophoric atom features
    - use_chirality ... if true then append tetrahedral chirality flags to atom features

    Outputs:
    - np.array(feature_list) ... ECFP with length L and maximum radius R

    Raises:
    - ValueError ... if RDKit cannot parse `smiles` into a molecule
    """

    molecule = AllChem.MolFromSmiles(smiles)

    # MolFromSmiles signals a parse failure by returning None rather than
    # raising. Fail here with the offending string instead of letting the
    # fingerprint call below die with an opaque argument error.
    if molecule is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(smiles))

    feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                         radius = R,
                                                         nBits = L,
                                                         useFeatures = use_features,
                                                         useChirality = use_chirality)
    return np.array(feature_list)

In [6]:
copoly = pd.read_csv("./vipea.csv")

# get schema of the copoly dataframe
print(copoly.dtypes)

# descriptor columns that are carried over unchanged into every long-format row
COPOLYMER_ID_COLUMNS = ['monoA', 'monoB', 'fracA', 'fracB', 'chain_arch']


def _property_rows(frame, source_column, label):
    """
    Build the long-format rows for one measured property.

    Inputs:
    - frame ... dataframe holding COPOLYMER_ID_COLUMNS and `source_column`
    - source_column ... name of the column with the measured values
    - label ... string written into the new 'property' column

    Outputs:
    - new dataframe with COPOLYMER_ID_COLUMNS, then 'value' (the renamed
      `source_column`), then 'property' (constant `label`)
    """
    return (frame[COPOLYMER_ID_COLUMNS + [source_column]]
            .rename(columns = {source_column: 'value'})
            .assign(property = label))


# 'EA (eV)' column -> 'value', tagged with property = 'EA'
# (variable names keep their original spelling in case other cells refer to them)
copoy_filtered_ea = _property_rows(copoly, 'EA (eV)', 'EA')

# 'IP (eV)' column -> 'value', tagged with property = 'IP'
copoy_filtered_ip = _property_rows(copoly, 'IP (eV)', 'IP')

# stack the EA rows on top of the IP rows; ignore_index gives the result a
# fresh 0..n-1 index instead of repeating every original row label twice
copolymers = pd.concat([copoy_filtered_ea, copoy_filtered_ip], axis = 0, ignore_index = True)


monoA              object
monoB              object
fracA             float64
fracB             float64
chain_arch         object
EA (eV)           float64
IP (eV)           float64
EA vs SHE (eV)    float64
IP vs SHE (eV)    float64
dtype: object


In [7]:
# dump dataframe to file

def dump_to_csv(df, name):
    """
    Write a dataframe as a zipped CSV.

    Inputs:
    - df ... dataframe to serialize (the index is not written)
    - name ... path stem; the archive is written to `name + '.zip'` and the
               CSV stored inside it is called `name + '.csv'`

    Outputs:
    - None (the archive is created as a side effect)
    """
    csv_member = name + '.csv'
    zip_path = name + '.zip'
    zip_settings = {'method': 'zip', 'archive_name': csv_member}
    df.to_csv(zip_path, index=False, compression=zip_settings)



In [8]:
# get copolymers dataframe schema
copolymers.dtypes # monoA, monoB, fracA, fracB, chain_arch, value, property

# get size of filtered copoly
copolymers.shape # (85932, 7)

copolymers['fracA'].unique() # [0.5  0.75 0.25]
copolymers['chain_arch'].unique() # ['alternating' 'block' 'random']
copolymers['property'].unique() # ['EA' 'IP']

# get list of all distinct monomer SMILES appearing as monoA or monoB.
# The two per-column unique lists overlap (691 entries vs 683 distinct), so
# the concatenation is de-duplicated; pd.unique keeps first-occurrence order,
# which is the same key order the fingerprint dict had before.
unique_copolymers = pd.unique(np.concatenate((copolymers['monoA'].unique(),
                                              copolymers['monoB'].unique())))

# get number of distinct monomers
unique_copolymers.shape # (683,)

# create monomer-fingerprint bit vector mapping (one fingerprint per distinct SMILES)
cp_fp_dict = {monomer: ECFP_from_smiles(monomer) for monomer in unique_copolymers}

# size of cp_fp_dict
len(cp_fp_dict) # 683

683

In [9]:
# a1 = ECFP_from_smiles("*c1cc(F)c(*)cc1F", R = 3, L = 2**10)
# a2 = ECFP_from_smiles("*c1c(O)cc(O)c(*)c1O", R = 3, L = 2**10)
# fps = a1 * 0.5 + a2 * 0.5

# print(ECFP_from_smiles("*c1cc(F)c(*)cc1F"))
# print(AllChem.GetMorganFingerprint(AllChem.MolFromSmiles("*c1cc(F)c(*)cc1F"), radius = 3))
# print(AllChem.GetMorganFingerprint(AllChem.MolFromSmiles("*c1c(O)cc(O)c(*)c1O"), radius = 3).GetNonzeroElements())
# print(np.array(AllChem.RDKFingerprint(AllChem.MolFromSmiles("*c1cc(F)c(*)cc1F"))))

In [10]:
# homopolymer data: load, keep two properties, rename the SMILES column

homopoly = pd.read_csv("mtl_khazana.csv")

# property labels present in the file
homopoly['property'].unique() # ['Eat' 'Xc' 'Egc' 'Egb' 'Eea' 'Ei' 'nc' 'eps']

# rows whose property is Eea or Ei
# (presumably the counterparts of the copolymer 'EA' / 'IP' labels -- TODO confirm)
is_target_property = homopoly['property'].isin(['Eea', 'Ei'])

# keep those rows, restricted to the smiles, property and value columns
homopoly_filtered = homopoly.loc[is_target_property, ['smiles', 'property', 'value']]

# the SMILES column is called 'mono' from here on
homopolymers = homopoly_filtered.rename(columns = {'smiles': 'mono'})

homopolymers.shape # (738, 3)

(738, 3)

In [17]:
# get list of all unique monomers in homopolymers

unique_homopolymers = homopolymers['mono'].unique()

# get number of unique homopolymer monomers
unique_homopolymers.shape # (370,)

# create homopolymer-fingerprint bit vector mapping
hp_fp_dict = {monomer: ECFP_from_smiles(monomer) for monomer in unique_homopolymers}

# merge both mappings into a NEW dict so hp_fp_dict keeps holding only the
# homopolymer entries; on a shared key the cp_fp_dict entry wins and the key
# keeps its hp_fp_dict position, exactly as dict.update would do
poly_fp_dict = {**hp_fp_dict, **cp_fp_dict}

# size of poly_fp_dict
len(poly_fp_dict) # 1053

# create pandas dataframe from poly_fp_dict: one row per monomer, the SMILES
# key in a leading 'mono' column followed by one column per fingerprint bit
poly_fp_df = (pd.DataFrame.from_dict(poly_fp_dict, orient='index')
              .rename_axis('mono')
              .reset_index())

# show the first 10 rows of poly_fp_df
poly_fp_df.head(10)

Unnamed: 0,mono,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,[*]CC([*])C,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[*]CC([*])F,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[*]CC([*])(F)F,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[*]C(F)C([*])(F)F,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[*]CCC(F)(F)C([*])(F)F,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,[*]CO[*],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,[*]CCO[*],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,[*]CC(O[*])C(F)(F)F,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,[*]CCCO[*],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,[*]Oc1ccc(Cc2ccc([*])cc2)cc1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Write every prepared dataset to its own zipped CSV, in this order:
#   copolymers                   -> the copolymer dataset
#   polymer_fingerprint_mappings -> monomer fingerprint bit vector mappings
#   homopolymers                 -> the homopolymer dataset
datasets_to_dump = (
    ('copolymers', copolymers),
    ('polymer_fingerprint_mappings', poly_fp_df),
    ('homopolymers', homopolymers),
)

for archive_stem, frame in datasets_to_dump:
    dump_to_csv(frame, archive_stem)
