# Identify monomers in donor polymers by SMARTS

In [1]:
from rdkit import Chem
import pandas as pd

## Identify common monomers in each donor

Import dataset

In [2]:
# Load the dataset into a dataframe. The path is relative, so it is resolved
# against the working directory the notebook is run from. Later cells read the
# "don_mon_SMILES" column from this frame.
df = pd.read_csv('../OSC_dataset_metaanalysis.csv')

SMARTS patterns for common monomers. Monomer A is benzodithiophene (BDT), Monomer B is benzodithiophenedione (BDD), Monomer C is benzotriazole (BTA), Monomer D is benzothiadiazole, Monomer E is quinoxaline (Qx), Monomer F is thieno[3,4-b]thiophene (TT), Monomer G is dithienobenzothiadiazole (DTBT), Monomer H is an imide-fused benzotriazole (TzBI), Monomer I is thieno[3,4-c]pyrrole-4,6-dione (TPD), Monomer J is dithienoquinoxaline (DTQx), Monomer K is thiazolo[5,4-d]thiazole, and Monomer L is benzo[1,2-b:4,5-b']difuran (BDF).



In [3]:
# Ring-atom SMARTS patterns, one per monomer. In each atom primitive, #n is the
# atomic number, rN constrains the size of the smallest ring containing the atom,
# RN the number of rings the atom belongs to, and @ denotes a ring bond.

# A: benzodithiophene (BDT)
smarts_A = "[#6r5R1]@[#6r5R1]@[#16r5R1]@[#6R2]@[#6r6R1]@[#6R2]@[#6r5R1]@[#6r5R1]@[#16r5R1]@[#6R2]@[#6r6R1]@[#6R2]"
# B: benzodithiophenedione (BDD)
smarts_B = "[#16r5R1]@[#6r5R1]@[#6R2]@[#6r6R1](=[#8])@[#6R2]@[#6r5R1]@[#16r5R1]@[#6r5R1]@[#6R2]@[#6r6R1](=[#8])@[#6R2]@[#6r5R1]"
# C: benzotriazole (BTA)
smarts_C = "[#7r5R1]@[#7r5R1]@[#7r5R1]@[#6R2]@[#6r6R1]@[#6r6R1]@[#6r6R1]@[#6r6R1]@[#6R2]"
# D: benzothiadiazole
smarts_D = "[#7r5R1]@[#16r5R1]@[#7r5R1]@[#6R2]@[#6r6R1]@[#6r6R1]@[#6r6R1]@[#6r6R1]@[#6R2]"
# E: quinoxaline (Qx)
smarts_E = "[#7r6R1]@[#6r6R1]@[#6r6R1]@[#7r6R1]@[#6R2]@[#6r6R1]@[#6r6R1]@[#6r6R1]@[#6r6R1]@[#6R2]"
# F: thieno[3,4-b]thiophene (TT)
smarts_F = "[#6r5R1]@[#6r5R1]@[#16r5R1]@[#6R2]@[#6r5R1]@[#16r5R1]@[#6r5R1]@[#6R2]"
# G: dithienobenzothiadiazole (DTBT)
smarts_G = "[#6r5R1]@[#6r5R1]@[#16r5R1]@[#6R2]@[#6R2]@[#16r5R1]@[#6r5R1]@[#6r5R1]@[#6R2]@[#6R2]@[#7r5R1]@[#16r5R1]@[#7r5R1]@[#6R2]@[#6R2]"
# H: imide-fused benzotriazole (TzBI)
smarts_H = "[#7r5R1]@[#6r5R1](=[#8])@[#6R2]@[#6r6R1]@[#6R2]@[#7r5R1]@[#7r5R1]@[#7r5R1]@[#6R2]@[#6r6R1]@[#6R2]@[#6r5R1](=[#8])"
# I: thieno[3,4-c]pyrrole-4,6-dione (TPD)
smarts_I = "[#7r5R1]@[#6r5R1](=[#8])@[#6R2]@[#6r5R1]@[#16r5R1]@[#6r5R1]@[#6R2]@[#6r5R1](=[#8])"
# J: dithienoquinoxaline (DTQx)
smarts_J = "[#6r5R1]@[#6r5R1]@[#16r5R1]@[#6R2]@[#6R2]@[#16r5R1]@[#6r5R1]@[#6r5R1]@[#6R2]@[#6R2]@[#7r6R1]@[#6r6R1]@[#6r6R1]@[#7r6R1]@[#6R2]@[#6R2]"
# K: thiazolo[5,4-d]thiazole
smarts_K = "[#16r5R1]@[#6r5R1]@[#7r5R1]@[#6R2]@[#16r5R1]@[#6r5R1]@[#7r5R1]@[#6R2]"
# L: benzo[1,2-b:4,5-b']difuran (BDF)
smarts_L = "[#6r5R1]@[#6r5R1]@[#8r5R1]@[#6R2]@[#6r6R1]@[#6R2]@[#6r5R1]@[#6r5R1]@[#8r5R1]@[#6R2]@[#6r6R1]@[#6R2]"

# Patterns in A..L order; code that indexes results by position relies on this order.
smarts_list = [smarts_A, smarts_B, smarts_C, smarts_D, smarts_E, smarts_F, smarts_G, smarts_H, smarts_I, smarts_J, smarts_K, smarts_L]

Count how many times each type of monomer appears in a donor's monomer SMILES

In [4]:
def count_monomers(smiles, smarts_list):
    """Count substructure matches of each SMARTS pattern in one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to search.
    smarts_list : iterable of str
        SMARTS patterns to look for.

    Returns
    -------
    list of int
        One entry per pattern, in the order of ``smarts_list``: the number of
        matches ``GetSubstructMatches`` reports for that pattern.

    Raises
    ------
    ValueError
        If ``smiles`` or one of the SMARTS patterns cannot be parsed by RDKit.
    """
    # Parse the molecule once: it is the same for every pattern, so there is
    # no reason to rebuild it inside the loop.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals a parse failure by returning None; fail with a message
        # that names the offending input instead of an AttributeError later.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    count_list = []
    for smarts in smarts_list:
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is None:
            raise ValueError(f"Could not parse SMARTS: {smarts!r}")
        count_list.append(len(mol.GetSubstructMatches(pattern)))

    return count_list

In [5]:
# One list of counts per SMARTS pattern, in the same order as smarts_list.
# After the loop each inner list holds one count per dataframe row.
monomer_count_list = [[] for _ in smarts_list]

# Iterate over the column values directly. Indexing df["don_mon_SMILES"][x]
# with a positional counter is a label lookup and only works while the frame
# has a default 0..n-1 index.
for mon_smiles in df["don_mon_SMILES"]:
    count_list = count_monomers(mon_smiles, smarts_list)

    for monomer_counts, count in zip(monomer_count_list, count_list):
        monomer_counts.append(count)

# Keep the per-monomer names (A..L) that the following cells use. These are
# the same list objects held in monomer_count_list, not copies.
(
    monomer_A_count,
    monomer_B_count,
    monomer_C_count,
    monomer_D_count,
    monomer_E_count,
    monomer_F_count,
    monomer_G_count,
    monomer_H_count,
    monomer_I_count,
    monomer_J_count,
    monomer_K_count,
    monomer_L_count,
) = monomer_count_list

Add monomer counts to the dataframe

In [6]:
# Map each new column label to the list of per-row counts for that monomer.
# Dict insertion order fixes the order in which the columns are added (A..L).
count_columns = {
    "A_count": monomer_A_count,
    "B_count": monomer_B_count,
    "C_count": monomer_C_count,
    "D_count": monomer_D_count,
    "E_count": monomer_E_count,
    "F_count": monomer_F_count,
    "G_count": monomer_G_count,
    "H_count": monomer_H_count,
    "I_count": monomer_I_count,
    "J_count": monomer_J_count,
    "K_count": monomer_K_count,
    "L_count": monomer_L_count,
}

# Assign in place, one column at a time, so df keeps its identity.
for column_label, counts in count_columns.items():
    df[column_label] = counts

Filter the dataframe columns and write a new CSV

In [8]:
# Columns to keep in the exported table: identifiers and metadata first,
# followed by the per-monomer match counts.
export_columns = [
    'updated_NFA_ID',
    'reported_acceptor',
    'don_ID',
    'reported_donor',
    'DOI',
    'PCE_avg',
    'donor_type',
    'acc_SMILES_sidechains',
    'don_mon_SMILES',
    'A_count',
    'B_count',
    'C_count',
    'D_count',
    'E_count',
    'F_count',
    'G_count',
    'H_count',
    'I_count',
    'J_count',
    'K_count',
    'L_count',
]

# NOTE(review): DataFrame.filter(items=...) keeps only the listed labels that
# are present and does not raise for absent ones - confirm that is intended.
df2 = df.filter(items=export_columns, axis="columns")

# Written relative to the working directory; the index is included in the
# output because to_csv is called with its defaults.
df2.to_csv("don_monomer_smarts.csv")