In [None]:
import pandas as pd
from src.constants import (
    PREPROCESSING_PATH,
    TARGET
)

import numpy as np
import re
from typing import List
from collections import Counter
import itertools
import seaborn as sns

In [None]:
# Load the merged train and pretest tables; in each of them the first CSV
# column is promoted to the index.
Train = pd.read_csv(
    f"{PREPROCESSING_PATH}/train_merged_CIF/Train.csv"
).pipe(lambda frame: frame.set_index(frame.columns[0]))

Pretest = pd.read_csv(
    f"{PREPROCESSING_PATH}/pretest_merged_CIF/Pretest.csv"
).pipe(lambda frame: frame.set_index(frame.columns[0]))

In [None]:
def molecule_to_element_list(molecule: str) -> List[str]:
    """Split a functional-group string into its tokens.

    A token is either an uppercase letter with an optional lowercase letter
    and optional digits (e.g. ``'Me'``, ``'O3'``), or a parenthesised group
    followed by digits. Characters matching neither form (such as ``'-'``)
    are skipped.

    Args:
        molecule: The functional-group string to tokenize.

    Returns:
        The tokens in order of appearance, or an empty list when ``molecule``
        is not a string (``re.findall`` raises ``TypeError`` for such input).
    """
    token_pattern = r'[A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+'
    try:
        tokens = re.findall(token_pattern, molecule)
    except TypeError:
        # Non-string input (e.g. None or NaN) yields no tokens.
        tokens = []
    return tokens



# Quick sanity check of the tokenizer on two sample functional groups.
for sample in ("NHMe-OH", "SO3H"):
    print(molecule_to_element_list(sample))

['N', 'H', 'Me', 'O', 'H']
['S', 'O3', 'H']


In [None]:
# Copy of Train under the name `df`; the exploratory cells further down read from `df`.
df = Train.copy()

In [None]:
# Scratch: bare expression that displays the itertools.tee function object.
itertools.tee

<function itertools.tee(iterable, n=2, /)>

In [None]:
class FunctionalGroupToGramExtractor():
    """Build unigram / bigram count features from functional-group strings.

    Call ``transform(df, fit=True)`` on the training frame to choose which
    n-gram columns to keep, then ``transform(other_df, fit=False)`` to get
    exactly the same columns for another frame.
    """

    # Names of the n-gram feature columns kept by the last ``fit=True`` call.
    # The class-level default keeps an unfitted extractor usable (it returns
    # a frame without columns); __init__ gives every instance its own list.
    specific_columns: list = []

    def __init__(self):
        self.specific_columns = []

    @staticmethod
    def molecule_to_element_list(molecule: str) -> List[str]:
        """Split a functional-group string into its tokens.

        A token is an uppercase letter with an optional lowercase letter and
        optional digits, or a parenthesised group followed by digits.

        Returns:
            The tokens in order, or ``[]`` when ``molecule`` is not a string
            (``re.findall`` raises ``TypeError`` for such input).
        """
        try:
            return re.findall(
                r'[A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+',
                molecule
            )
        except TypeError:
            return []

    @staticmethod
    def _ngram(molecule, n):
        """Return the '-'-joined runs of ``n`` consecutive items of ``molecule``."""
        iterators = itertools.tee(molecule, n)
        for offset, iterator in enumerate(iterators):
            # Advance the k-th copy by k items so that zip yields sliding windows.
            for _ in range(offset):
                next(iterator, None)
        return ["-".join(gram) for gram in zip(*iterators)]

    @staticmethod
    def bigram(molecule):
        """Return every pair of adjacent tokens joined with '-'."""
        return FunctionalGroupToGramExtractor._ngram(molecule, 2)

    @staticmethod
    def trigram(molecule):
        """Return every run of three adjacent tokens joined with '-'."""
        return FunctionalGroupToGramExtractor._ngram(molecule, 3)

    @staticmethod
    def _counts_to_frame(count_dicts, prefix):
        """Expand a Series of ``{token: count}`` dicts into a prefixed count frame."""
        frame = pd.DataFrame(list(count_dicts.values)).fillna(0)
        frame.index = count_dicts.index
        # add_prefix (instead of ``prefix + frame.columns``) also works when no
        # row produced any token and the frame therefore has no columns.
        return frame.add_prefix(prefix)

    def transform(self, df, fit=False, filter_ratio=0.01):
        """Compute the unigram / bigram count features for ``df``.

        Side effect: the helper columns ``functional_group_list``,
        ``functional_group_bigram_list``, ``functional_group_unigram`` and
        ``functional_group_bigram`` are written to ``df`` in place.

        Args:
            df: Frame with a ``functional_groups`` column.
            fit: When True, (re)select the kept columns from ``df`` and store
                them in ``self.specific_columns``. When False, reuse the stored
                selection; columns absent from ``df`` are filled with 0.
            filter_ratio: With ``fit=True``, a column is kept only if the share
                of rows with a positive count is strictly above this value.

        Returns:
            A frame indexed like ``df`` holding the columns listed in
            ``self.specific_columns``, in that order.
        """
        df['functional_group_list'] = df.functional_groups.apply(
            self.molecule_to_element_list
        )
        df['functional_group_bigram_list'] = df['functional_group_list'].apply(
            self.bigram
        )
        df['functional_group_unigram'] = df['functional_group_list'].apply(
            lambda tokens: dict(Counter(tokens))
        )
        df['functional_group_bigram'] = df['functional_group_bigram_list'].apply(
            lambda grams: dict(Counter(grams))
        )

        unigram_df = self._counts_to_frame(
            df['functional_group_unigram'], 'functional_group_unigram_'
        )
        bigram_df = self._counts_to_frame(
            df['functional_group_bigram'], 'functional_group_bigram_'
        )
        congram_df = pd.concat([unigram_df, bigram_df], axis=1)

        if fit:
            # Share of rows in which each n-gram occurs at least once.
            presence_ratio = (congram_df > 0).mean()
            self.specific_columns = presence_ratio[
                presence_ratio > filter_ratio
            ].index.tolist()

        # Align to the fitted columns in one step; n-grams that never occur in
        # this frame become all-zero columns.
        return congram_df.reindex(columns=self.specific_columns, fill_value=0)



# Fit on Train: the extractor stores the selected feature columns and returns
# them for Train. transform also writes its helper columns into Train in place.
# NOTE(review): `funtional_group_extractor` is misspelled ("functional"); the
# name is kept because a later cell refers to it.
funtional_group_extractor = FunctionalGroupToGramExtractor()
train = funtional_group_extractor.transform(Train, fit=True)

In [None]:
# Reuse the columns selected during the fit so Pretest gets the same features.
pretest = funtional_group_extractor.transform(Pretest, fit=False)

In [None]:
# Index.equals compares length, order and labels; an element-wise `==` would
# raise ValueError (instead of failing the assert) if the lengths differed.
assert train.columns.equals(pretest.columns), "train and pretest feature columns differ"

In [None]:
# Share of rows in which each selected n-gram feature is present (> 0).
# The loop originally read an undefined name `test`; the frame produced by the
# fit=True transform is `train`.
for col in train.columns:
    print(col)
    print(np.mean(train[col] > 0))

functional_group_unigram_C
0.2482182676752219
functional_group_unigram_O
0.49914739189366447
functional_group_unigram_H
0.5182691326716511
functional_group_unigram_Et
0.14325273636191393
functional_group_unigram_F
0.09951466923177824
functional_group_unigram_Me
0.26130616646991095
functional_group_unigram_S
0.08464868173669712
functional_group_unigram_O3
0.08464868173669712
functional_group_unigram_N
0.3359275938962004
functional_group_unigram_Pr
0.1347266552985586
functional_group_unigram_O2
0.12193024645475348
functional_group_unigram_H2
0.08915220147785405
functional_group_unigram_Br
0.0709195050500634
functional_group_unigram_Cl
0.091440397592293
functional_group_unigram_Ph
0.04443764301225715
functional_group_unigram_I
0.06402576771165815
functional_group_bigram_C-O
0.18484835235305264
functional_group_bigram_O-O
0.09794062349700494
functional_group_bigram_O-H
0.20469881800824916
functional_group_bigram_H-O
0.03991954877355603
functional_group_bigram_O-Et
0.0749857898648944
functi

In [None]:
# Correlation of every bigram count with the target.
# Reads from Train rather than `df`: `df` was copied from Train before
# transform() added the 'functional_group_bigram' column, so on a fresh kernel
# the column only exists on Train.
bigram_df = pd.DataFrame(list(Train['functional_group_bigram'].values)).fillna(0)
bigram_df.index = Train.index
bigram_df[TARGET] = Train[TARGET]
corr = bigram_df.corr()[[TARGET]]

In [None]:
# NOTE(review): in the cells shown here `unigram_df` is only created inside
# FunctionalGroupToGramExtractor.transform, so this scratch expression
# presumably relied on a leftover kernel variable; confirm or remove.
'functional_group_unigram_' + unigram_df.columns

Index(['functional_group_unigram_C', 'functional_group_unigram_O',
       'functional_group_unigram_H', 'functional_group_unigram_Et',
       'functional_group_unigram_F', 'functional_group_unigram_Me',
       'functional_group_unigram_S', 'functional_group_unigram_O3',
       'functional_group_unigram_N', 'functional_group_unigram_Pr',
       'functional_group_unigram_O2', 'functional_group_unigram_H2',
       'functional_group_unigram_Br', 'functional_group_unigram_Cl',
       'functional_group_unigram_Ph', 'functional_group_unigram_I',
       'functional_group_unigram_CO2_working_capacity [mL/g]'],
      dtype='object')

In [None]:

# Correlation of every unigram count with the target.
# NOTE(review): `unigram_df` is not created at notebook level in the cells
# shown here; this presumably relied on a leftover kernel variable; confirm.
unigram_df[TARGET] = df[TARGET]
corr = unigram_df.corr()[[TARGET]]

In [None]:
# Scratch: a list with one dict becomes a one-row frame with column 'x'.
pd.DataFrame([{'x': 1}])

Unnamed: 0,x
0,1


In [None]:
# NOTE(review): no cell shown here creates a 'functional_group_trigram' column
# on `df`; this presumably relied on leftover kernel state; confirm or remove.
pd.DataFrame(df['functional_group_trigram'].values.flatten())

Unnamed: 0,0
0,"{'C-O-O': 1, 'O-O-H': 1, 'O-H-O': 1, 'H-O-Et': 1}"
1,{'F-O-Me': 1}
2,"{'O-Me-C': 1, 'Me-C-O': 1, 'C-O-O': 1, 'O-O-H'..."
3,"{'H-S-O3': 1, 'S-O3-H': 1}"
4,"{'N-H-Me': 1, 'H-Me-O': 1, 'Me-O-H': 1}"
...,...
68608,{}
68609,{'S-O3-H': 1}
68610,{}
68611,{'O-Pr-Me': 1}


In [None]:
# Scratch: show the two independent iterators itertools.tee builds for every
# token list. The comprehension was missing its element expression
# (`[for tee in ...]` is a SyntaxError). Reads from Train because
# 'functional_group_list' is added to Train by transform(), not to the earlier
# copy `df`.
Train['functional_group_list'].apply(
    lambda x: [tee for tee in itertools.tee(x, 2)]
)

Unnamed: 0
1        [<itertools._tee object at 0x7f41152fc200>, <i...
2        [<itertools._tee object at 0x7f4114acffc0>, <i...
3        [<itertools._tee object at 0x7f41152d4980>, <i...
4        [<itertools._tee object at 0x7f41152d4180>, <i...
5        [<itertools._tee object at 0x7f41152609c0>, <i...
                               ...                        
68609    [<itertools._tee object at 0x7f410db22bc0>, <i...
68610    [<itertools._tee object at 0x7f410db22c80>, <i...
68611    [<itertools._tee object at 0x7f410db22d40>, <i...
68612    [<itertools._tee object at 0x7f410db22e00>, <i...
68613    [<itertools._tee object at 0x7f410db22ec0>, <i...
Name: functional_group_list, Length: 68613, dtype: object

In [None]:
import itertools




# `trigram` only exists as a static method of the extractor class, so call it
# through the class; a bare `trigram(...)` raises NameError on a fresh kernel.
FunctionalGroupToGramExtractor.trigram(['N', 'H', 'Me', 'O', 'H'])

['N-H-Me', 'H-Me-O', 'Me-O-H']

In [None]:
# Scratch: Counter tallies how often each token occurs in the list.
Counter(['N', 'H', 'Me', 'O', 'H'])

Counter({'N': 1, 'H': 2, 'Me': 1, 'O': 1})

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cf8541de-dbc3-45f6-bc1e-4fa446cacbcd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>