# Make some new features

In [52]:
import pandas as pd 
from collections import defaultdict
import math
import re

In [2]:
df = pd.read_csv('../data/X_frac_block.csv')

In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,[W],[Ta],[Tr],[R],total_count,sequence_type,block_number,block_type,R_avg_cluster_size,...,max_cluster_size,R_no_of_cluster,Ta_no_of_cluster,Tr_no_of_cluster,W_no_of_cluster,R_max_cluster_size,Ta_max_cluster_size,Tr_max_cluster_size,W_max_cluster_size,smiles
0,0,4.0,4.0,0.0,0.0,8.0,block,2,[W][Ta],0.0,...,4,0.0,1.0,0.0,1.0,0.0,4.0,0.0,4.0,[W][W][W][W][Ta][Ta][Ta][Ta]
1,1,12.0,4.0,0.0,0.0,16.0,block,2,[W][Ta],0.0,...,12,0.0,1.0,0.0,1.0,0.0,4.0,0.0,12.0,[W][W][W][W][W][W][W][W][W][W][W][W][Ta][Ta][T...
2,2,4.0,12.0,0.0,0.0,16.0,block,2,[W][Ta],0.0,...,12,0.0,1.0,0.0,1.0,0.0,12.0,0.0,4.0,[W][W][W][W][Ta][Ta][Ta][Ta][Ta][Ta][Ta][Ta][T...
3,3,12.0,12.0,0.0,0.0,24.0,block,2,[W][Ta],0.0,...,12,0.0,1.0,0.0,1.0,0.0,12.0,0.0,12.0,[W][W][W][W][W][W][W][W][W][W][W][W][Ta][Ta][T...
4,4,4.0,0.0,4.0,0.0,8.0,block,2,[W][Tr],0.0,...,4,0.0,0.0,1.0,1.0,0.0,0.0,4.0,4.0,[W][W][W][W][Tr][Tr][Tr][Tr]


In [5]:
smiles = df['smiles'].values

In [8]:
# Building-block tokens looked for in the polymer strings.
CHARACTERS = ['[W]', '[Tr]', '[Ta]', '[R]']

In [53]:
# Map every token to a single lowercase letter. zip() stops at the shorter
# input, so only as many letters as there are tokens end up in the mapping.
CHARACTER_REPLACEMENT = dict(zip(CHARACTERS, 'abcdefghij'))

In [15]:
def get_counts(smiles, characters=CHARACTERS):
    """Count how often each token occurs in ``smiles``.

    Args:
        smiles: String to search.
        characters: Tokens to count (non-overlapping ``str.count`` matches).

    Returns:
        Dict mapping each token to its number of occurrences.
    """
    return {token: smiles.count(token) for token in characters}

In [114]:
def get_shannon_nonzero(character_count):
    """Shannon entropy (base 2) of the token distribution, relative to ``entropy_max``.

    Only tokens with a count greater than zero contribute. The entropy is
    divided by ``entropy_max(length)``, where ``length`` is the sum of the
    non-zero counts.

    Args:
        character_count: Dict mapping token -> count (counts are expected to
            be non-negative integers, as produced by ``get_counts``).

    Returns:
        The entropy ratio as a float. Returns 0.0 when the total count is 0
        or 1, where the reference entropy is zero (or undefined) and the
        ratio would otherwise raise ``ZeroDivisionError``.
    """
    counts = [c for c in character_count.values() if c > 0]
    length = sum(counts)

    # entropy_max(0) divides by zero and entropy_max(1) is 0, so the ratio is
    # undefined for these inputs; report "no diversity" instead of crashing.
    if length <= 1:
        return 0.0

    probs = [count / length for count in counts]
    ideal_entropy = entropy_max(length)
    entropy = -sum([p * math.log(p) / math.log(2.0) for p in probs])

    return entropy / ideal_entropy

In [35]:
def entropy_max(length):
    """Return the maximum Shannon entropy (in bits) of a string of ``length`` symbols.

    The maximum is taken as the entropy of ``length`` equally likely symbols,
    each with probability ``1 / length``.
    """
    uniform_prob = 1.0 / length
    # Sum over `length` identical terms, written as a single product.
    weight = -1.0 * length * uniform_prob
    return weight * math.log(uniform_prob) / math.log(2.0)

In [48]:
def get_balance(character_count):
    """Convert token counts into fractions of the total count.

    Args:
        character_count: Dict mapping token -> count.

    Returns:
        Dict with the same keys, each mapped to ``count / sum(counts)``.
        When the total is zero (e.g. no token was found) every fraction is
        0.0 instead of raising ``ZeroDivisionError``.
    """
    length = sum(character_count.values())
    if length == 0:
        return {key: 0.0 for key in character_count}
    return {key: count / length for key, count in character_count.items()}

In [16]:
get_counts(smiles[0])

{'[W]': 4, '[Tr]': 0, '[Ta]': 4, '[R]': 0}

In [46]:
get_shannon_nonzero(get_counts(smiles[0]))

(1.0, 3.0)

In [50]:
get_balance(get_counts(smiles[0]))

{'[W]': 0.5, '[Tr]': 0.0, '[Ta]': 0.5, '[R]': 0.0}

In [64]:
multiple_replace(smiles[0])

'aaaacccc'

In [74]:
# The pattern matches a word character followed by one or more repeats of
# itself, i.e. runs of at least two identical letters; single letters are
# not reported. Each result is a (full run, repeated letter) tuple.
re.findall(r'((\w)\2{1,})', multiple_replace('[W][W][Ta][W][Ta]'))

[('aa', 'a')]

In [63]:
def multiple_replace(s, replacement_dict=CHARACTER_REPLACEMENT):
    """Apply every ``token -> replacement`` substitution of ``replacement_dict`` to ``s``.

    Substitutions are applied one after another in the dict's iteration
    order, each on the result of the previous one.
    """
    for token, substitute in replacement_dict.items():
        s = s.replace(token, substitute)
    return s

In [75]:
smiles[0]

'[W][W][W][W][Ta][Ta][Ta][Ta]'

In [97]:
def find_clusters(s, replacement_dict=CHARACTER_REPLACEMENT):
    """Find runs of two or more identical consecutive tokens in ``s``.

    The string is first rewritten with ``multiple_replace`` so that every
    token becomes a single letter; runs are then located with a
    back-referencing regular expression.

    Args:
        s: Token string to analyse.
        replacement_dict: Mapping token -> single-letter replacement.

    Returns:
        Dict mapping every token of ``replacement_dict`` to the list of run
        lengths found for it (empty list when the token forms no run).
    """
    encoded = multiple_replace(s, replacement_dict)
    # Key the result by the mapping that was actually passed in, not by the
    # module-level default, so custom replacement dicts work.
    cluster_dict = {token: [] for token in replacement_dict}
    inv_replacement_dict = {v: k for k, v in replacement_dict.items()}
    # Each match is (full run, repeated letter); {1,} requires >= 2 letters.
    for cluster, character in re.findall(r'((\w)\2{1,})', encoded):
        cluster_dict[inv_replacement_dict[character]].append(len(cluster))

    return cluster_dict

In [98]:
find_clusters('[W][W][W][W][Ta][Ta][Ta][Ta][W][W][W]')

{'[W]': [4, 3], '[Tr]': [], '[Ta]': [4], '[R]': []}

In [107]:
def get_cluster_stats(s, normalized: bool = True, replacement_dict: dict=CHARACTER_REPLACEMENT):
    """Summarise the runs ("clusters") of identical tokens found in ``s``.

    Args:
        s: Token string to analyse.
        normalized: If True, every ``num_``/``max_``/``min_`` value is divided
            by the total number of clusters.
        replacement_dict: Mapping token -> single-letter replacement, passed
            on to ``find_clusters``.

    Returns:
        Dict with ``'total_clusters'`` plus, per token, ``'num_<token>'``
        (number of clusters), ``'max_<token>'`` and ``'min_<token>'``
        (largest / smallest cluster length, 0 when the token has none).
        When ``normalized`` is True and no cluster exists at all, the
        per-token values are 0.0 rather than raising ``ZeroDivisionError``.
    """
    clusters = find_clusters(s, replacement_dict)
    cluster_stats = {'total_clusters': 0}
    for key, sizes in clusters.items():
        cluster_stats['num' + '_' + key] = len(sizes)
        cluster_stats['max' + '_' + key] = max(sizes, default=0)
        cluster_stats['min' + '_' + key] = min(sizes, default=0)
        cluster_stats['total_clusters'] += len(sizes)

    if normalized:
        total = cluster_stats['total_clusters']
        for key in list(cluster_stats):
            if key != 'total_clusters':
                # Without any cluster all values are 0; avoid dividing by 0.
                cluster_stats[key] = cluster_stats[key] / total if total else 0.0

    return cluster_stats

In [112]:
def get_head_tail_features(string, characters=CHARACTERS):
    """Encode, per token, whether ``string`` starts and/or ends with it.

    Returns:
        Dict ``'head_tail_<token>' -> 0, 1 or 2``: one point if the string
        starts with the token and one more if it ends with it.
    """
    return {
        'head_tail_' + char: int(string.startswith(char)) + int(string.endswith(char))
        for char in characters
    }

In [108]:
get_cluster_stats('[W][W][W][W][Ta][Ta][Ta][Ta][W][W][W]')

{'total_clusters': 3,
 'num_[W]': 0.6666666666666666,
 'max_[W]': 1.3333333333333333,
 'min_[W]': 1.0,
 'num_[Tr]': 0.0,
 'max_[Tr]': 0.0,
 'min_[Tr]': 0.0,
 'num_[Ta]': 0.3333333333333333,
 'max_[Ta]': 1.3333333333333333,
 'min_[Ta]': 1.3333333333333333,
 'num_[R]': 0.0,
 'max_[R]': 0.0,
 'min_[R]': 0.0}

In [113]:
get_head_tail_features('[W][W][W][W][Ta][Ta][Ta][Ta][W][W][W]')

{'head_tail_[W]': 2,
 'head_tail_[Tr]': 0,
 'head_tail_[Ta]': 0,
 'head_tail_[R]': 0}

In [125]:
class PolymerSmilesFeaturizer:
    """Compute composition and sequence features for a polymer token string.

    The string is expected to be a concatenation of the tokens ``[W]``,
    ``[Tr]``, ``[Ta]`` and ``[R]``. Call :meth:`featurize` to obtain a flat
    dict of features: head/tail flags, cluster statistics, token fractions,
    relative Shannon entropy and total token count.
    """

    def __init__(self, smiles: str, normalized_cluster_stats: bool = True):
        """Store the string and the featurization settings.

        Args:
            smiles: Token string to featurize.
            normalized_cluster_stats: Passed to :meth:`get_cluster_stats` as
                ``normalized``.
        """
        self.smiles = smiles
        self.characters = ['[W]', '[Tr]', '[Ta]', '[R]']
        # zip() truncates to the number of tokens, so only 'a'..'d' are used.
        self.replacement_dict = dict(zip(self.characters, ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']))
        self.normalized_cluster_stats = normalized_cluster_stats

    @staticmethod
    def get_head_tail_features(string: str, characters: list):
        """0/1/2 encoded feature indicating if the building block is at start/end of the polymer chain"""
        is_head_tail = [0] * len(characters)

        for i, char in enumerate(characters):
            if string.startswith(char):
                is_head_tail[i] += 1
            if string.endswith(char):
                is_head_tail[i] += 1

        new_keys = ['head_tail_' + char for char in characters]
        return dict(zip(new_keys, is_head_tail))

    @staticmethod
    def get_cluster_stats(s: str, replacement_dict: dict, normalized: bool = True):
        """Summarise the runs ("clusters") of identical tokens found in ``s``.

        Args:
            s: Token string to analyse.
            replacement_dict: Mapping token -> single-letter replacement.
            normalized: If True, every ``num_``/``max_``/``min_`` value is
                divided by the total number of clusters.

        Returns:
            Dict with ``'total_clusters'`` plus, per token, ``'num_<token>'``,
            ``'max_<token>'`` and ``'min_<token>'`` (0 when the token has no
            cluster). When ``normalized`` is True and there is no cluster at
            all, the per-token values are 0.0 instead of raising
            ``ZeroDivisionError``.
        """
        clusters = PolymerSmilesFeaturizer.find_clusters(s, replacement_dict)
        cluster_stats = {'total_clusters': 0}
        for key, sizes in clusters.items():
            cluster_stats['num' + '_' + key] = len(sizes)
            cluster_stats['max' + '_' + key] = max(sizes, default=0)
            cluster_stats['min' + '_' + key] = min(sizes, default=0)
            cluster_stats['total_clusters'] += len(sizes)

        if normalized:
            total = cluster_stats['total_clusters']
            for key in list(cluster_stats):
                if key != 'total_clusters':
                    # Without any cluster all values are 0; avoid dividing by 0.
                    cluster_stats[key] = cluster_stats[key] / total if total else 0.0

        return cluster_stats

    @staticmethod
    def find_clusters(s: str, replacement_dict: dict):
        """Return, per token, the lengths of all runs of >= 2 identical consecutive tokens."""
        encoded = PolymerSmilesFeaturizer._multiple_replace(s, replacement_dict)
        cluster_dict = {token: [] for token in replacement_dict}
        inv_replacement_dict = {v: k for k, v in replacement_dict.items()}
        # Each match is (full run, repeated letter); {1,} requires >= 2 letters.
        for cluster, character in re.findall(r'((\w)\2{1,})', encoded):
            cluster_dict[inv_replacement_dict[character]].append(len(cluster))

        return cluster_dict

    @staticmethod
    def _multiple_replace(s: str, replacement_dict: dict):
        """Apply every ``token -> replacement`` substitution to ``s``, one after another."""
        for word in replacement_dict:
            s = s.replace(word, replacement_dict[word])
        return s

    @staticmethod
    def get_counts(smiles: str, characters: list):
        """Return a dict mapping each token to its number of occurrences in ``smiles``."""
        counts = [smiles.count(char) for char in characters]
        return dict(zip(characters, counts))

    @staticmethod
    def get_relative_shannon(character_count: dict):
        """Shannon entropy (base 2) of the token distribution, relative to ``_entropy_max``.

        Only tokens with a count greater than zero contribute. Returns 0.0
        when the total count is 0 or 1, where the reference entropy is zero
        (or undefined) and the ratio would otherwise raise
        ``ZeroDivisionError``.
        """
        counts = [c for c in character_count.values() if c > 0]
        length = sum(counts)
        if length <= 1:
            return 0.0

        probs = [count / length for count in counts]
        ideal_entropy = PolymerSmilesFeaturizer._entropy_max(length)
        entropy = -sum([p * math.log(p) / math.log(2.0) for p in probs])

        return entropy / ideal_entropy

    @staticmethod
    def _entropy_max(length: int):
        "Calculates the max Shannon entropy of a string with given length"

        prob = 1.0 / length

        return -1.0 * length * prob * math.log(prob) / math.log(2.0)

    @staticmethod
    def get_balance(character_count: dict):
        """Convert token counts into fractions of the total count.

        Returns 0.0 for every token when the total count is zero instead of
        raising ``ZeroDivisionError``.
        """
        length = sum(character_count.values())
        if length == 0:
            return {key: 0.0 for key in character_count}
        return {key: count / length for key, count in character_count.items()}

    def _featurize(self):
        """Compute all feature groups and merge them into ``self.features``."""
        self._character_count = PolymerSmilesFeaturizer.get_counts(self.smiles, self.characters)
        self._balance = PolymerSmilesFeaturizer.get_balance(self._character_count)
        self._relative_shannon = PolymerSmilesFeaturizer.get_relative_shannon(self._character_count)
        self._cluster_stats = PolymerSmilesFeaturizer.get_cluster_stats(self.smiles, self.replacement_dict, self.normalized_cluster_stats)
        self._head_tail_feat = PolymerSmilesFeaturizer.get_head_tail_features(self.smiles, self.characters)

        # Copy so that merging the other groups does not mutate
        # self._head_tail_feat through a shared reference.
        self.features = dict(self._head_tail_feat)
        self.features.update(self._cluster_stats)
        self.features.update(self._balance)
        self.features['rel_shannon'] = self._relative_shannon
        self.features['length'] = sum(self._character_count.values())

    def featurize(self):
        """Run the featurization and return the merged feature dict."""
        self._featurize()
        return self.features

In [126]:
psf = PolymerSmilesFeaturizer('[W][W][W][W][Ta][Ta][Ta][Ta][W][W][W]')

In [127]:
psf.featurize()

{'head_tail_[W]': 2,
 'head_tail_[Tr]': 0,
 'head_tail_[Ta]': 0,
 'head_tail_[R]': 0,
 'total_clusters': 3,
 'num_[W]': 0.6666666666666666,
 'max_[W]': 1.3333333333333333,
 'min_[W]': 1.0,
 'num_[Tr]': 0.0,
 'max_[Tr]': 0.0,
 'min_[Tr]': 0.0,
 'num_[Ta]': 0.3333333333333333,
 'max_[Ta]': 1.3333333333333333,
 'min_[Ta]': 1.3333333333333333,
 'num_[R]': 0.0,
 'max_[R]': 0.0,
 'min_[R]': 0.0,
 '[W]': 0.6363636363636364,
 '[Tr]': 0.0,
 '[Ta]': 0.36363636363636365,
 '[R]': 0.0,
 'rel_shannon': 0.27335713170510495,
 'length': 11}