# Uslovna slučajna polja - Conditional Random Fields (CRF)

In [1]:
from sklearn_crfsuite import CRF

In [2]:
# Build the per-position feature dicts for an input sequence X
def create_features(X):
    """Turn a sequence of symbols into a list of CRF feature dicts.

    One dict is produced per position of ``X``, in order. Each dict has:

    * ``'bias'``   -- the constant 1
    * ``'nuc'``    -- the lower-cased symbol at that position
    * ``'-1:nuc'`` -- the lower-cased symbol at the previous position
      (omitted for the first position, which has no predecessor)

    Args:
        X: an indexable sequence whose items support ``.lower()``
            (e.g. a string of nucleotide letters).

    Returns:
        A list with ``len(X)`` feature dicts; empty when ``X`` is empty.
    """
    lowered = [symbol.lower() for symbol in X]

    features = []
    for position, nucleotide in enumerate(lowered):
        token_features = {'bias': 1, 'nuc': nucleotide}
        if position:
            # Only positions after the first have a left neighbour.
            token_features['-1:nuc'] = lowered[position - 1]
        features.append(token_features)

    return features

In [3]:
# Data for training/testing the model.
# X holds the nucleotide sequences; Y holds one label character ('+' or '-')
# per position, so Y[k] lines up character-for-character with X[k].
X = [
    'ATTATATTATATTATATTAAA',
    'CCTAGTCGTGTCGCAAAAAAAACTGCTCTGACCTGAGC',
    'ATTATATGCCGCGGC',
    'GGGCCCCGGCTTATATAT',
]
Y = [
    '---------------------',
    '++++++++++++++--------++++++++++++++++',
    '-------++++++++',
    '++++++++++--------',
]

In [4]:
# One list of per-position feature dicts for every training sequence.
X_prepared = list(map(create_features, X))

In [5]:
# Example of one feature sequence: the feature dicts built for the first
# training string, one dict per position.
print(X_prepared[0])

[{'bias': 1, 'nuc': 'a'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 't', '-1:nuc': 't'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 't', '-1:nuc': 't'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 't', '-1:nuc': 't'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 't', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 't', '-1:nuc': 't'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 't'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 'a'}, {'bias': 1, 'nuc': 'a', '-1:nuc': 'a'}]


In [6]:
# Shape the data for training: the features are used as prepared, and each
# label string is split into a list with one label per position.
X_train = X_prepared
y_train = list(map(list, Y))

In [7]:
# Initialise the CRF and train the model.
# Per the sklearn-crfsuite documentation: 'lbfgs' selects L-BFGS training,
# c1 / c2 are the L1 / L2 regularisation coefficients, and
# all_possible_transitions=True also generates transition features for label
# pairs that never occur in the training data.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except AttributeError as fit_error:
    # NOTE(review): this handler presumably works around a known
    # incompatibility between sklearn-crfsuite and newer scikit-learn
    # releases, where fit() raises AttributeError even though training has
    # completed -- confirm against the installed versions.
    # The error is still tolerated so the notebook keeps running, but it is
    # reported instead of being dropped silently, so a genuine failure is
    # visible here rather than surfacing later in predict().
    print('crf.fit raised AttributeError (ignored):', fit_error)


In [8]:
# Model evaluation: label a sequence that is not among the training strings.

x = 'CCGGTTAACACCAAATTTATATCTATCTGTTTACCTCGGCGCC'
y_pred = crf.predict([create_features(x)])
# For each predicted label sequence, print the input with its labels directly
# underneath so they can be compared column by column, then a blank line.
for predicted_labels in y_pred:
    print(x, ''.join(predicted_labels), '', sep='\n')

CCGGTTAACACCAAATTTATATCTATCTGTTTACCTCGGCGCC
++++++++++++---------------------++++++++++



In [9]:
# Inspect the learned feature weights per state: one line for every
# ((feature, label), weight) entry of the trained model.
for feature_and_label, weight in crf.state_features_.items():
    print((feature_and_label, weight))

(('nuc:a', '-'), 1.206173)
(('nuc:a', '+'), -1.206173)
(('nuc:t', '-'), 0.94765)
(('nuc:t', '+'), -0.94765)
(('-1:nuc:a', '-'), 0.168122)
(('-1:nuc:a', '+'), -0.168122)
(('nuc:c', '+'), 1.680139)
(('nuc:g', '+'), 0.91978)
(('-1:nuc:g', '+'), 0.585843)


In [10]:
# Inspect the learned weights of the transitions between states, keyed by
# (from_label, to_label); the bare expression is displayed as the cell output.
crf.transition_features_

{('-', '-'): 0.908999,
 ('-', '+'): -1.390005,
 ('+', '-'): -1.304624,
 ('+', '+'): 1.797423}