# Conditional Random Field implementation
## Using small number of demonstrations, no adversarial data
### Now using a discritivation of the state space so that I can represent interesting state features

In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import gym
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

### Load Expert Data 

In [29]:
from experts import Expert

ENV_NAME = 'Acrobot-v1'
expert = Expert(ENV_NAME)
env = gym.make(ENV_NAME)

data, avg_reward, splits = expert.generate_data(num_episodes=100)
sequences = np.split(data, splits)[0:-1]
avg_reward

-93.24

In [30]:
for i in range(6):
    
    print(np.min(data[:,i]), np.max(data[:,i]))

-0.9999942148146757 0.9999999677597143
-0.9999999905199425 0.9999997422606326
-0.9999996651775235 0.9999998237451254
-0.9999999960117508 0.9999999832593169
-9.689276561534452 10.892076040873079
-21.21953680868765 20.16736924230797


In [31]:
env.reset()

array([ 0.99828285,  0.05857777,  0.99552911, -0.09445523,  0.05452896,
        0.03578921])

### Define feature function, and functions to transform data into correct for for crf

In [32]:
def feats(seq, k):
    
    return {
            'f11': np.format_float_positional(seq[k-1][0], 2, unique=False),
            'f12': np.format_float_positional(seq[k][0], 2, unique=False),
            'f21': np.format_float_positional(seq[k-1][1], 2, unique=False),
            'f22': np.format_float_positional(seq[k][1], 2, unique=False),
            'f31': np.format_float_positional(seq[k-1][2], 2, unique=False),
            'f32': np.format_float_positional(seq[k][2], 2, unique=False),
            'f41': np.format_float_positional(seq[k-1][3], 2, unique=False),
            'f42': np.format_float_positional(seq[k][3], 2, unique=False),
            'f51': np.format_float_positional(seq[k-1][4], 1, unique=False),
            'f52': np.format_float_positional(seq[k][4], 1, unique=False),
            'f61': np.format_float_positional(seq[k-1][5], 1, unique=False),
            'f62': np.format_float_positional(seq[k][5], 1, unique=False),
            'first': k == 0
            #'last': k == len(seq)-1
            #'bias': True
    }

def labs(seq, k):
    
    return str(seq[k][6])
    
def seq_to_feats(seq):
    
    return [feats(seq, k) for k in range(len(seq))]

def seq_to_labs(seq):
    
    return [labs(seq, k) for k in range(len(seq))]

X_train = [seq_to_feats(seq) for seq in sequences]
y_train = [seq_to_labs(seq) for seq in sequences]


### Train CRF

In [33]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=25000,
    all_possible_transitions=True,
    all_possible_states=True,
    verbose=1,
    c2=0
)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 100/100 [00:00<00:00, 1577.22it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 1
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 5366
Seconds required: 0.047

L-BFGS optimization
c1: 0.000000
c2: 0.000000
num_memories: 6
max_iterations: 25000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.01  loss=3938.41  active=5164  feature_norm=1.00
Iter 2   time=0.01  loss=3129.21  active=5366  feature_norm=1.55
Iter 3   time=0.01  loss=2631.56  active=5366  feature_norm=2.14
Iter 4   time=0.00  loss=2456.93  active=5366  feature_norm=2.65
Iter 5   time=0.00  loss=2294.78  active=5366  feature_norm=3.11
Iter 6   time=0.00  loss=2004.49  active=5366  feature_norm=4.02
Iter 7   time=0.00  loss=1477.73  active=5366  feature_norm=5.79
Iter 8   time=0.00  loss=868.85   active=5366  feature_norm=8.18
Iter 9   time=0.00  loss=721.55   active=5366  feature_norm=9.44
Iter 10  time

CRF(algorithm='lbfgs', all_possible_states=True, all_possible_transitions=True,
    c2=0, keep_tempfiles=None, max_iterations=25000, verbose=1)

In [34]:
y_hat = np.array(np.hstack([np.array(seq_hat, dtype=np.float64) for seq_hat in crf.predict(X_train)]))

### Examine how well the learned policy performs in the enviroment

In [39]:
rewards = []

for j in range(100):
    sequence = np.empty((400, 6))
    sequence[0] = env.reset()
    i = 0
    done = False
    score = 0
    while not done:
        
        i += 1
        features = seq_to_feats(sequence[0:i])
        #print(features)
        policy = int(float(crf.predict_single(features)[-1]))
        observation, reward, done, info = env.step(policy)
        sequence[i] = observation
        score += reward
#         env.render()
#         time.sleep(0.01)
       
        
    
    rewards.append(score)

In [40]:
np.mean(rewards), np.std(rewards)

(-86.35, 20.33685078865457)

In [42]:
import time
for j in range(1):
    sequence = np.empty((201, 6))
    sequence[0] = env.reset()
    i = 0
    done = False
    score = 0
    while not done:
        
        i += 1
        features = seq_to_feats(sequence[0:i])
        #print(features)
        policy = int(float(crf.predict_single(features)[-1]))
        observation, reward, done, info = env.step(policy)
        sequence[i] = observation
        score += reward
        env.render()
        time.sleep(0.1)
       
        
    
env.close()

In [36]:
import pickle

pickle.dump(crf, open('crf_models/acrobot_no_adversarial.pkl', 'wb'))

In [37]:
for i in range(10):
    print(env.action_space.sample())

0
2
2
0
0
0
1
0
0
2


### Compare with basic logistic regression

In [59]:
from sklearn.linear_model import LogisticRegression

In [64]:
model = LogisticRegression(penalty='none', multi_class='multinomial', fit_intercept=True).fit(data[:,0:2], data[:,2])

In [65]:
scores = np.empty((100,))
for j in range(100):
    sequence = np.empty((200, 2))
    observation = env.reset().reshape(1, -1)
    done = False
    score = 0
    while not done:

        #features = seq_to_feats(sequence[0:i])
        policy = model.predict(observation).astype(int)[0]
        observation, reward, done, info = env.step(policy)
        observation = observation.reshape(1, -1)
        #sequence[i] = observation
        score += reward
#         env.render()
#         time.sleep(0.01)
        #i += 1
        
    
    scores[j] = score

In [66]:
env.close()

In [68]:

print('Logistic Regression score', np.mean(scores))
print('The less negative number is better')

Logistic Regression score -110.86
The less negative number is better


In [None]:
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data.pos, data.vel, hue=model.predict(data.drop(['action', 'reward'], axis=1)), palette=['red', 'blue'])

In [71]:
import pickle

pickle.dump(crf, open('crf_mountain_car_no_adversarial', 'wb'))