# Conditional Random Field implementation
## Using small number of demonstrations, no adversarial data
### Now using a discritivation of the state space so that I can represent interesting state features

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import gym
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


Bad key "text.kerning_factor" on line 4 in
/home/brendanjcrowe/anaconda3/envs/seq/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


### Load Expert Data 

In [20]:
from experts import Expert

ENV_NAME = 'CartPole-v0'
expert = Expert(ENV_NAME)
env = gym.make(ENV_NAME)

data, avg_reward, splits = expert.generate_data(num_episodes=100)
sequences = np.split(data, splits)[0:-1]
avg_reward

200.0

In [42]:
for i in range(4):
    
    print(np.min(data[:,i]), np.max(data[:,i]))

-0.8698980070012878 0.9011965660492729
-0.4299339916568309 0.4228662862299207
-0.04953431719218638 0.04912509795413926
-0.3782617947520103 0.39411458878306904


### Define feature function, and functions to transform data into correct for for crf

In [77]:
def feats(seq, k):
    
    return {
            'f11': np.format_float_positional(seq[k-1][0], 1, unique=False),
            'f12': np.format_float_positional(seq[k][0], 1, unique=False),
            'f21': np.format_float_positional(seq[k-1][1], 2, unique=False),
            'f22': np.format_float_positional(seq[k][1], 2, unique=False),
            'f31': np.format_float_positional(seq[k-1][2], 2, unique=False),
            'f32': np.format_float_positional(seq[k][2], 2, unique=False),
            'f41': np.format_float_positional(seq[k-1][3], 2, unique=False),
            'f42': np.format_float_positional(seq[k][3], 2, unique=False),
            'first': k == 0,
            #'last': k == len(seq)-1
            #'bias': True
    }

def labs(seq, k):
    
    return str(seq[k][4])
    
def seq_to_feats(seq):
    
    return [feats(seq, k) for k in range(len(seq))]

def seq_to_labs(seq):
    
    return [labs(seq, k) for k in range(len(seq))]

X_train = [seq_to_feats(seq) for seq in sequences]
y_train = [seq_to_labs(seq) for seq in sequences]


### Train CRF

In [78]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=25000,
    all_possible_transitions=True,
    all_possible_states=True,
    verbose=1,
    c2=0
)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 100/100 [00:00<00:00, 1108.79it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 1
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 672
Seconds required: 0.031

L-BFGS optimization
c1: 0.000000
c2: 0.000000
num_memories: 6
max_iterations: 25000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.01  loss=9442.20  active=652   feature_norm=1.00
Iter 2   time=0.01  loss=5921.02  active=672   feature_norm=2.38
Iter 3   time=0.01  loss=3413.21  active=672   feature_norm=4.39
Iter 4   time=0.00  loss=2482.27  active=672   feature_norm=6.34
Iter 5   time=0.00  loss=1994.08  active=672   feature_norm=8.61
Iter 6   time=0.00  loss=1735.19  active=672   feature_norm=9.32
Iter 7   time=0.00  loss=1563.41  active=672   feature_norm=10.51
Iter 8   time=0.00  loss=1380.17  active=672   feature_norm=12.45
Iter 9   time=0.00  loss=1151.33  active=672   feature_norm=15.60
Iter 10  ti

CRF(algorithm='lbfgs', all_possible_states=True, all_possible_transitions=True,
    c2=0, keep_tempfiles=None, max_iterations=25000, verbose=1)

In [79]:
y_hat = np.array(np.hstack([np.array(seq_hat, dtype=np.float64) for seq_hat in crf.predict(X_train)]))

### Examine how well the learned policy performs in the enviroment

In [80]:
rewards = []

for j in range(100):
    sequence = np.empty((201, 4))
    sequence[0] = env.reset()
    i = 0
    done = False
    score = 0
    while not done:
        
        i += 1
        features = seq_to_feats(sequence[0:i])
        #print(features)
        policy = int(float(crf.predict_single(features)[-1]))
        observation, reward, done, info = env.step(policy)
        sequence[i] = observation
        score += reward
#         env.render()
#         time.sleep(0.01)
       
        
    
    rewards.append(score)

In [81]:
np.mean(rewards), np.std(rewards)

(163.61, 56.48077460516986)

In [84]:
import pickle

pickle.dump(crf, open('crf_models/cart_pole_no_adversarial.pkl', 'wb'))

### Compare with basic logistic regression

In [59]:
from sklearn.linear_model import LogisticRegression

In [64]:
model = LogisticRegression(penalty='none', multi_class='multinomial', fit_intercept=True).fit(data[:,0:2], data[:,2])

In [65]:
scores = np.empty((100,))
for j in range(100):
    sequence = np.empty((200, 2))
    observation = env.reset().reshape(1, -1)
    done = False
    score = 0
    while not done:

        #features = seq_to_feats(sequence[0:i])
        policy = model.predict(observation).astype(int)[0]
        observation, reward, done, info = env.step(policy)
        observation = observation.reshape(1, -1)
        #sequence[i] = observation
        score += reward
#         env.render()
#         time.sleep(0.01)
        #i += 1
        
    
    scores[j] = score

In [66]:
env.close()

In [68]:

print('Logistic Regression score', np.mean(scores))
print('The less negative number is better')

Logistic Regression score -110.86
The less negative number is better


In [None]:
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data.pos, data.vel, hue=model.predict(data.drop(['action', 'reward'], axis=1)), palette=['red', 'blue'])

In [71]:
import pickle

pickle.dump(crf, open('crf_mountain_car_no_adversarial', 'wb'))