# Conditional Random Field implementation
## Using small number of demonstrations, no adversarial data
### Now using a discritivation of the state space so that I can represent interesting state features

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import gym
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

### Load Expert Data 

In [35]:
from experts import Expert

ENV_NAME = 'LunarLander-v2'
expert = Expert(ENV_NAME)
env = gym.make(ENV_NAME)

data, reward, splits = expert.generate_data(num_episodes=100)
sequences = np.split(data, splits)[0:-1]
np.mean(reward), np.std(reward)

(274.71594742940664, 31.97296759593901)

In [36]:
np.unique(data[:,8])

array([0., 1., 2., 3.])

In [37]:
for i in range(8):
    
    print(np.min(data[:,i]), np.max(data[:,i]))

-0.48627200722694397 0.467172235250473
-0.09392815828323364 1.5227824449539185
-0.8054076433181763 0.8034216165542603
-1.148044228553772 0.5004852414131165
-0.6523207426071167 0.46380317211151123
-1.4410159587860107 1.3921678066253662
0.0 1.0
0.0 1.0


In [38]:
env.reset()

array([ 1.1082649e-03,  1.4118699e+00,  1.1224973e-01,  4.2207070e-02,
       -1.2775118e-03, -2.5426269e-02,  0.0000000e+00,  0.0000000e+00],
      dtype=float32)

### Define feature function, and functions to transform data into correct for for crf

In [47]:
def feats(seq, k):
    
    return {
        'f11': np.format_float_positional(seq[k-1][0], 2, unique=False),
        'f12': np.format_float_positional(seq[k][0], 2, unique=False),
        'f21': np.format_float_positional(seq[k-1][1], 2, unique=False),
        'f22': np.format_float_positional(seq[k][1], 2, unique=False),
        'f31': np.format_float_positional(seq[k-1][2], 2, unique=False),
        'f32': np.format_float_positional(seq[k][2], 2, unique=False),
        'f41': np.format_float_positional(seq[k-1][3], 2, unique=False),
        'f42': np.format_float_positional(seq[k][3], 2, unique=False),
        'f51': np.format_float_positional(seq[k-1][4], 2, unique=False),
        'f52': np.format_float_positional(seq[k][4], 2, unique=False),
        'f61': np.format_float_positional(seq[k-1][5], 1, unique=False),
        'f62': np.format_float_positional(seq[k][5], 1, unique=False),
        'f71': seq[k-1][6],
        'f72': seq[k][6],
        'f71': seq[k-1][7],
        'f72': seq[k][7],
        'first': k == 0,
        'last': k == len(seq)-1,
        'bias': True
    }

def labs(seq, k):
    
    return str(seq[k][8])
    
def seq_to_feats(seq):
    
    return [feats(seq, k) for k in range(len(seq))]

def seq_to_labs(seq):
    
    return [labs(seq, k) for k in range(len(seq))]

X_train = [seq_to_feats(seq) for seq in sequences]
y_train = [seq_to_labs(seq) for seq in sequences]


### Train CRF

In [48]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=25000,
    all_possible_transitions=True,
    all_possible_states=True,
    verbose=0,
    c2=0
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=True, all_possible_transitions=True,
    c2=0, keep_tempfiles=None, max_iterations=25000, verbose=0)

In [49]:
y_hat = np.array(np.hstack([np.array(seq_hat, dtype=np.float64) for seq_hat in crf.predict(X_train)]))

### Examine how well the learned policy performs in the enviroment

In [45]:
rewards = []

for j in range(100):
    sequence = []
    sequence.append(env.reset())
    i = 0
    done = False
    score = 0
    while not done:
        
        i += 1
        features = seq_to_feats(sequence[0:i])
        #print(features)
        policy = int(float(crf.predict_single(features)[-1]))
        observation, reward, done, info = env.step(policy)
        sequence.append(observation)
        score += reward
#         env.render()
#         time.sleep(0.01)
       
        
    
    rewards.append(score)

In [46]:
np.mean(rewards), np.std(rewards)

(194.38713321461356, 166.94966421357068)

In [33]:
import time
for j in range(10):
    sequence = []
    sequence.append(env.reset())
    i = 0
    done = False
    score = 0
    while not done:
        
        i += 1
        features = seq_to_feats(sequence[0:i])
        #print(features)
        policy = int(float(crf.predict_single(features)[-1]))
        observation, reward, done, info = env.step(policy)
        sequence.append(observation)
        #print(reward)
        env.render()
        time.sleep(0.01)
       
        
    
env.close()

2.2381338263113846
1.8531008083888878
1.8067958171762655
1.730909166934964
1.7101668304250097
1.6644569081003056
1.5844828443312053
1.4578036565962975
1.2693714533564844
1.0045933733251786
0.656728606794502
0.2384580714821425
-0.21201758033947726
-0.6420888648902121
-1.0084051833038643
-1.2925189257188094
-1.4975752892205207
-1.6372452893426725
-1.726943819579077
-1.7797161775448274
-1.8056011648912147
-1.8119055666760744
-1.6612753947148622
-1.9977619932529365
-2.055599364751629
-1.9302173867931174
-1.8898735363179924
-1.8462034171253947
-1.7999217205002367
-1.7515697515517274
-1.671206752526955
-1.874318447709527
-1.820725951065782
-1.7663684411189422
-1.7113296126405544
-1.6557531829848813
-1.5997282826895116
-1.5432813737048718
-1.486538126215578
-1.429502869086491
-1.372237726198108
-1.314761441622153
-1.257145919503273
-1.1993680374045539
-1.1414691970065007
-1.0834910070800845
-1.0254336809410631
-0.9673302709812504
-0.9091891035254207
-0.851016901056795
-0.7928443807174688
-0.7

In [None]:
import pickle

pickle.dump(crf, open('crf_models/acrobot_no_adversarial.pkl', 'wb'))

In [None]:
for i in range(10):
    print(env.action_space.sample())

### Compare with basic logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression(penalty='none', multi_class='multinomial', fit_intercept=True).fit(data[:,0:2], data[:,2])

In [None]:
scores = np.empty((100,))
for j in range(1):
    sequence = []
    observation = env.reset().reshape(1, -1)
    done = False
    score = 0
    while not done:

        features = seq_to_feats(sequence[0:i])
        policy = model.predict(observation).astype(int)[0]
        observation, reward, done, info = env.step(policy)
        observation = observation.reshape(1, -1)
        sequence[i] = observation
        print(reward)
#         env.render()
#         time.sleep(0.01)
        #i += 1
        
    
    scores[j] = score

In [None]:
env.close()

In [None]:

print('Logistic Regression score', np.mean(scores))
print('The less negative number is better')

In [None]:
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(data.pos, data.vel, hue=model.predict(data.drop(['action', 'reward'], axis=1)), palette=['red', 'blue'])

In [None]:
import pickle

pickle.dump(crf, open('crf_mountain_car_no_adversarial', 'wb'))