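[Source article: "A Reinforcement Learning Based Inventory Control Policy for Retailers" (Towards Data Science)](https://towardsdatascience.com/a-reinforcement-learning-based-inventory-control-policy-for-retailers-ac35bc592278)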

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch

from create_historical_demand import create_demand_history
from agent import Agent
from environment import InvOptEnv

from ss_profit_calc import profit_calculation_s_s

## Train the agent

In [None]:
# Module-level agent that dqn() drives through agent.act / agent.step.
# NOTE(review): state_size / action_size presumably must match InvOptEnv's
# state vector length and number of discrete actions — confirm in environment.py.
agent = Agent(state_size=7, action_size=21, seed=42)

In [None]:
def dqn(env, n_episodes=1000, max_t=10000, eps_start=1.0, eps_end=0.01, eps_decay=0.995, dqn_agent=None):
  '''Train an agent with Deep Q-Learning using epsilon-greedy exploration.

  Params
  ======
    env: environment exposing ``reset() -> state`` and
      ``step(action) -> (next_state, reward, done)``
    n_episodes (int): maximum number of training episodes
    max_t (int): maximum number of timesteps per episode
    eps_start (float): starting value of epsilon, for epsilon-greedy action selection
    eps_end (float): minimum value of epsilon
    eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    dqn_agent: optional agent exposing ``act(state, eps)`` and
      ``step(state, action, reward, next_state, done)``; when omitted the
      module-level ``agent`` is used

  Returns
  =======
    list: cumulative reward of each episode, in episode order. Episodes that
      hit ``max_t`` before the environment reports ``done`` are included.
  '''
  # Fall back to the notebook-global agent so existing calls keep working.
  learner = agent if dqn_agent is None else dqn_agent
  width = len(str(n_episodes))  # zero-pad episode numbers to a common width
  scores = []  # cumulative reward per episode
  eps = eps_start
  for i_episode in range(1, n_episodes+1):
    state = env.reset()
    score = 0
    for _ in range(max_t):
      action = learner.act(state, eps)
      next_state, reward, done = env.step(action)
      # Hand the transition to the agent. Per the original author's note the
      # agent stores it in a replay buffer and learns once enough samples are
      # available — NOTE(review): confirm against agent.py.
      learner.step(state, action, reward, next_state, done)
      state = next_state
      score += reward
      if done:
        break
    # Report and record outside the step loop so that an episode truncated at
    # max_t is not silently dropped from the results.
    print('episode ' + str(i_episode).zfill(width) + ':', score)
    scores.append(score)
    eps = max(eps*eps_decay, eps_end)  # decay epsilon, bounded below by eps_end
  return scores

In [None]:
# Build the demand series that is later passed to InvOptEnv and to
# profit_calculation_s_s, then draw a histogram of it.
demand_history = create_demand_history()
plt.hist(demand_history)

In [None]:
# Wrap the demand history in the inventory environment, then run the DQN
# training loop on it; `scores` holds the per-episode rewards dqn() returns.
env = InvOptEnv(demand_history)
scores = dqn(env)

In [None]:
# Learning curve: cumulative reward of each training episode, by episode index.
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Reward')
plt.xlabel('Episode #')
plt.show()

In [None]:
# Persist the weights (state_dict) of the agent's local Q-network to the
# working directory.
torch.save(agent.qnetwork_local.state_dict(), './agent_qnetwork_local_state_dict.pt')

In [None]:
# Brute-force baseline: evaluate every [s, S] pair with 0 <= s < S on the same
# demand history (presumably the classic (s, S) policy: reorder point s,
# order-up-to level S). S runs up to 60 to give a little room for S to exceed
# the capacity.
s_s_list = [[s, S] for S in range(1, 61) for s in range(S)]

profit_sS_list = [
  profit_calculation_s_s(s, S, demand_history) for s, S in s_s_list
]

# Highest profit found and the [s, S] pair achieving it (first one on ties).
best_sS = s_s_list[np.argmax(profit_sS_list)]
best_sS_profit = np.max(profit_sS_list)