In [1]:
import numpy as np
import pandas as pd
import random
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import csv

from __future__ import print_function
import argparse

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def v_wrap(np_array, dtype=np.float32):
    """Wrap a NumPy array as a torch tensor, casting to ``dtype`` if required.

    Args:
        np_array: NumPy array to convert.
        dtype: NumPy dtype the resulting tensor should carry
            (default ``np.float32``).

    Returns:
        A tensor created with ``torch.from_numpy``.  When ``np_array`` already
        has the requested dtype no copy is made, so the tensor shares memory
        with the input array.
    """
    needs_cast = np_array.dtype != dtype
    converted = np_array.astype(dtype) if needs_cast else np_array
    return torch.from_numpy(converted)


def normalized_columns_initializer(weights, std=1.0):
    """Draw a random tensor shaped like ``weights`` with fixed-norm rows.

    Gaussian noise is sampled and each slice along dim 1 (i.e. every row of a
    2-D weight matrix) is rescaled so that its L2 norm equals ``std``.

    Args:
        weights: Tensor whose ``size()`` determines the output shape; its
            values are not read.
        std: Target L2 norm of every row (default 1.0).

    Returns:
        A new tensor with the same shape as ``weights``.
    """
    noise = torch.randn(weights.size())
    row_norm = torch.sqrt(noise.pow(2).sum(1, keepdim=True))
    noise *= std / row_norm
    return noise


def weights_init(m):
    """Initialise Conv*/Linear* layers in place with a uniform Glorot-style bound.

    Intended for use with ``module.apply(weights_init)``.  The layer kind is
    detected from the class name: any class whose name contains ``'Conv'`` or
    ``'Linear'`` is initialised, every other module is left untouched.

    Weights are drawn from ``U(-b, b)`` with ``b = sqrt(6 / (fan_in + fan_out))``
    and the bias, when the layer has one, is zeroed.

    Args:
        m: The module to initialise (modified in place).
    """
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_shape = list(m.weight.data.size())
        # dims 1..3 of the weight times dims 2..3 * dim 0 give the two fans
        fan_in = np.prod(weight_shape[1:4])
        fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
    elif classname.find('Linear') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = weight_shape[1]
        fan_out = weight_shape[0]
    else:
        return

    w_bound = np.sqrt(6. / (fan_in + fan_out))
    m.weight.data.uniform_(-w_bound, w_bound)
    # Layers created with bias=False expose ``bias`` as None; skip them instead
    # of failing on ``None.data``.
    if getattr(m, 'bias', None) is not None:
        m.bias.data.fill_(0)


In [3]:
class job_shop_env():
    """Dispatching environment in which a pool of experts processes work orders.

    Every call to :meth:`step` draws one candidate job per expert; the binary
    ``action`` vector decides, per expert, whether the candidate is accepted.
    Accepted jobs are queued on the expert and finish once their elapsed time
    equals the corresponding entry of the process-time matrix.

    The observation (``self.state``) has shape ``(2, n_jobs, 1)``: row 0 is
    ``job_status`` (1 until a job has been assigned, then 0) and row 1 is
    ``job_distribute_time`` (how often a job was drawn as a candidate).
    """

    # NOTE(review): absolute, machine-specific data location; both CSV files are
    # read once, when the class body is executed.
    path = '/workspaces/learnings/Job_Shop_Scheduling_Problem_with_Reinforcement_Learning/data/'
    # Process-time matrix: one row per expert, one column per job cluster.
    # The first row of the file is dropped (presumably a header row - confirm).
    expert_job = pd.read_csv(path + 'process_time_matrix.csv',header=None).drop([0]).values
    # Work orders, one row per job.  step() adds column 1 to the dispatch step to
    # form a start time and uses column 2 as the column index into the matrix.
    job = pd.read_csv(path + 'work_order.csv',header=None).values

    def __init__(self):
        """Build all bookkeeping structures for a fresh environment."""
        self.job_cluster = self.expert_job.shape[1]
        self.expert = self.expert_job.shape[0]
        self.job_num = self.job.shape[0]
        self.process_time = self.expert_job
        # number of jobs currently queued on each expert (step() caps it at 3)
        self.expert_status = np.repeat(0,self.expert)
        # per expert: ids of the jobs in progress and their elapsed step counts
        self.expert_process_job = [[] for i in range(self.expert)]
        self.expert_process_time = [[] for i in range(self.expert)]
        # per expert: dispatch step of every job ever assigned (append-only)
        self.job_waiting_time = [[] for i in range(self.expert)]
        # per expert: dispatch step of each job still in progress; kept aligned
        # index-for-index with expert_process_job / expert_process_time
        self._active_assign_time = [[] for i in range(self.expert)]
        self.left_job = self.job.shape[0]
        self.done = False
        self.total_time = 0
        self.job_distribute_time = np.repeat(0,self.job.shape[0])
        self.total_job_process_time = np.repeat(0,self.job.shape[0])
        self.job_status = np.repeat(1,self.job.shape[0])
        # ids of the jobs that may still be dispatched (shrunk by update())
        self.job_index = list(range(self.job.shape[0]))
        self.timeindex = 0
        self.state = np.vstack((self.job_status,self.job_distribute_time))
        self.state = self.state.reshape(self.state.shape[0],self.state.shape[1],1)
        self.done_job = []
        self.done_expert = []
        self.job_start_time = []
        self.state_dim = self.state.shape[0]
        self.action_dim = 2

    def reset(self):
        """Clear the per-episode bookkeeping and return the initial state.

        ``left_job`` and ``timeindex`` are not reset here, so the number of
        outstanding jobs and the step counter carry over between episodes.
        Note that ``job_index`` is rebuilt to contain every job.

        Returns:
            The state array of shape ``(2, n_jobs, 1)``.
        """
        self.job_num = self.job.shape[0]
        self.expert_status = np.repeat(0,self.expert)
        self.expert_process_job = [[] for i in range(self.expert)]
        self.expert_process_time = [[] for i in range(self.expert)]
        self.job_waiting_time = [[] for i in range(self.expert)]
        self._active_assign_time = [[] for i in range(self.expert)]
        #self.left_job = self.job.shape[0]
        self.done = False
        self.total_time = 0
        self.job_distribute_time = np.repeat(0,self.job.shape[0])
        self.total_job_process_time = np.repeat(0,self.job.shape[0])
        self.job_status = np.repeat(1,self.job.shape[0])
        self.job_index = list(range(self.job.shape[0]))
        #self.timeindex = 0
        self.state = np.vstack((self.job_status,self.job_distribute_time))
        self.state = self.state.reshape(self.state.shape[0],self.state.shape[1],1)
        self.done_job = []
        self.done_expert = []
        self.job_start_time = []

        return self.state

    def step(self, action):
        """Advance the simulation by one time step.

        Args:
            action: Array with one entry per expert; ``0`` means "do not hand
                the drawn job to this expert", any other value accepts it
                (unless the expert already holds 3 jobs).

        Returns:
            Tuple ``(state, reward, done, done_job, done_expert,
            job_start_time)`` where the last three are the lists of finished
            job ids, the experts that finished them and their start times.

        Raises:
            AssertionError: if ``action`` does not have one entry per expert.
        """
        # Draw one distinct candidate job per expert.
        job_id = np.random.choice(a=self.job_num, size=self.expert, replace=False, p=None)
        for i in job_id:
            if len(self.job_index) != 0:
                if i in self.job_index:
                    self.job_distribute_time[i] += 1
                    ## if more than 2, delete this job
                    #if self.job_distribute_time[i] >= 2:
                    #    del self.job_index[self.job_index.index(i)]
                else:
                    # candidate is no longer dispatchable: swap in a random
                    # job that still is
                    job_id[job_id.tolist().index(i)] = random.sample(self.job_index,1)[0]

        assert action.shape[0] == self.expert

        for i in range(self.expert):
            ## only process those jobs that are in job_index
            if job_id[i] in self.job_index:
                ## action = 0 indicates do not give jobs to the expert
                if action[i] == 0 or self.expert_status[i] == 3:
                    pass
                else:
                    self.expert_process_job[i].append(job_id[i])
                    self.expert_status[i] += 1
                    self.job_status[job_id[i]] = 0
                    self.expert_process_time[i].append(0)
                    # step at which the job was dispatched
                    self.job_waiting_time[i].append(self.timeindex)
                    self._active_assign_time[i].append(self.timeindex)
                    self.total_job_process_time[job_id[i]] = self.process_time[i][self.job[job_id[i]][2]]

                delete_index = []
                for j in range(len(self.expert_process_time[i])):
                    if len(self.expert_process_job[i]) != 0:
                        if self.expert_process_time[i][j] == self.process_time[i][self.job[self.expert_process_job[i][j]][2]]:
                            # if job finished, workload of expert would decrease
                            self.expert_status[i] -= 1
                            self.done_expert.append(i)
                            if self.expert_process_job[i][j] not in self.done_job:
                                self.left_job -= 1
                            self.done_job.append(self.expert_process_job[i][j])
                            # Use the dispatch step of *this* job.  Indexing
                            # job_waiting_time with j is wrong once an earlier
                            # job of the expert has been removed, because that
                            # list is never shortened.
                            self.job_start_time.append(self._active_assign_time[i][j] + self.job[self.expert_process_job[i][j]][1])
                            delete_index.append(j)
                # delete back-to-front so the remaining indices stay valid
                for k in reversed(delete_index):
                    del self.expert_process_job[i][k]
                    del self.expert_process_time[i][k]
                    del self._active_assign_time[i][k]
            ## calculate total time consumed
            # NOTE(review): total_job_process_time is indexed per job but is
            # read here with the expert index i - confirm this is intended.
            self.total_time += sum(self.job_waiting_time[i]) + self.total_job_process_time[i].sum()
            self.expert_process_time[i] = [m + 1 for m in self.expert_process_time[i]]

        ## reward is the fraction of jobs that are no longer outstanding
        #print(self.total_time)
        reward = 1 - self.left_job/self.job_num
        self.timeindex += 1

        ## update state info
        self.state = np.vstack((self.job_status,self.job_distribute_time))
        self.state = self.state.reshape(self.state.shape[0],self.state.shape[1],1)

        if self.left_job == 0:
            self.done = True
        #print(self.expert_status)
        #print(self.expert_process_job)
        #print(self.done_job)
        return self.state, reward, self.done, self.done_job, self.done_expert, self.job_start_time

    def update(self,delete_list):
        """Remove the given job ids from the set of dispatchable jobs.

        Args:
            delete_list: Iterable of job ids; ids that are not currently in
                ``job_index`` are ignored.
        """
        if len(delete_list) != 0:
            for i in delete_list:
                if i in self.job_index:
                    self.job_index.remove(i)

In [4]:

class ActorCritic(torch.nn.Module):
    """Convolutional actor-critic network with an LSTM cell.

    Four stride-2 3x3 convolutions feed a 256-unit ``LSTMCell`` whose hidden
    state drives a scalar value head and an ``action_space``-way policy head.
    """

    def __init__(self, num_inputs, action_space, lstm_input_size=32 * 553):
        """Create and initialise the network.

        Args:
            num_inputs: Number of input channels of the state tensor.
            action_space: Number of discrete actions (policy logits).
            lstm_input_size: Flattened size of the last convolution output,
                i.e. ``32 * H' * W'``.  The default ``32 * 553`` keeps the
                previously hard-coded value.
        """
        super(ActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)

        self.lstm = nn.LSTMCell(lstm_input_size, 256)

        num_outputs = action_space
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, num_outputs)

        self.apply(weights_init)
        # Re-initialise both heads with fixed-norm rows: small for the policy,
        # unit norm for the value estimate.
        self.actor_linear.weight.data = normalized_columns_initializer(
            self.actor_linear.weight.data, 0.01)
        self.actor_linear.bias.data.fill_(0)
        self.critic_linear.weight.data = normalized_columns_initializer(
            self.critic_linear.weight.data, 1.0)
        self.critic_linear.bias.data.fill_(0)

        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)

        self.train()

    def forward(self, inputs):
        """Run one step of the network.

        Args:
            inputs: Tuple ``(state, (hx, cx))`` with a batched 4-D state tensor
                and the LSTM hidden/cell state.

        Returns:
            Tuple ``(value, logits, (hx, cx))`` with the updated LSTM state.
        """
        inputs, (hx, cx) = inputs
        x = F.relu(self.conv1(inputs))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))

        # Flatten to whatever width the LSTM cell was built with, so the size
        # is defined in exactly one place.
        x = x.view(-1, self.lstm.input_size)
        hx, cx = self.lstm(x, (hx, cx))
        x = hx

        return self.critic_linear(x), self.actor_linear(x), (hx, cx)

    def choose_action(self,inputs,action_dim):
        """Sample ``action_dim`` independent actions from the current policy.

        Args:
            inputs: Tuple ``(state, (hx, cx))``; ``state`` is unbatched and is
                given a leading batch dimension here.
            action_dim: Number of actions to draw (one per expert in the
                training loop).

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``action`` is an
            int64 tensor of shape ``(1, action_dim)`` and ``log_prob`` holds
            the log-probabilities of all actions.  The updated LSTM state
            produced by the forward pass is not returned.
        """
        s, (hx, cx) = inputs
        value, logit, (hx, cx) = self.forward((s.unsqueeze(0),(hx, cx)))
        prob = F.softmax(logit, dim=-1)
        log_prob = F.log_softmax(logit, dim=-1)
        entropy = -(log_prob * prob).sum(1, keepdim=True)

        # One multinomial draw per requested action, all from the same
        # distribution; shaped by action_dim rather than a fixed expert count.
        samples = [prob.multinomial(num_samples=1).detach()[0]
                   for _ in range(action_dim)]
        action = torch.cat(samples).view(1, action_dim)
        return action, log_prob, entropy, value

In [5]:

def _write_schedule_csv(filename, complete_jobs, expert_complete_job, complete_job_start_time):
    """Write one CSV row per finished job: 1-based job id, 1-based expert id, start time."""
    # newline='' is what the csv module requires for files handed to csv.writer
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        for jobs, experts, start_times in zip(complete_jobs, expert_complete_job, complete_job_start_time):
            for job, expert, start_time in zip(jobs, experts, start_times):
                writer.writerow([job+1, expert+1, start_time])


def train(args):
    """Train the actor-critic scheduler on the job-shop environment.

    Each outer iteration ("episode") rolls out at most ``args.num_steps + 1``
    environment steps, then performs one actor-critic update with Generalized
    Advantage Estimation.  Jobs finished in earlier roll-outs are removed from
    the environment's dispatchable set before the next roll-out starts.

    Results are written to ``submit_<n>.csv`` once more than 8800 distinct jobs
    have been recorded, and to ``submit.csv`` when the last episode is reached
    or every job has been recorded; training stops after the latter.

    Args:
        args: Namespace providing ``seed``, ``lr``, ``episode``, ``num_steps``,
            ``max_episode_length``, ``gamma``, ``gae_lambda``,
            ``entropy_coef``, ``value_loss_coef`` and ``max_grad_norm``.
    """
    torch.manual_seed(args.seed)

    env = job_shop_env()

    model = ActorCritic(env.state_dim, env.action_dim)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = v_wrap(state)
    done = True
    action_dim = env.expert

    # Number of distinct finished jobs above which intermediate results are saved.
    near_complete_jobs = 8800

    episode_length = 0
    complete_jobs = []
    expert_complete_job = []
    complete_job_start_time = []
    update_list = []
    for episode in range(args.episode):

        # NOTE(review): choose_action() does not hand back the LSTM state, so
        # hx/cx are never advanced by the roll-out below - confirm intended.
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        # Jobs finished in previous roll-outs must not be dispatched again.
        if len(complete_jobs) != 0:
            update_list = [n for m in complete_jobs for n in m]
            env.update(update_list)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps+1):
            episode_length += 1


            action, log_prob, entropy, value = model.choose_action((state, (hx,cx)),action_dim)
            # keep only the log-probability of the action drawn for each expert
            log_prob = log_prob.gather(1, action)[0]

            state, reward, done, done_job, done_expert, job_start_time = env.step(action.view(-1,).numpy())
            done = done or episode_length >= args.max_episode_length
            ## reward shaping
            reward = max(min(reward, 1), -1)
            if episode_length % 20 == 0:
                print(reward)
                #print(done_job)

            if done:
                complete_jobs.append(done_job)
                expert_complete_job.append(done_expert)
                complete_job_start_time.append(job_start_time)
                print('Complete these jobs with 100 iterations:')
                print(complete_jobs)
                print('Current episode:',episode)
                episode_length = 0
                state = env.reset()

            state = v_wrap(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)
            if done:
                break

        # update_list was built at the start of this episode, so the count
        # reflects the jobs recorded before the current roll-out.
        finished_count = len(set(update_list))

        if finished_count > near_complete_jobs:
            ## write results into the csv file
            _write_schedule_csv('submit_{}.csv'.format(finished_count),
                                complete_jobs, expert_complete_job, complete_job_start_time)

        if episode == args.episode -1 or finished_count == env.job_num:
            ## write results into the csv file
            _write_schedule_csv('submit.csv',
                                complete_jobs, expert_complete_job, complete_job_start_time)
            break

        # Bootstrap from the value estimate when the roll-out was cut short.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        # The combined loss has one entry per expert; backward() with a tensor
        # of ones sums the gradients over those entries.
        (policy_loss + args.value_loss_coef * value_loss).backward(torch.ones_like(policy_loss))
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        optimizer.step()
        print(policy_loss.mean() + args.value_loss_coef * value_loss)
        print('para updated')

In [16]:
# Command-line options for the training run; every option has a default, so
# the parser can also be driven programmatically (e.g. parser.parse_args([])).
parser = argparse.ArgumentParser(description='JSSPRL')
parser.add_argument('--lr', type=float, default=0.0001,
                    help='learning rate (default: 0.0001)')
parser.add_argument('--gamma', type=float, default=0.99,
                    help='discount factor for rewards (default: 0.99)')
parser.add_argument('--gae-lambda', type=float, default=1.00,
                    help='lambda parameter for GAE (default: 1.00)')
parser.add_argument('--entropy-coef', type=float, default=0.01,
                    help='entropy term coefficient (default: 0.01)')
parser.add_argument('--value-loss-coef', type=float, default=0.5,
                    help='value loss coefficient (default: 0.5)')
# The help text used to repeat the value-loss description; this option is the
# threshold passed to gradient-norm clipping.
parser.add_argument('--max-grad-norm', type=float, default=50,
                    help='maximum gradient norm for clipping (default: 50)')
parser.add_argument('--seed', type=int, default=1,
                    help='random seed (default: 1)')
parser.add_argument('--num-steps', type=int, default=20,
                    help='number of forward steps in A3C (default: 20)')
parser.add_argument('--max-episode-length', type=int, default=1000000,
                    help='maximum length of an episode (default: 1000000)')
parser.add_argument('--episode', type=int, default=10,
                    help='How many episode to train the RL algorithm')

#args = parser.parse_args()
print('start training...')
#train(args)

#if __name__ == '__main__':
#    
#    args = parser.parse_args()
#    print('start training...')
#    train(args)
    

start training...


In [15]:
! python /workspaces/learnings/Job_Shop_Scheduling_Problem_with_Reinforcement_Learning/run.py --lr=0.01 --gamma=0.9 --seed=2020 --num-steps=100 --max-episode-length=100

start training...
0.0007918552036199067
0.002149321266968318
0.0039592760180995334
0.005429864253393646
0.006674208144796356
Complete these jobs with 100 iterations:
[[6219, 5586, 2234, 523, 2880, 234, 8754, 7305, 6849, 2604, 905, 1755, 2391, 8805, 7689, 8345, 8697, 7339, 753, 4237, 3726, 2836, 6105, 5473, 2992, 4027, 5873, 5095, 824, 8073, 70, 5023, 5112, 3839, 6896, 5691, 1058, 1499, 2583, 5103, 8235, 1869, 1702, 2698, 3499, 2708, 8190, 7080, 854, 2095, 4585, 4241, 8182, 4209, 1353, 2196, 7177, 3339, 8680]]
Current episode: 0
tensor([[1.3647]], grad_fn=<AddBackward0>)
para updated
0.007692307692307665
0.009728506787330282
0.011312217194570096
0.012443438914027105
0.013687782805429816
Complete these jobs with 100 iterations:
[[6219, 5586, 2234, 523, 2880, 234, 8754, 7305, 6849, 2604, 905, 1755, 2391, 8805, 7689, 8345, 8697, 7339, 753, 4237, 3726, 2836, 6105, 5473, 2992, 4027, 5873, 5095, 824, 8073, 70, 5023, 5112, 3839, 6896, 5691, 1058, 1499, 2583, 5103, 8235, 1869, 1702, 2698, 3499,