In [None]:
#This code accepts a design matrix as an input and conducts three-stage, simulated wargames using the VPG algorithm
from spinup import ppo_pytorch as ppo, trpo_tf1 as trpo, vpg_pytorch as vpg
from spinup.utils.run_utils import call_experiment
from spinup.utils import test_policy

import torch
import gym
import tensorflow as tf
import pandas

import copy
import itertools
import timeit
import os

#This function takes output from the simulations, summarizes as appropriate for the research goals, and outputs to file.
#Actual simulation code starts after this function
def _append_rows(frame, path):
    """Append *frame* to the CSV at *path*, writing the header row only when the file is created."""
    frame.to_csv(path, mode="a", index=False, header=not os.path.exists(path))


def _summarize_by_side(encounters, Run_Nbr, Stage, nbr_red, nbr_blue, initial_strength):
    """Aggregate per-encounter rows into one row per winning side, with totals and percentages."""
    summary = encounters.groupby('Side', as_index=False).agg(
        #Counting a never-null column gives the number of rows (= battles won) per side
        Total_Battles_Won=('Total_Agents_Remaining', 'count'),
        Total_Agents_Remaining=('Total_Agents_Remaining', 'sum'),
        Total_Strength_Remaining=('Total_Strength_Remaining', 'sum'),
    )
    summary['Total_Strength_Remaining'] = summary['Total_Strength_Remaining'].round(5)

    #Every percentage is relative to the total number of battles that had a winner
    battles_with_winner = summary['Total_Battles_Won'].sum()
    summary['Pct_Battles_Won'] = (summary['Total_Battles_Won']/battles_with_winner).round(5)

    #Agents fielded per battle by each side; sides other than Blue/Red get NaN percentages
    fielded = summary['Side'].map({'Blue': nbr_blue, 'Red': nbr_red})

    #Pct_Agents_Remaining = Total agents remaining/(team size * nbr of battles with a winner)
    summary['Pct_Agents_Remaining'] = (
        summary['Total_Agents_Remaining']/(fielded*battles_with_winner)).round(5)

    #Pct_Strength_Remaining = Total strength remaining/(team size * initial strength * nbr of battles with a winner)
    summary['Pct_Strength_Remaining'] = (
        summary['Total_Strength_Remaining']/(fielded*initial_strength*battles_with_winner)).round(5)

    summary.insert(0, "Run_Nbr", Run_Nbr)
    summary.insert(1, "Stage", Stage)
    return summary


def get_encounter_results(
    Run_Nbr,
    Stage,
    learning_rate,
    train_dur,
    hidden_lrs,
    combat_mdl,
    nbr_red,
    nbr_blue,
    map_wide,
    alpha,
    initial_strength=150):
    """Summarize BASE_OUTPUT_DIR/encounter_results.txt and append the results to the summary CSVs.

    Each line of encounter_results.txt is read as: the winning side followed by up to
    max(nbr_red, nbr_blue) comma-separated strength values, one per remaining agent.
    If the file does not exist the function does nothing. Otherwise it:

    * appends one row per encounter to Encounter_Results_Summary_RAW.csv,
    * appends one row per winning side to Encounter_Results_Summary.csv,
    * when Stage == 3, appends the 'Red' row of that summary (if any) to
      Encounter_Results_Summary_Red_Stage3_Wins.csv,
    * deletes encounter_results.txt.

    Each CSV is created with a header row the first time it is written.

    Args:
        Run_Nbr: run identifier written to the first column of every output row.
        Stage: stage identifier written to the second column; 3 triggers the Red-wins file.
        learning_rate, train_dur, hidden_lrs, combat_mdl, map_wide, alpha:
            accepted for call compatibility; not used by this function.
        nbr_red: number of red agents fielded per battle.
        nbr_blue: number of blue agents fielded per battle.
        initial_strength: starting strength of every agent (default 150), used as the
            denominator for Pct_Strength_Remaining.
    """
    results_path = os.path.join(BASE_OUTPUT_DIR, 'encounter_results.txt')
    if not os.path.exists(results_path):
        return

    #int() so that numpy/float team sizes taken from a DataFrame still work with range()
    max_agents = int(max(nbr_red, nbr_blue))
    headers = list(range(max_agents + 1))  #column 0 = side, columns 1..max_agents = strengths
    encounters = pandas.read_csv(results_path, sep=',', index_col=False, names=headers)
    encounters.rename(columns={0: 'Side'}, inplace=True)
    encounters['Side'] = encounters['Side'].str.capitalize()  #The side that won the encounter

    #The slice must run through column max_agents inclusive; stopping at max_agents
    #would drop the last agent whenever the larger team has all its agents remaining.
    strengths = encounters.iloc[:, 1:max_agents + 1]
    agents_remaining = strengths.count(axis=1)          #Nbr of agents remaining in the winning side
    strength_remaining = strengths.sum(axis=1).round(5)  #Total strength remaining in the winning side

    per_encounter = encounters[['Side']].copy()
    per_encounter['Total_Agents_Remaining'] = agents_remaining
    per_encounter['Total_Strength_Remaining'] = strength_remaining
    per_encounter.insert(0, "Run_Nbr", Run_Nbr)
    per_encounter.insert(1, "Stage", Stage)
    _append_rows(per_encounter, os.path.join(BASE_OUTPUT_DIR, 'Encounter_Results_Summary_RAW.csv'))

    summary = _summarize_by_side(per_encounter, Run_Nbr, Stage, nbr_red, nbr_blue, initial_strength)
    _append_rows(summary, os.path.join(BASE_OUTPUT_DIR, 'Encounter_Results_Summary.csv'))

    if Stage == 3:
        #Only red's Stage 3 performance is kept here. The file (header only) is still
        #created when red won nothing, so later code can tell that Stage 3 ran.
        red_wins = summary.loc[summary['Side'] == 'Red']
        _append_rows(red_wins, os.path.join(BASE_OUTPUT_DIR, 'Encounter_Results_Summary_Red_Stage3_Wins.csv'))

    #Remove the raw log so the next stage starts from a clean file
    os.remove(results_path)

#Simulation starts below
'''
##########################################################################################
START     START     START     START     START     START     START     START     START
'''      
            
# ---- Experiment identity and reproducibility ----
exp_name = 'Training Red Team Using Deep Reinforcement Learning'
SEED = 1

# ---- Fixed simulation settings ----
# Activation function used by every neural network in the experiment
activation_fct = torch.nn.ReLU
# Map edge each team starts from
BLUE_START_DIR = 'S'
RED_START_DIR = 'N'

# ---- Input design matrix and output directory ----
BASE_OUTPUT_DIR = '/home/alanubuntu/Downloads/spinningup/data/'
DM_file = 'VPG Two Factor Interaction Design.csv'  # Design matrix for the experiment
DM_cols = [
    "Map Width",
    "Red Team Size",
    "Blue Team Size",
    "Learning Factor",
    "Learning Rate",
    "Training Duration",
    "NN Structure",
    "Blue Alpha",
    "Combat Model",
]
Design_Matrix = pandas.read_csv(BASE_OUTPUT_DIR + 'Design Matrix/' + DM_file)
# Keep only the factor columns listed in DM_cols that are present in the file
Design_Matrix = Design_Matrix[Design_Matrix.columns.intersection(DM_cols)]

# ---- Accumulator for per-epoch timing data from the first two stages ----
Training_Data_Summary_Raw = pandas.DataFrame(columns=['Run_Nbr', 'Stage', 'Epoch', 'Time'])

# Visual separator at the start of the experiment output
for _ in range(3):
    print('##########################################################################################')

# ---- Iteration counters and timers ----
total_iterations = len(Design_Matrix.index)  # one simulation run per design-matrix row
current_iteration = 0
total_time = 0.0
complete_time_start = timeit.default_timer()

#For every combination of variables in the design matrix, conduct a three-stage simulation:
#  Stage 1: train red against a stationary blue,
#  Stage 2: train red against a blue that plays the Stage 1 model,
#  Stage 3: evaluate (no further training) against a blue that acts randomly with probability alpha.
for row in range(Design_Matrix.shape[0]):
    run_params = Design_Matrix.iloc[row] #Look the row up once instead of once per column
    map_wide = run_params["Map Width"] #if map_wide=9, then python will create 10x10 map (ie (0...9)x(0...9))
    nbr_red = run_params["Red Team Size"]
    nbr_blue = run_params["Blue Team Size"]
    discount_factor = run_params["Learning Factor"]
    learning_rate = run_params["Learning Rate"]
    train_dur = run_params["Training Duration"]
    #NOTE(security): eval executes whatever expression the design-matrix cell contains.
    #Only run this script on design matrices you trust; ast.literal_eval would be a
    #safer choice if the cells only ever hold literal tuples/lists.
    hidden_lrs = eval(run_params["NN Structure"])
    alpha = run_params["Blue Alpha"] #Blue takes random action in Stage 3 with probability alpha.
    combat_mdl = run_params["Combat Model"].lower()

    #############################################
    #############################################
    start = timeit.default_timer()
    current_iteration += 1
    print('***********************')
    print('***********************')
    print('Iteration #{} out of {}'.format(current_iteration,total_iterations))
    print(map_wide, nbr_red, nbr_blue, discount_factor, learning_rate, train_dur, hidden_lrs, alpha, combat_mdl)
    #total_time only covers the runs that have finished, i.e. current_iteration-1 of them,
    #so the average must be taken over that count (and is 0 before the first run finishes).
    completed_runs = current_iteration-1
    avg_time = total_time/completed_runs if completed_runs else 0.0
    runs_remaining = total_iterations-completed_runs #includes the run that is about to start
    est_remaining = avg_time*runs_remaining
    print('Avg. time per run = {}. Estimated total time remaining = {} seconds = {} minutes = {} hours'.format(avg_time, est_remaining, est_remaining/60, est_remaining/(60*60)))

    #Initial training of red to defeat blue while blue is stationary.
    #The roles will subsequently be reversed and this learned strategy will be taken as a proxy for
    #blue's real-world strategy.
    env_fn = lambda : gym.make('WARGAME-SIM-v0',
                               NBR_RED_AGENTS = nbr_red,
                               NBR_BLUE_AGENTS = nbr_blue,
                               MAP_WIDTH = map_wide,
                               BLUE_ALPHA = 0, #blue will not move during initial training, so no random actions
                               BLUE_ACTION = "STATIONARY",
                               BLUE_START_DIR = BLUE_START_DIR,
                               RED_START_DIR = RED_START_DIR,
                               COMBAT_MODEL = combat_mdl
                              )
    INITIAL_OUTPUT_DIR_VPG = BASE_OUTPUT_DIR+'TEST_VPG1'

    print("Stage 1")
    #Start every stage from a clean encounter log
    if os.path.exists(BASE_OUTPUT_DIR+'encounter_results.txt'):
        os.remove(BASE_OUTPUT_DIR+'encounter_results.txt')
    vpg(
            env_fn = env_fn,
            logger_kwargs = dict(output_dir=INITIAL_OUTPUT_DIR_VPG, exp_name=exp_name),
            ac_kwargs = dict(hidden_sizes=hidden_lrs, activation=activation_fct),
            pi_lr = learning_rate,
            gamma = discount_factor,
            epochs = train_dur,
            seed = SEED
            )
    get_encounter_results(current_iteration,1,learning_rate,train_dur,hidden_lrs,combat_mdl,nbr_red,nbr_blue,map_wide,alpha)

    #############################################
    #############################################
    #Now blue will act according to the model trained earlier and red will be trained to defeat blue.
    #Since blue is actually using red's strategy for movement, the starting positions of red and blue are reversed from the previous stage.
    env_fn = lambda : gym.make('WARGAME-SIM-v0',
                               NBR_RED_AGENTS = nbr_red,
                               NBR_BLUE_AGENTS = nbr_blue,
                               MAP_WIDTH = map_wide,
                               BLUE_ALPHA = 0, #Blue will move according to model below. No random actions
                               BLUE_ACTION = INITIAL_OUTPUT_DIR_VPG + '/pyt_save/model.pt',
                               BLUE_START_DIR = RED_START_DIR,
                               RED_START_DIR = BLUE_START_DIR,
                               COMBAT_MODEL = combat_mdl
                              )
    SECOND_OUTPUT_DIR_VPG = BASE_OUTPUT_DIR+'TEST_VPG2'
    print("Stage 2")
    if os.path.exists(BASE_OUTPUT_DIR+'encounter_results.txt'):
        os.remove(BASE_OUTPUT_DIR+'encounter_results.txt')
    vpg(
            env_fn = env_fn,
            logger_kwargs = dict(output_dir=SECOND_OUTPUT_DIR_VPG, exp_name=exp_name),
            ac_kwargs = dict(hidden_sizes=hidden_lrs, activation=activation_fct),
            pi_lr = learning_rate,
            gamma = discount_factor,
            epochs = train_dur,
            seed = SEED
            )
    get_encounter_results(current_iteration,2,learning_rate,train_dur,hidden_lrs,combat_mdl,nbr_red,nbr_blue,map_wide,alpha)

    #############################################
    #############################################
    #Now red has been trained to defeat blue under a fixed strategy (i.e. the model trained in Stage 1).
    #Collect the per-epoch "Time" values logged to progress.txt by each training stage.
    df_VPG1 = pandas.read_csv(INITIAL_OUTPUT_DIR_VPG+'/progress.txt',sep='\t')
    df_VPG1.insert(0, "Run_Nbr",current_iteration)
    df_VPG1.insert(1, "Stage",1)
    df_VPG2 = pandas.read_csv(SECOND_OUTPUT_DIR_VPG+'/progress.txt',sep='\t')
    df_VPG2.insert(0, "Run_Nbr",current_iteration)
    df_VPG2.insert(1, "Stage",2)

    PRE_FINAL=pandas.concat( [df_VPG1[["Run_Nbr","Stage","Epoch", "Time"]] , df_VPG2[["Run_Nbr","Stage","Epoch", "Time"]] ] )

    Training_Data_Summary_Raw=pandas.concat([Training_Data_Summary_Raw,PRE_FINAL],ignore_index=True)
    Training_Data_Summary_Raw["Time"] = Training_Data_Summary_Raw["Time"].round(5)

    #############################################
    #############################################
    #Now we want to test the trained red team's generalizability/ability to cope with new blue strategies.
    #We do this by adding increasingly random movements to blue's previously-learned strategy

    #Get the last environment, including last blue strategy and last-trained red strategy
    env, get_action = test_policy.load_policy_and_env(fpath=SECOND_OUTPUT_DIR_VPG)
    env_f=copy.deepcopy(env)
    #Assign blue a new strategy within that environment and a new probability of taking random action
    env_f.env.BLUE_ACTION= SECOND_OUTPUT_DIR_VPG+'/pyt_save/model.pt'#'RANDOM'
    env_f.env.BLUE_ALPHA = alpha #Will take random action with probability BLUE_ALPHA. 0=never, 1=always

    #Test red's ability to cope with blue's modified strategy
    print("Stage 3")
    if os.path.exists(BASE_OUTPUT_DIR+'encounter_results.txt'):
        os.remove(BASE_OUTPUT_DIR+'encounter_results.txt')
    nbr_of_encounters=100 #The number of "battles" to let play out -without further updating parameters- between the trained red team and the blue under the BLUE_ACTION policy
    test_policy.run_policy(env_f, get_action, 0, nbr_of_encounters, False) #Change to True to visualize in real-time

    get_encounter_results(current_iteration,3,learning_rate,train_dur,hidden_lrs,combat_mdl,nbr_red,nbr_blue,map_wide,alpha)

    #Accumulate wall-clock time of completed runs for the ETA estimate above
    total_time += timeit.default_timer()-start
    
#************************************************
#************************************************
#************************************************
#************************************************
#From here we summarize output data generated by ALL simulations

#Mean of the logged "Time" values per (run, stage)
interim1=Training_Data_Summary_Raw.groupby(['Run_Nbr','Stage'],as_index=False).agg(
    Average_Time=('Time', 'mean')
)
interim1['Average_Time'] = interim1['Average_Time'].round(5)

#Mean and sum of the logged "Time" values per run (Stages 1 and 2 combined)
interim2=Training_Data_Summary_Raw.groupby(['Run_Nbr'],as_index=False).agg(
    Average_Time=('Time', 'mean') ,
    Total_Time=('Time', 'sum')
)
interim2['Average_Time'] = interim2['Average_Time'].round(5)
interim2['Total_Time'] = interim2['Total_Time'].round(5)

Training_Data_Summary_Raw.to_csv(BASE_OUTPUT_DIR+'Training_Data_Summary_RAW.csv', index=False)
interim1.to_csv(BASE_OUTPUT_DIR+'Training_Data_Summary_By_Stage.csv', index=False)
interim2.to_csv(BASE_OUTPUT_DIR+'Training_Data_Summary_By_Run_Nbr.csv', index=False)

#Run numbers are 1-based row positions, matching current_iteration in the simulation loop
Design_Matrix.insert(0, "Run_Nbr",Design_Matrix.index+1)
Design_Matrix=Design_Matrix.merge(interim2[['Run_Nbr','Average_Time']], on='Run_Nbr',how='left')

#The Stage 3 wins file is only created once Stage 3 produced encounter results.
#If it is absent, write the completed design matrix with training time only.
stage3_wins_file = 'Encounter_Results_Summary_Red_Stage3_Wins.csv'
if not os.path.exists(BASE_OUTPUT_DIR+stage3_wins_file):
    print(stage3_wins_file + " does not exist. There were no successful battles in Stage 3.")
    Design_Matrix = Design_Matrix.rename(columns={'Average_Time': 'Training Time'})
    cols_to_round = ['Training Time']
    Design_Matrix[cols_to_round] = Design_Matrix[cols_to_round].round(2)
    Design_Matrix.to_csv(BASE_OUTPUT_DIR + DM_file.replace(".csv", "") + ' Completed_NoStg3.csv', index=False)
else:
    red_encounter_data=pandas.read_csv(BASE_OUTPUT_DIR+stage3_wins_file)
    #The wins file is append-only, so rows from an earlier execution may still be present.
    #Keep only the most recently appended row per run so the merge cannot duplicate
    #design-matrix rows. NOTE(review): this does not remove a stale row for a run that had
    #no red wins this time; clear the output directory between experiments to be safe.
    red_encounter_data=red_encounter_data.drop_duplicates(subset='Run_Nbr', keep='last')
    Design_Matrix=Design_Matrix.merge(red_encounter_data[['Run_Nbr','Pct_Battles_Won','Pct_Agents_Remaining','Pct_Strength_Remaining']], on='Run_Nbr',how='left')
    Design_Matrix = Design_Matrix.rename(columns={'Average_Time': 'Training Time', 'Pct_Battles_Won': 'Win Rate'})
    cols_to_round = ['Training Time','Win Rate','Pct_Agents_Remaining','Pct_Strength_Remaining']
    Design_Matrix[cols_to_round] = Design_Matrix[cols_to_round].round(2)
    Design_Matrix.to_csv(BASE_OUTPUT_DIR + DM_file.replace(".csv", "") + ' Completed.csv', index=False)

#Wall-clock time for the whole experiment, including this summary step
total_time_required=timeit.default_timer()-complete_time_start
print('TOTAL TIME REQUIRED: {} seconds = {} minutes = {} hours'.format( total_time_required, total_time_required/60, total_time_required/(60*60)))

#Got rid of all the stdout to screen, except what is output here.
#I commented out printing in: ppo.py, logx.py, test_policy.py in downloads/spinningup and logger.py in gym