In [1]:
import os
import subprocess
import pandas as pd
import shutil
from paramiko import SSHClient, AutoAddPolicy


In [34]:
class BackendManager:
    """Run experiment commands on a remote host over SSH.

    Wraps a paramiko ``SSHClient`` for remote command execution and shells
    out to the local ``scp`` / ``ssh`` binaries for file copies and ad-hoc
    debugging.

    Return-code convention used by the command methods:
        0  -> success
        1  -> the remote command ran but reported failure
        -1 -> no SSH connection is open (``connect()`` was not called)
    """

    def __init__(self, server, uname, app):
        """
        server: host name of the backend login node
        uname:  user name used for ssh/scp
        app:    command prefix used to launch a runner script
                (e.g. "sbatch submit.sh")
        """
        self.server = server
        self.uname = uname
        self.app = app
        self.client = SSHClient()
        # NOTE(review): AutoAddPolicy silently trusts unknown host keys, which
        # leaves the connection open to man-in-the-middle attacks. Consider
        # load_system_host_keys() with the default RejectPolicy instead.
        self.client.set_missing_host_key_policy(AutoAddPolicy())
        self.connected = False

    def connect(self):
        """Open the SSH connection and mark the manager as connected."""
        self.client.connect(self.server, username=self.uname)
        self.connected = True

    def close(self):
        """Close the SSH connection and mark the manager as disconnected."""
        self.client.close()
        self.connected = False

    def generate_experiments(self, expression_dir, args, threads):
        """Run generate-variants-linnea.py inside ``expression_dir`` on the backend.

        Returns 0 if the last line of the remote output contains
        "Generated Variants", 1 otherwise (including when the command
        produced no output at all), and -1 when not connected.
        """
        if not self.connected:
            return -1

        # Paths are deliberately left unquoted so that a leading "~" is still
        # expanded by the remote shell.
        cmd = "source ~/.analyzer; "
        cmd += "cd {}; ".format(expression_dir)
        cmd += "python generate-variants-linnea.py {} --threads={};".format(
            " ".join(str(a) for a in args), threads)

        print(cmd)

        _, stdout, _ = self.client.exec_command(cmd)

        ret = stdout.readlines()
        print(ret)
        # Guard against empty output: indexing ret[-1] on an empty list
        # would raise IndexError instead of reporting a failure.
        if ret and "Generated Variants" in ret[-1]:
            return 0
        return 1

    def check_if_file_exists(self, file_path):
        """Return True/False depending on whether ``test -f file_path`` succeeds remotely.

        Returns -1 when not connected. Note that -1 is truthy, so callers
        should compare against it explicitly rather than rely on truthiness.
        """
        if not self.connected:
            return -1

        cmd = "test -f {};".format(file_path)
        _, stdout, _ = self.client.exec_command(cmd)
        return stdout.channel.recv_exit_status() == 0

    def run_experiments(self, runner_file):
        """Launch ``runner_file`` with ``self.app`` from the runner's own directory.

        Returns 0 when the launch command exits with status 0, 1 otherwise,
        and -1 when not connected.
        """
        if not self.connected:
            return -1

        args_dir, script = os.path.split(runner_file)

        cmd = "source ~/.analyzer; "
        cmd += "cd {}; ".format(args_dir)
        cmd += "{} {};".format(self.app, script)

        print(cmd)

        _, stdout, _ = self.client.exec_command(cmd)

        # Read the exit status once and reuse it for both the check and the message.
        exit_status = stdout.channel.recv_exit_status()
        if exit_status == 0:
            return 0
        print("Error: ", exit_status)
        return 1

    def check_slrum_status(self, jobname):
        """Look for ``jobname`` in the current user's squeue listing.

        Returns 2 if some squeue line contains ``jobname`` as a
        whitespace-separated field, 0 if no line does, and -1 when not
        connected (consistent with the other command methods).
        """
        if not self.connected:
            return -1

        cmd = "squeue --format=\"%.18i %.9P %.30j %.8u %.8T %.10M %.9l %.6D %R\" --me"
        _, stdout, _ = self.client.exec_command(cmd)
        ret = stdout.readlines()
        for line in ret:
            if jobname in line.split():
                print(line)
                return 2

        print(ret)
        return 0

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    check_slurm_status = check_slrum_status

    def copy_from_backend(self, backend_path, local_path):
        """Copy ``backend_path`` from the backend to ``local_path`` using scp.

        Returns 0 on success and 1 if scp fails or cannot be started; the
        error is printed rather than raised.
        """
        remote = '{uname}@{server}:{backend_path}'.format(uname=self.uname,
                                                          server=self.server,
                                                          backend_path=backend_path)
        # Build argv directly instead of splitting a formatted string, so
        # paths containing whitespace stay a single argument.
        call = ['scp', remote, str(local_path)]
        print(' '.join(call))
        try:
            ret = subprocess.check_output(call)
            print(ret)
            return 0
        except Exception as e:
            print(e)
            return 1

    def cancel_job(self, job_name):
        """Placeholder: cancelling a job is not implemented yet."""
        pass

    def debug_cmd(self, cmd):
        """Run ``cmd`` on the backend through the local ssh binary and print its output.

        Raises subprocess.CalledProcessError if the command exits non-zero.
        """
        call = ['ssh', '-l', self.uname, self.server, cmd]
        ret = subprocess.check_output(call)
        print(ret)
         

In [16]:
class Runner:
    """
    Handles the code generation and execution of the variant codes.
    The generated event data can be obtained as pandas dataframes.

    Requirements:

    It is assumed that ``expression_dir`` contains a script file
    "generate-variants-linnea.py" that generates variant codes for given
    operand sizes. The operand sizes are passed as command line args
        e.g. run of script file: python generate.py 10 10 10 10 12

    After running the script file, inside the folder "experiments", which is in
    the same directory as the script file, an "arguments folder" is generated,
    which contains the case_table, the event_meta_table (i.e. the event table
    without actual run times), and a runner script as shown in the example below:
        e.g. experiments/10_10_10_10_12/
                case_table.csv
                event_meta_table.csv
                runner.jl

    'runner.jl' is the script that runs the experiments and generates a log file
    'run_times.txt' (the event table with actual run times) in the arguments folder.

    USECASE:
    If the behavior of the script is as described in the requirements, this class
    can call the script file, collect the event logs as pandas dataframes, and,
    if needed, clean the generated folders.
    """

    def __init__(self, name, expression_dir, args, threads=4, backend=None):
        """
        INPUT:

        name: Experiment name
        expression_dir: Directory that contains "generate-variants-linnea.py"
        args: operand sizes (or arguments to the script file)
        threads: value passed to the script as --threads
        backend: optional object providing generate_experiments() and
                 run_experiments(); when None everything runs locally
        """
        self.name = name
        self.expression_dir = expression_dir
        self.threads = threads

        self.script_path = os.path.join(self.expression_dir, "generate-variants-linnea.py")
        # Normalise to a list of strings so ints/tuples work with both
        # "_".join(...) and the subprocess argument list.
        self.args = [str(a) for a in args]
        self.args_dir = os.path.join(self.expression_dir,
                                     "experiments",
                                     "_".join(self.args))

        self.backend = backend

    @property
    def exp_dir(self):
        """Alias of ``args_dir``.

        The table/clean methods used to read ``self.exp_dir``, which was never
        assigned; the alias keeps that name usable.
        """
        return self.args_dir

    def generate_experiments(self):
        """
        generates experiments for a given set of valid arguments
        that can be given as input to the script file.
            e.g. in,  python generate.py 10 10 10 10 12
            ['10','10','10','10','12'] would be the argument list.

        Output: Return code == 0 implies successful completion
        """
        if not self.backend:
            call = ["python", self.script_path] + self.args + ["--threads={}".format(self.threads)]
            completed_process = subprocess.run(call)
            return completed_process.returncode

        return self.backend.generate_experiments(self.expression_dir, self.args, self.threads)

    def run_experiments(self):
        """
        executes the runner file, which generates run_times.txt

        Output: 0 if the experiments ran locally / were launched in the
        backend, -1 otherwise.
        """
        runner_path = os.path.join(self.args_dir, "runner.jl")
        if not self.backend:
            if os.path.exists(self.args_dir):
                print("Running Experiments locally")
                completed_process = subprocess.run(["julia", runner_path])
                if completed_process.returncode == 0:
                    print("Experiments completed locally")
                    return 0  # Ran experiment
        else:
            ret = self.backend.run_experiments(runner_path)
            if ret == 0:
                print("Running experiments in the backend.")
                return 0

        return -1

    def _read_table(self, file_name):
        """Read ``file_name`` from the arguments folder; -1 if folder or file is missing."""
        if os.path.exists(self.args_dir):
            return self.read_log(os.path.join(self.args_dir, file_name))
        return -1

    def get_case_table(self):
        """get case table"""
        return self._read_table("case_table.csv")

    def get_event_meta_table(self):
        """get event table without actual execution times."""
        return self._read_table("event_meta_table.csv")

    def get_event_runtime_table(self):
        """get event table with actual execution times."""
        return self._read_table("run_times.txt")

    def get_all_tables(self, meta=True):
        """get all tables as (case_table, event_meta_table, event_runtime_table).

        event_meta_table is None when ``meta`` is False. Any table that
        cannot be read is returned as -1.
        """
        case_table = self.get_case_table()
        event_meta_table = None
        if meta:
            event_meta_table = self.get_event_meta_table()
        event_runtime_table = self.get_event_runtime_table()
        return (case_table, event_meta_table, event_runtime_table)

    def read_log(self, log_path):
        """Read a ';'-separated file into a dataframe; -1 if the file does not exist."""
        if os.path.exists(log_path):
            return pd.read_csv(log_path, sep=';')
        return -1

    def isGenerated(self):
        """True if the arguments folder exists locally."""
        return os.path.exists(self.args_dir)

    def isRun(self):
        """True if run_times.txt exists in the arguments folder."""
        return os.path.exists(os.path.join(self.args_dir, "run_times.txt"))

    def clean(self):
        """remove arguments folder; returns -1 if it does not exist (None otherwise)."""
        if os.path.exists(self.args_dir):
            shutil.rmtree(self.args_dir)
        else:
            return -1
            
            

In [17]:
class DataCollector:
    """Load experiment tables from a local directory, fetching them from a backend on demand.

    Tables are ';'-separated files. A table missing from ``local_data_dir``
    is copied from ``backend_data_dir`` via ``backend.copy_from_backend``
    when both a backend directory and a backend object are configured.
    """

    def __init__(self, local_data_dir, backend_data_dir=None, backend=None):
        """
        local_data_dir:   directory where tables are read from / copied to
        backend_data_dir: directory on the backend that holds the tables (optional)
        backend:          object providing copy_from_backend(src, dst) (optional)
        """
        self.local_data_dir = local_data_dir
        self.backend = backend
        self.backend_data_dir = backend_data_dir

    def read_log(self, log_path):
        """Read a ';'-separated file into a dataframe; -1 if the file does not exist."""
        if os.path.exists(log_path):
            return pd.read_csv(log_path, sep=';')
        return -1

    def get_table(self, table_name):
        """Return ``table_name`` as a dataframe, or -1 if it cannot be obtained.

        The local copy is preferred. Otherwise the file is fetched from the
        backend first. Without a backend object there is nothing to fetch
        with, so -1 is returned instead of failing on ``None``.
        """
        table_path = os.path.join(self.local_data_dir, table_name)
        if os.path.exists(table_path):
            return self.read_log(table_path)

        if self.backend_data_dir and self.backend is not None:
            backend_path = os.path.join(self.backend_data_dir, table_name)
            self.backend.copy_from_backend(backend_path, self.local_data_dir)
            # The copy reports failure via its return code; checking for the
            # file itself covers both outcomes.
            if os.path.exists(table_path):
                return self.read_log(table_path)

        return -1

    def get_case_table(self):
        """get case table"""
        return self.get_table("case_table.csv")

    def get_event_meta_table(self):
        """get event table without actual execution times."""
        return self.get_table("event_meta_table.csv")

    def get_all_runtimes_table(self):
        """get event table with actual execution times."""
        return self.get_table("run_times.txt")
      

In [41]:
# Operand sizes passed to the generator script; joined with "_" they also
# name the per-experiment arguments folder.
args = ["90","90","90","90","91"]
args_dir_name = "_".join(args)
# Command prefix the backend uses to launch the runner script.
app = "sbatch submit.sh"
# Host and user name are hard-coded here; connect() opens the SSH session
# that the backend-facing cells below rely on.
bm = BackendManager("login18-1.hpc.itc.rwth-aachen.de", "as641651", app)
bm.connect()

In [42]:
# Local folder that receives the tables copied from the backend for this
# set of operand sizes.
local_data_dir = "../Matrix-Chain-4/variants-linnea/cluster-experiments/{}".format(args_dir_name)
# exist_ok=True creates the folder only when missing, without a separate
# (racy) existence check.
os.makedirs(local_data_dir, exist_ok=True)
local_data_dir

'../Matrix-Chain-4/variants-linnea/cluster-experiments/90_90_90_90_91'

In [43]:
# Directory on the backend that contains generate-variants-linnea.py.
backend_exp_dir = "~/PhD/performance-analyazer/Experiment2/Matrix-Chain-4/variants-linnea"
# Single source for the thread count used by both the runner and the job name.
num_threads = 8
# Pass the directory defined above; the previous `exp_dir` name was never
# defined in this notebook and fails on a fresh kernel.
runner = Runner("Matrix-Chain", backend_exp_dir, args,
                threads=num_threads,
                backend=bm)

job_name = "{}_T{}".format(args_dir_name, num_threads)

In [44]:
# Tables are looked up in local_data_dir first and otherwise copied from the
# runner's arguments folder on the backend through bm.
data_collector = DataCollector(local_data_dir, runner.args_dir, bm)

In [45]:
# Returns a dataframe, or -1 when case_table.csv is neither available
# locally nor retrievable from the backend.
ct = data_collector.get_case_table()
ct

scp as641651@login18-1.hpc.itc.rwth-aachen.de:~/PhD/performance-analyazer/Experiment2/Matrix-Chain-4/variants-linnea/experiments/90_90_90_90_91/case_table.csv ../Matrix-Chain-4/variants-linnea/cluster-experiments/90_90_90_90_91


scp: /home/as641651/PhD/performance-analyazer/Experiment2/Matrix-Chain-4/variants-linnea/experiments/90_90_90_90_91/case_table.csv: No such file or directory


CalledProcessError: Command '['scp', 'as641651@login18-1.hpc.itc.rwth-aachen.de:~/PhD/performance-analyazer/Experiment2/Matrix-Chain-4/variants-linnea/experiments/90_90_90_90_91/case_table.csv', '../Matrix-Chain-4/variants-linnea/cluster-experiments/90_90_90_90_91']' returned non-zero exit status 1.

In [62]:
# Generate the variants; with a backend configured this delegates to
# BackendManager.generate_experiments. 0 means success.
ret = runner.generate_experiments()

source ~/.analyzer; cd ~/PhD/performance-analyazer/Experiment2/Matrix-Chain-4/variants-linnea; python generate-variants-linnea.py 90 90 90 90 90 --threads=8;
['New solution:.............4.37e+06\n', 'No further generation steps possible.\n', '----------------------------------\n', 'Number of nodes:                 8\n', 'Solution nodes:                  1\n', 'Data:                     4.05e+04\n', 'Best solution:            4.37e+06\n', 'Intensity:                     108\n', 'Number of algorithms:            6\n', 'Generated Variants.\n']


In [63]:
ret

0

In [73]:
# Launch runner.jl; returns 0 when the run/launch succeeded and -1 otherwise.
ret = runner.run_experiments()

source ~/.analyzer; cd ~/PhD/performance-analyazer/Experiment2/Matrix-Chain-4/variants-linnea/experiments/90_90_90_90_90; sbatch submit.sh runner.jl;
Running experiments in the backend.


In [74]:
ret

0

In [11]:
# Per check_slrum_status: 2 means a squeue line lists job_name, 0 means no
# line does (the raw squeue output is printed in that case).
status = bm.check_slrum_status(job_name)
status

['             JOBID PARTITION                           NAME     USER    STATE       TIME TIME_LIMI  NODES NODELIST(REASON)\n']


0

In [105]:
# NOTE(review): check_slrum_status as defined in this notebook returns an int
# status code, so indexing `status` looks like it depends on an earlier
# definition that returned the squeue lines -- confirm before re-running.
status[1].split()[2]

'90_90_90_90_90_T8'

In [31]:
# Unpack (case table, event meta table, event runtime table); any table that
# cannot be read comes back as -1.
ct,et,rt = runner.get_all_tables(meta=True)

In [32]:
ct

Unnamed: 0,case:concept:name,case:flops,case:num_kernels
0,algorithm1,14900000.0,3
1,algorithm5,29100000.0,3
2,algorithm4,29100000.0,3
3,algorithm0,2720000.0,3
4,algorithm3,18000000.0,3
5,algorithm2,17900000.0,3


In [33]:
et

Unnamed: 0,case:concept:name,concept:name,concept:flops,concept:kernel,concept:operation,timestamp:start
0,algorithm1,gemm_3.12e+05,312000.0,"gemm!('N', 'N', 1.0, ml1, ml2, 0.0, ml4)",tmp2 = (B C),2022-06-13 16:08:14.311588
1,algorithm1,gemm_2.86e+05,286000.0,"gemm!('N', 'N', 1.0, ml0, ml4, 0.0, ml5)",tmp4 = (A tmp2),2022-06-13 16:08:15.311588
2,algorithm1,gemm_1.43e+07,14300000.0,"gemm!('N', 'N', 1.0, ml5, ml3, 0.0, ml6)",tmp6 = (tmp4 D),2022-06-13 16:08:16.311588
3,algorithm5,gemm_2.64e+05,264000.0,"gemm!('N', 'N', 1.0, ml0, ml1, 0.0, ml4)",tmp1 = (A B),2022-06-13 16:14:54.311588
4,algorithm5,gemm_1.56e+07,15600000.0,"gemm!('N', 'N', 1.0, ml2, ml3, 0.0, ml5)",tmp3 = (C D),2022-06-13 16:14:55.311588
5,algorithm5,gemm_1.32e+07,13200000.0,"gemm!('N', 'N', 1.0, ml4, ml5, 0.0, ml6)",tmp6 = (tmp1 tmp3),2022-06-13 16:14:56.311588
6,algorithm4,gemm_1.56e+07,15600000.0,"gemm!('N', 'N', 1.0, ml2, ml3, 0.0, ml4)",tmp3 = (C D),2022-06-13 16:13:14.311588
7,algorithm4,gemm_2.64e+05,264000.0,"gemm!('N', 'N', 1.0, ml0, ml1, 0.0, ml5)",tmp1 = (A B),2022-06-13 16:13:15.311588
8,algorithm4,gemm_1.32e+07,13200000.0,"gemm!('N', 'N', 1.0, ml5, ml4, 0.0, ml6)",tmp6 = (tmp1 tmp3),2022-06-13 16:13:16.311588
9,algorithm0,gemm_3.12e+05,312000.0,"gemm!('N', 'N', 1.0, ml1, ml2, 0.0, ml4)",tmp2 = (B C),2022-06-13 16:06:34.311588


In [34]:
rt

Unnamed: 0,case:concept:name,concept:name,concept:flops,concept:operation,concept:kernel,timestamp:start,timestamp:end
0,algorithm1,gemm_3.12e+05,312000.0,tmp2 = (B C),"gemm!('N', 'N', 1.0, ml1, ml2, 0.0, ml4)",1655129000.0,1655129000.0
1,algorithm1,gemm_2.86e+05,286000.0,tmp4 = (A tmp2),"gemm!('N', 'N', 1.0, ml0, ml4, 0.0, ml5)",1655129000.0,1655129000.0
2,algorithm1,gemm_1.43e+07,14300000.0,tmp6 = (tmp4 D),"gemm!('N', 'N', 1.0, ml5, ml3, 0.0, ml6)",1655129000.0,1655129000.0
3,algorithm5,gemm_2.64e+05,264000.0,tmp1 = (A B),"gemm!('N', 'N', 1.0, ml0, ml1, 0.0, ml4)",1655129000.0,1655129000.0
4,algorithm5,gemm_1.56e+07,15600000.0,tmp3 = (C D),"gemm!('N', 'N', 1.0, ml2, ml3, 0.0, ml5)",1655129000.0,1655129000.0
5,algorithm5,gemm_1.32e+07,13200000.0,tmp6 = (tmp1 tmp3),"gemm!('N', 'N', 1.0, ml4, ml5, 0.0, ml6)",1655129000.0,1655129000.0
6,algorithm4,gemm_1.56e+07,15600000.0,tmp3 = (C D),"gemm!('N', 'N', 1.0, ml2, ml3, 0.0, ml4)",1655129000.0,1655129000.0
7,algorithm4,gemm_2.64e+05,264000.0,tmp1 = (A B),"gemm!('N', 'N', 1.0, ml0, ml1, 0.0, ml5)",1655129000.0,1655129000.0
8,algorithm4,gemm_1.32e+07,13200000.0,tmp6 = (tmp1 tmp3),"gemm!('N', 'N', 1.0, ml5, ml4, 0.0, ml6)",1655129000.0,1655129000.0
9,algorithm0,gemm_3.12e+05,312000.0,tmp2 = (B C),"gemm!('N', 'N', 1.0, ml1, ml2, 0.0, ml4)",1655129000.0,1655129000.0


In [35]:
# Remove the generated arguments folder for this experiment (Runner.clean).
runner.clean()

In [122]:
# Last path component of the arguments folder, i.e. the operand sizes joined by "_".
runner.args_dir.split('/')[-1]

'90_90_90_90_90'