-
Notifications
You must be signed in to change notification settings - Fork 23
/
install-monitoring.sh
executable file
·111 lines (88 loc) · 6 KB
/
install-monitoring.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/bin/bash -i
#
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
#
# Source the AWS ParallelCluster profile. The cfn_* variables used throughout
# this script (cfn_cluster_user, cfn_postinstall_args, ...) are not defined
# here, so they are expected to come from this file.
. /etc/parallelcluster/cfnconfig

# Install Docker, start it now and enable it at boot, then add the cluster
# user to the docker group.
yum -y install docker
service docker start
chkconfig docker on
usermod -a -G docker "${cfn_cluster_user}"

# TODO: replace with `yum -y install docker-compose` once the repository
# problem is fixed.
# --fail keeps an HTTP error page from being saved as the docker-compose
# binary; the failure is reported but the script keeps going, as before.
docker_compose_url="https://github.com/docker/compose/releases/download/1.27.4/docker-compose-$(uname -s)-$(uname -m)"
if ! curl -fL "${docker_compose_url}" -o /usr/local/bin/docker-compose; then
  echo "ERROR: could not download docker-compose from ${docker_compose_url}" >&2
fi
chmod +x /usr/local/bin/docker-compose

# The second comma-separated post-install argument is used as the name of the
# monitoring directory inside the cluster user's home.
monitoring_dir_name=$(echo "${cfn_postinstall_args}" | cut -d ',' -f 2)
monitoring_home="/home/${cfn_cluster_user}/${monitoring_dir_name}"
#######################################
# Print the value found on the line(s) of a JSON file that contain "<key>".
# For every matching line the second whitespace-separated field is printed
# with its double quotes and trailing `",` removed. Prints nothing when the
# key is not present.
# Arguments:
#   $1 - key name, without quotes
#   $2 - file to read (optional, defaults to /etc/chef/dna.json)
# Outputs:
#   the extracted value(s) on stdout
#######################################
dna_value() {
  local key=$1
  local dna_file=${2:-/etc/chef/dna.json}
  grep "\"${key}\"" "${dna_file}" | awk '{print $2}' | sed "s/\",//g;s/\"//g"
}

# Node-type specific setup: the master node runs the full monitoring stack and
# the Slurm exporter, compute nodes only run the compute docker-compose stack.
case "${cfn_node_type}" in
  MasterServer)
    # Currently unused dna.json values, kept for reference:
    #cfn_efs=$(dna_value cfn_efs)
    #cfn_cluster_cw_logging_enabled=$(dna_value cfn_cluster_cw_logging_enabled)
    cfn_fsx_fs_id=$(dna_value cfn_fsx_fs_id)
    master_instance_id=$(ec2-metadata -i | awk '{print $2}')
    # NOTE(review): cfn_max_queue_size is not referenced again in this script.
    cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name "${stack_name}" --region "${cfn_region}" \
      | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue')
    # Strip the s3:// scheme and everything after the first remaining slash.
    s3_bucket=$(echo "${cfn_postinstall}" | sed "s/s3:\/\///g;s/\/.*//")
    cluster_s3_bucket=$(dna_value cluster_s3_bucket)
    cluster_config_s3_key=$(dna_value cluster_config_s3_key)
    cluster_config_version=$(dna_value cluster_config_version)
    # Slashes are pre-escaped because this value is used as a sed replacement.
    log_group_names="\/aws\/parallelcluster\/$(echo "${stack_name}" | cut -d "-" -f2-)"

    # Fetch the cluster configuration next to the monitoring files.
    aws s3api get-object \
      --bucket "${cluster_s3_bucket}" \
      --key "${cluster_config_s3_key}" \
      --region "${cfn_region}" \
      --version-id "${cluster_config_version}" \
      "${monitoring_home}/parallelcluster-setup/cluster-config.json"

    yum -y install golang-bin

    chown "${cfn_cluster_user}:${cfn_cluster_user}" -R "/home/${cfn_cluster_user}"
    chmod +x "${monitoring_home}"/custom-metrics/*

    cp -rp "${monitoring_home}"/custom-metrics/* /usr/local/bin/
    mv "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/

    # Append the cost-metric jobs to the cluster user's crontab. The two
    # entries are installed separately so that one does not depend on the other.
    # NOTE(review): "*/60" in the minute field presumably means "hourly";
    # confirm the installed cron accepts it.
    (crontab -l -u "${cfn_cluster_user}"; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u "${cfn_cluster_user}" -
    (crontab -l -u "${cfn_cluster_user}"; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u "${cfn_cluster_user}" -

    # Replace the placeholder tokens in the dashboards and configuration files.
    dashboards_dir="${monitoring_home}/grafana/dashboards"
    sed -i "s/_S3_BUCKET_/${s3_bucket}/g" "${dashboards_dir}/ParallelCluster.json"
    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${dashboards_dir}/ParallelCluster.json"
    sed -i "s/__FSX_ID__/${cfn_fsx_fs_id}/g" "${dashboards_dir}/ParallelCluster.json"
    sed -i "s/__AWS_REGION__/${cfn_region}/g" "${dashboards_dir}/ParallelCluster.json"

    sed -i "s/__AWS_REGION__/${cfn_region}/g" "${dashboards_dir}/logs.json"
    sed -i "s/__LOG_GROUP__NAMES__/${log_group_names}/g" "${dashboards_dir}/logs.json"

    sed -i "s/__Application__/${stack_name}/g" "${monitoring_home}/prometheus/prometheus.yml"

    for dashboard in master-node-details compute-node-list compute-node-details; do
      sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${dashboards_dir}/${dashboard}.json"
    done

    sed -i "s/__MONITORING_DIR__/${monitoring_dir_name}/g" "${monitoring_home}/docker-compose/docker-compose.master.yml"

    # Generate a self-signed certificate for Nginx over SSL; the value printed
    # by `ec2-metadata -p` is appended to the OpenSSL config as DNS.1.
    nginx_dir="${monitoring_home}/nginx"
    nginx_ssl_dir="${nginx_dir}/ssl"
    mkdir -p "${nginx_ssl_dir}"
    printf '\nDNS.1=%s\n' "$(ec2-metadata -p | awk '{print $2}')" >> "${nginx_dir}/openssl.cnf"
    openssl req -new -x509 -nodes -newkey rsa:4096 -days 3650 \
      -keyout "${nginx_ssl_dir}/nginx.key" \
      -out "${nginx_ssl_dir}/nginx.crt" \
      -config "${nginx_dir}/openssl.cnf"

    # Give ${cfn_cluster_user} ownership of the key and the certificate.
    chown -R "${cfn_cluster_user}:${cfn_cluster_user}" "${nginx_ssl_dir}/nginx.key"
    chown -R "${cfn_cluster_user}:${cfn_cluster_user}" "${nginx_ssl_dir}/nginx.crt"

    /usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig \
      -f "${monitoring_home}/docker-compose/docker-compose.master.yml" \
      -p monitoring-master up -d

    # Download and build prometheus-slurm-exporter
    ##### Please note this software package is under GPLv3 License #####
    # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
    exporter_src="${monitoring_home}/prometheus-slurm-exporter"
    git clone https://github.com/vpenso/prometheus-slurm-exporter.git "${exporter_src}"
    # Only build when we are really inside the source tree; otherwise `go build`
    # would run against whatever directory the script happens to be in.
    if cd "${exporter_src}"; then
      GOPATH=/root/go-modules-cache HOME=/root go mod download
      GOPATH=/root/go-modules-cache HOME=/root go build
      mv "${exporter_src}/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter
    else
      echo "ERROR: cannot enter ${exporter_src}; prometheus-slurm-exporter was not built" >&2
    fi

    systemctl daemon-reload
    systemctl enable slurm_exporter
    systemctl start slurm_exporter
    ;;

  ComputeFleet)
    compute_instance_type=$(ec2-metadata -t | awk '{print $2}')
    # Instance types matching this regex get the NVIDIA Docker runtime and the
    # GPU variant of the compute stack.
    gpu_instances='[pg][2-9].*\.[0-9]*[x]*large'
    if [[ ${compute_instance_type} =~ ${gpu_instances} ]]; then
      # The subshell keeps the os-release variables out of this script's scope.
      distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
      curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.repo" \
        | tee /etc/yum.repos.d/nvidia-docker.repo
      yum -y clean expire-cache
      yum -y install nvidia-docker2
      systemctl restart docker
      /usr/local/bin/docker-compose \
        -f "${monitoring_home}/docker-compose/docker-compose.compute.gpu.yml" \
        -p monitoring-compute up -d
    else
      /usr/local/bin/docker-compose \
        -f "${monitoring_home}/docker-compose/docker-compose.compute.yml" \
        -p monitoring-compute up -d
    fi
    ;;
esac