Permalink
Browse files

Added support for VM restart based on custom metrics in availability p…

…olicies (Issue #11)
  • Loading branch information...
1 parent 4119cec commit e955c1e4d4498ef3979c3e413b772f4b01bba36a Khalid Ahmed committed Mar 5, 2013
Showing with 256 additions and 13 deletions.
  1. +1 −1 bin/svccontroller.sh
  2. +131 −0 context/mysqlsetup.sh
  3. +24 −0 etc/config.rb
  4. +3 −3 etc/system.conf
  5. +36 −8 lib/policy_manager.rb
  6. +60 −0 plugins/MySqlHealth.rb
  7. +1 −1 templates/ubuntu1104.vm
View
@@ -190,7 +190,7 @@ createmaster() {
fi
# Create the master
- sed -e "s/%ENVID%/$ENVID/g; s/%SERVICE_NAME%/$SERVICE_NAME/g; s/%CARINA_IP%/$CARINA_IP/g; s/%NAME%/$MASTER_VM_NAME/g; s/%NETWORK_ID%/$MASTER_NETWORK_ID/g; s/%IMAGE_ID%/$MASTER_IMAGE_ID/g; s/%CPU%/$MASTER_NUM_CPUS/g; s/%MEMORY%/$MASTER_MEMORY/g; s/%MASTER_SERVICE_PORT%/$MASTER_SERVICE_PORT/g; s/%APP_CONTEXT_SCRIPT%/$MASTER_CONTEXT_SCRIPT/g; s/%APP_CONTEXT_VAR%/$MASTER_CONTEXT_VAR/g " $MASTER_TEMPLATE > $MASTER_TEMPLATE.tmp.$$
+ sed -e "s/%ENVID%/$ENVID/g; s/%SERVICE_NAME%/$SERVICE_NAME/g; s/%CARINA_IP%/$CARINA_IP/g; s/%NAME%/$MASTER_VM_NAME/g; s/%NETWORK_ID%/$MASTER_NETWORK_ID/g; s/%IMAGE_ID%/$MASTER_IMAGE_ID/g; s/%CPU%/$MASTER_NUM_CPUS/g; s/%MEMORY%/$MASTER_MEMORY/g; s/%MASTER_SERVICE_PORT%/$MASTER_SERVICE_PORT/g; s/%APP_CONTEXT_SCRIPT%/$MASTER_CONTEXT_SCRIPT/g; s/%APP_CONTEXT_VAR%/$MASTER_CONTEXT_VAR/g; s/%ADMIN_USER%/$ADMINUSER/g " $MASTER_TEMPLATE > $MASTER_TEMPLATE.tmp.$$
insertvmtemplatevars $MASTER_TEMPLATE.tmp.$$
View
@@ -0,0 +1,131 @@
#!/bin/bash
#
# mysqlsetup.sh - context script that prepares and starts MySQL on a VM.
#
# Sources /mnt/context.sh and then reads these variables:
#   NFSDATA, LOGMNT, CONFIGMNT, TRANLOG  - NFS exports written to /etc/fstab
#   HOSTNAME, IP                         - per-VM directory name and bind address
#   DBPWD                                - MySQL root password to set on first boot
#   CARINA_IP, SERVICE_NAME, VMID, ENVID - used to report status back
# NOTE(review): which of these are supplied by context.sh is not visible
# from this script - confirm against the VM template.
#
# Flow:
#   1. Tune two TCP settings and register four NFS mounts in /etc/fstab
#      (only once, guarded by a marker comment in fstab).
#   2. Run "mount -a"; report MASTER_INIT_FAIL on failure and stop if any of
#      the four required mount points is not mounted.
#   3. First boot for this HOSTNAME: move MySQL config and data onto the
#      mounted volumes, start mysqld and set the root password.
#      Later boots: re-link /etc/mysql to the saved config and start mysqld.
#   4. Report MASTER_INIT_DONE.

. /mnt/context.sh

data_mnt=/db/mysql_data
binlog_mnt=/db/mysql_binlog
config_mnt=/db/mysql_config
tranlog_mnt=/db/mysql_tranlog

host_data="$data_mnt/$HOSTNAME"
host_binlog="$binlog_mnt/$HOSTNAME"
host_config="$config_mnt/$HOSTNAME"
host_tranlog="$tranlog_mnt/$HOSTNAME"

# Report an application status string ($1) to the updateappstatus.sh CGI.
# The response body is discarded instead of being saved in the working dir.
report_status() {
    wget -q -O /dev/null "http://$CARINA_IP/cgi-bin/updateappstatus.sh?service=$SERVICE_NAME&vmid=$VMID&envid=$ENVID&status=$1" 2> /dev/null
}

# Point /etc/mysql at the per-host configuration kept on the config volume.
link_mysql_config() {
    local entry
    for entry in my.cnf debian-start debian.cnf conf.d; do
        ln -s "$host_config/$entry" "/etc/mysql/$entry"
    done
}

# Make sure the mysqld run directory exists and remove the upstart override
# file if one is present.
prepare_mysql_runtime() {
    if [ ! -d /var/run/mysqld ]; then
        mkdir /var/run/mysqld
        chown mysql:mysql /var/run/mysqld
    fi

    if [ -f /etc/init/mysql.override ]; then
        rm /etc/init/mysql.override
    fi
}

## setting the network for NFS
echo 0 > /proc/sys/net/ipv4/tcp_sack
echo 0 > /proc/sys/net/ipv4/tcp_timestamps

# Add entries to fstab (skipped when the marker comment is already there)
if ! grep -q "NFS Mount Points Added" /etc/fstab; then

    # Create mount points
    mkdir -p "$data_mnt" "$binlog_mnt" "$config_mnt" "$tranlog_mnt"

    cat >>/etc/fstab <<MNTTAB

# NFS Mount Points Added
#

$NFSDATA /db/mysql_data nfs _netdev,rw,bg,hard,rsize=32768,wsize=32768,nointr,timeo=600,proto=tcp,nolock,noatime,tcp,vers=3
$LOGMNT /db/mysql_binlog nfs _netdev,rw,bg,hard,rsize=32768,wsize=32768,nointr,timeo=600,proto=tcp,nolock,noatime,tcp,vers=3
$CONFIGMNT /db/mysql_config nfs _netdev,rw,bg,hard,rsize=32768,wsize=32768,nointr,timeo=600,proto=tcp,nolock,noatime,tcp,vers=3
$TRANLOG /db/mysql_tranlog nfs _netdev,rw,bg,hard,rsize=32768,wsize=32768,nointr,timeo=600,proto=tcp,nolock,noatime,tcp,vers=3
MNTTAB

fi

if ! mount -a; then
    echo "Failed to mount all filesystems listed in /etc/fstab" >&2
    required_missing=0
    if ! mountpoint -q "$data_mnt"; then
        echo "Failed to mount $NFSDATA" >&2
        required_missing=1
    fi
    if ! mountpoint -q "$binlog_mnt"; then
        echo "Failed to mount $LOGMNT" >&2
        required_missing=1
    fi
    if ! mountpoint -q "$config_mnt"; then
        echo "Failed to mount $CONFIGMNT" >&2
        required_missing=1
    fi
    if ! mountpoint -q "$tranlog_mnt"; then
        echo "Failed to mount $TRANLOG" >&2
        required_missing=1
    fi
    report_status MASTER_INIT_FAIL
    # Without the volumes the steps below would set MySQL up on the local
    # disk and then overwrite the failure status with MASTER_INIT_DONE.
    if [ "$required_missing" -ne 0 ]; then
        exit 1
    fi
fi

if [ ! -d "$host_config" ]; then
    # First boot for this host name: build the per-host directories.
    # -p so a partially completed earlier run does not abort the setup.
    mkdir -p "$host_data" "$host_binlog" "$host_config" "$host_tranlog"

    # Move the stock configuration to the config volume and replace my.cnf
    # with the one shipped in the context image.
    rm -f /etc/mysql/my.cnf
    mv /etc/mysql/* "$host_config/"
    cp /mnt/my.cnf "$host_config/"

    {
        echo "slow_query_log_file = $host_tranlog/mysql-slow.log"
        echo "log_error = $host_tranlog/mysql-error.log"
        echo "log_bin = $host_binlog/master-bin"
        echo "log-bin-index = $host_binlog/master-bin"
        echo "general_log_file = $host_tranlog/mysql.log"
        echo "datadir = $host_data/mysql"
        echo "bind-address = $IP"
    } >> "$host_config/my.cnf"

    link_mysql_config

    # Copy the mysql files and drop the copied InnoDB log files.
    cp -R /var/lib/mysql "$host_data/mysql"
    rm -f "$host_data/mysql/ib_logfile0" "$host_data/mysql/ib_logfile1"

    # Setup permissions
    chown -R mysql:mysql "$host_data"
    chown -R mysql:mysql "$host_binlog"
    chown -R mysql:mysql "$host_config"
    chown -R mysql:mysql "$host_tranlog"

    prepare_mysql_runtime

    # start up mysql
    /etc/init.d/mysql start

    # set the root password
    # NOTE(review): the password is passed on the command line and is
    # visible in the process list while mysqladmin runs.
    /usr/bin/mysqladmin -u root password "$DBPWD"
else
    # Per-host configuration already exists: discard whatever is in
    # /etc/mysql and link back to the saved files.
    rm -rf /etc/mysql/*
    link_mysql_config

    prepare_mysql_runtime

    # start up mysqld
    /etc/init.d/mysql start
fi

report_status MASTER_INIT_DONE
View
@@ -337,4 +337,28 @@
:dependencies => { 'tomcat_ha_passive' => ['tomcat_ha_active']}
},
+ 'mysql-ha' => {
+ :type => "data",
+ :endpoint => "mm01",
+ :description => "Standalone MySQL node",
+ :master_template => "mysql",
+ :master_context_script => "mysqlsetup.sh",
+ :master_setup_time => 30,
+ :master_context_var => "\"NFSDATA=1.1.1.1:\\/vol\\/data1, LOGMNT=1.1.1.1:\\/vol\\/data2, CONFIGMNT=1.1.1.1:\\/vol\\/data3, TRANLOG=1.1.1.1:\\/vol\\/data4\"",
+ :slave_template => "mysql",
+ :slave_context_script => "mysqlsetup.sh",
+ :slave_context_var => "\"NFSDATA=1.1.1.1:\\/vol\\/data1, LOGMNT=1.1.1.1:\\/vol\\/data2, CONFIGMNT=1.1.1.1:\\/vol\\/data3, TRANLOG=1.1.1.1:\\/vol\\/data4\"",
+ :placement_policy => "pack",
+ :availability_policy => {
+ :period => 5,
+ :metric_plugins => ['VMStatus', 'MySqlHealth'],
+ :recreate_expr => "m.mysqlHealth < 1 || m.nummissing > 0"
+ },
+ :num_slaves => 0,
+ :slavedata => "8080",
+ :adminuser => "carina",
+ :app_url => "jdbc://%MASTER%/"
+ },
+
+
}
View
@@ -2,7 +2,7 @@
# DB_PASS - Password for 'root' user in MySQL
# GS_REDIS_IP - IP address of Redis server used to communicate between oneenvd and oneenvd-gs
# GS_REDIS_PORT - Redis server port number
-DB_HOST=10.135.41.108
-DB_PASS=root
-GS_REDIS_IP=10.135.41.144
+DB_HOST=127.0.0.1
+DB_PASS=
+GS_REDIS_IP=127.0.0.1
GS_REDIS_PORT=6379
View
@@ -247,6 +247,17 @@ def failbackEnv(env, policy)
env.info "AVAIL_POLICY_STATUS: Environment delcared as ACTIVE but failover environment missing"
end
+ private
# Picks the VM that the availability policy should recreate.
#
# vmList     - enumerable of VM objects responding to #status and #vmid
# mastervmid - vmid of the environment's master VM
#
# A VM is a candidate when its status is MISSING, UNKNOWN, FAILED or SICK.
# Returns the master VM object when the master itself is a candidate (so the
# master is recreated before any slave), otherwise the last candidate in
# vmList, or nil when no VM is a candidate.
#
# The previous implementation ended with `targetvm == mastervmid`, a
# comparison whose result was discarded, so the master was never given
# priority; callers need a VM object (they use targetvm.vmid and pass it to
# recreateVM), so the master VM itself is returned rather than its id.
def getTargetVM(vmList, mastervmid)
  unhealthy_states = ["MISSING", "UNKNOWN", "FAILED", "SICK"]
  candidates = vmList.select { |vm| unhealthy_states.include?(vm.status) }
  candidates.find { |vm| vm.vmid == mastervmid } || candidates.last
end
private
def evalavailabilitypolicy(env, policy)
@@ -268,34 +279,47 @@ def evalavailabilitypolicy(env, policy)
nummissing += 1 if vm.status == "MISSING"
numunknown += 1 if vm.status == "UNKNOWN"
numfailed += 1 if vm.status == "FAILED"
- targetvm = vm if vm.status == "MISSING" || vm.status == "UNKNOWN" || vm.status == "FAILED"
- mastervmfailed = true if vm.vmid == env.mastervmid && (vm.status== "MISSING" || vm.status == "UNKNOWN" || vm.status == "FAILED")
+ targetvm = vm if vm.status == "MISSING" || vm.status == "UNKNOWN" || vm.status == "FAILED" || vm.status == "SICK"
+ mastervmfailed = true if vm.vmid == env.mastervmid && (vm.status== "MISSING" || vm.status == "UNKNOWN" || vm.status == "FAILED" || vm.status == "SICK" )
end
}
# Make sure to restart master first before any slaves
targetvm == env.mastervmid if mastervmfailed == true
# Check if we should be re-creating any VMs
if policy[:recreate_expr] != nil
- recreate_result = eval(policy[:recreate_expr])
+ if policy[:metric_plugins] != nil
+ recreate_result = @pluginManager.evaluateExpr( policy[:metric_plugins], policy[:recreate_expr], env.envid)
+ else
+ recreate_result = eval(policy[:recreate_expr])
+ end
@recreateCounter[env.envid] = "0" if @recreateCounter[env.envid] == nil
if recreate_result == true
newval = @recreateCounter[env.envid].to_i + 1
@recreateCounter[env.envid] = newval.to_s
if newval > policy[:period]
+ targetvm = getTargetVM(vmList, env.mastervmid)
if $jobManager.isJobRunning(env.envid) == false
- env.info = "AVAIL_POLICY_STATUS: Recreating VM #{targetvm.vmid}"
- env.recreateVM(targetvm)
+ if targetvm != nil
+ env.info = "AVAIL_POLICY_STATUS: Recreating VM #{targetvm.vmid}"
+ env.recreateVM(targetvm)
+ else
+ env.info = "AVAIL_POLICY_STATUS: Policy condition triggered but could not identify target VM to recreate"
+ end
@recreateCounter[env.envid] = "0"
end
else
- env.info = "AVAIL_POLICY_STATUS: Detected missing VM counter=#{@recreateCounter[env.envid]} period=#{policy[:period]}"
+ env.info = "AVAIL_POLICY_STATUS: Detected missing or unhealthy VM counter=#{@recreateCounter[env.envid]} period=#{policy[:period]}"
end
end
end
# Determine if the entire environment should be considered as failed
if policy[:failover_expr] != nil && env.policy_status == "ACTIVE"
if policy[:failover_role] == "active"
- failover_result = eval(policy[:failover_expr])
+ if policy[:metric_plugins] != nil
+ failover_result = @pluginManager.evaluateExpr( policy[:metric_plugins], policy[:failover_expr], env.envid)
+ else
+ failover_result = eval(policy[:failover_expr])
+ end
@failoverCounter[env.envid] = "0" if @failoverCounter[env.envid] == nil
if failover_result == true
newval = @failoverCounter[env.envid].to_i + 1
@@ -312,7 +336,11 @@ def evalavailabilitypolicy(env, policy)
# Determine if the entire environment should failed back
if policy[:failback_expr] != nil && policy[:failover_role] == "active" && env.policy_status == "FAILED"
- failback_result = eval(policy[:failback_expr])
+ if policy[:metric_plugins] != nil
+ failback_result = @pluginManager.evaluateExpr( policy[:metric_plugins], policy[:failback_expr], env.envid)
+ else
+ failback_result = eval(policy[:failback_expr])
+ end
@failbackCounter[env.envid] = "0" if @failbackCounter[env.envid] == nil
if failback_result == true
newval = @failbackCounter[env.envid].to_i + 1
View
@@ -0,0 +1,60 @@
+# -------------------------------------------------------------------------- #
+# Copyright 2011-2012, Research In Motion Limited #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); you may #
+# not use this file except in compliance with the License. You may obtain #
+# a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+#--------------------------------------------------------------------------- #
+#
+# This plugin checks the health of MySQL instances in an environment. It
+# assumes all VMs in the environment will run MySQL server and uses the
+# 'mysqladmin ping' command (run over ssh) to check that each one is alive.
+
# Metric plugin that reports whether mysqld answers 'mysqladmin ping' on the
# VMs of an environment. The metric name it exposes is 'mysqlHealth'.
class MySqlHealth < Plugin
  def initialize
  end

  # Checks the VMs of the environment in list order and stops at the first
  # one whose mysqld does not answer; that VM's status is set to "SICK".
  #
  # envid - id passed to $envManager.getEnv to look up the environment
  #
  # Returns 1 if MySQL is alive on all VMs in the environment, or 0 as soon
  # as one VM has a failed mysql (later VMs are not checked in that case).
  def mysqlHealth(envid)
    env = $envManager.getEnv(envid)
    env.getVMList().each do |vm|
      next if mysql_alive?(vm)
      vm.status = "SICK"
      return 0
    end
    $Logger.debug "MySqlHealthPlugin: mysqld is alive"
    1
  end

  # Returns the list of metric names implemented by this plugin.
  def getMetrics
    ['mysqlHealth']
  end

  private

  # Runs 'mysqladmin ping' on the VM over ssh and inspects the first line of
  # output only.
  #
  # Returns true when that line contains 'is alive'; false when there is no
  # output at all or the line says otherwise.
  #
  # NOTE(review): the ssh call sets no timeout or batch mode, so an
  # unreachable host or a password prompt could block the caller - confirm
  # how the plugin manager schedules this.
  def mysql_alive?(vm)
    cmd = "ssh carina@#{vm.ipaddress} mysqladmin ping"
    first_line = IO.popen(cmd) { |f| f.gets }

    if first_line.nil?
      $Logger.debug "MySqlHealthPlugin: mysqld is not alive - no output from mysqladmin ping"
      return false
    end

    $Logger.debug "MySqlHealthPlugin: Processing line: #{first_line}"
    unless first_line.include?('is alive')
      $Logger.debug "MySqlHealthPlugin: mysqld is not alive"
      return false
    end

    true
  end
end
View
@@ -22,7 +22,7 @@ CONTEXT = [
SERVICE_NAME=%SERVICE_NAME%,
%APP_CONTEXT_VAR%,
VMID=$VMID,
- DEFUSER=rimadmin,
+ DEFUSER=%ADMIN_USER%,
HOSTNAME = "$NAME",
IP = "$NIC[IP]",
NETMASK = "$NETWORK[NETMASK, NETWORK_ID=%NETWORK_ID%]",

0 comments on commit e955c1e

Please sign in to comment.