From 4fe1e198185daa7f5132901bc5a16d6e14b2a7c4 Mon Sep 17 00:00:00 2001 From: aven Date: Fri, 5 Jan 2018 16:29:49 +0800 Subject: [PATCH] TRAFODION-2885 dcs server cant be restart while switching master --- .../trafodion/dcs/master/ServerManager.java | 58 ++++++++++++++++--- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/dcs/src/main/java/org/trafodion/dcs/master/ServerManager.java b/dcs/src/main/java/org/trafodion/dcs/master/ServerManager.java index 8594c36aa8..37c796390a 100644 --- a/dcs/src/main/java/org/trafodion/dcs/master/ServerManager.java +++ b/dcs/src/main/java/org/trafodion/dcs/master/ServerManager.java @@ -23,14 +23,12 @@ Licensed to the Apache Software Foundation (ASF) under one package org.trafodion.dcs.master; import java.net.InetAddress; - import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.BufferedReader; import java.io.FileReader; import java.io.FileNotFoundException; - import java.util.Scanner; import java.util.Collections; import java.util.Iterator; @@ -47,17 +45,13 @@ Licensed to the Apache Software Foundation (ASF) under one import java.util.Comparator; import java.util.Map; import java.util.HashMap; - import java.text.DateFormat; import org.apache.zookeeper.*; import org.apache.zookeeper.data.Stat; - import org.apache.hadoop.conf.Configuration; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - import org.trafodion.dcs.master.RunningServer; import org.trafodion.dcs.master.RegisteredServer; import org.trafodion.dcs.master.Metrics; @@ -66,7 +60,6 @@ Licensed to the Apache Software Foundation (ASF) under one import org.trafodion.dcs.Constants; import org.trafodion.dcs.zookeeper.ZkClient; import org.trafodion.dcs.util.*; - import org.codehaus.jettison.json.JSONArray; import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; @@ -314,6 +307,7 @@ public Boolean call() throws Exception { getServersFile(); 
createServersPortMap(); getZkRunning(); + getUnwathedServers(); getZkRegistered(); while (true) {
@@ -506,6 +500,62 @@ private synchronized void getZkRunning() throws Exception { } }
+    /**
+     * Queues a restart for every configured DcsServer that has no matching entry
+     * in {@code runningServers}.
+     * <p>
+     * If a DcsServer has no znode in ZooKeeper while DcsMaster is starting, it is
+     * never watched, so it would never be restarted after going down. For each
+     * such server a simulated running path is built and handed to a
+     * {@code RestartHandler} on {@code restartQueue}.
+     * <p>
+     * Entry formats:
+     * <ul>
+     * <li>configuredServers: {@code hostName:lineNum:serverCount}</li>
+     * <li>runningServers: {@code hostName:instance:infoPort:serverStartTimestamp},
+     * e.g. {@code gy26.esgyncn.local:3:24413:1515056285028}</li>
+     * </ul>
+     * A configured server counts as running only when both the host name and the
+     * instance (the configured lineNum) match, so one running instance on a host
+     * does not hide other configured instances on the same host.
+     */
+    private void getUnwathedServers() {
+        // NOTE(review): unlike getZkRunning()/restartServer() this method is not
+        // synchronized -- confirm runningServers cannot change while it runs.
+        boolean queued = false;
+        for (String configured : configuredServers) {
+            // A malformed configured entry fails fast with an unchecked exception.
+            String[] fields = configured.split(":");
+            String hostName = fields[0];
+            int instance = Integer.parseInt(fields[1]);
+            int serverCount = Integer.parseInt(fields[2]);
+            if (isServerRunning(hostName, instance)) {
+                continue;
+            }
+            LOG.error("DcsServer [" + hostName + ":" + instance + "] does not started when starting DcsMaster [" + master.getServerName() + "] add to restart queue.");
+            // RestartHandler needs hostName, instance and a start timestamp; the
+            // infoPort slot of the simulated path is filled with 0.
+            String simulatePath = hostName + ":" + instance + ":0:" + System.currentTimeMillis();
+            restartQueue.add(new RestartHandler(simulatePath, serverCount));
+            queued = true;
+        }
+        if (!queued && LOG.isDebugEnabled()) {
+            LOG.debug("all dcs servers have started, no need to add watchers");
+        }
+    }
+
+    /** Returns true when runningServers holds an entry for the given host name and instance. */
+    private boolean isServerRunning(String hostName, int instance) {
+        String instanceId = String.valueOf(instance);
+        for (String running : runningServers) {
+            String[] fields = running.split(":");
+            if (fields.length >= 2 && fields[0].equals(hostName) && fields[1].equals(instanceId)) {
+                return true;
+            }
+        }
+        return false;
+    }
+ private 
synchronized void restartServer(String znodePath) throws Exception { String child = znodePath.replace(parentZnode + Constants.DEFAULT_ZOOKEEPER_ZNODE_SERVERS_RUNNING + "/", "");