From bc5744f6f5e81521a3905ae072177218fdd6b05c Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Mon, 5 Dec 2016 14:59:38 -0800
Subject: [PATCH 01/11]  Eddited DNSWordCreation.scala DomainProcessor.scala
 DNSSuspiciousConnectsModel.scala, and SuspicousConnectsArgumentParser to all
 allow in a variable user domain designation. More work to do in order to have
 this information come in from the spot.conf.

---
 .../SuspiciousConnectsArgumentParser.scala    | 25 +++++++++++++++++++
 .../org/apache/spot/dns/DNSWordCreation.scala |  6 +++--
 .../model/DNSSuspiciousConnectsModel.scala    |  9 ++++++-
 .../spot/utilities/DomainProcessor.scala      |  5 ++--
 4 files changed, 40 insertions(+), 5 deletions(-)
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index 33186c58..632e0d83 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -11,6 +11,11 @@ object SuspiciousConnectsArgumentParser {
                                       feedbackFile: String = "",
                                       duplicationFactor: Int = 1,
                                       topicCount: Int = 20,
+                                      localPath: String = "",
+                                      localUser: String = "",
+                                      userDomain: String = "",
+                                      ldaPath: String = "",
+                                      nodes: String = "",
                                       hdfsScoredConnect: String = "",
                                       threshold: Double = 1.0d,
                                       maxResults: Int = -1,
@@ -45,6 +50,26 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(topicCount = x.toInt)).
       text("topic count")
 
+    opt[String]("lpath").required().valueName("<local path>").
+      action((x, c) => c.copy(localPath = x)).
+      text("Local Path")
+
+    opt[String]("ldapath").required().valueName("<local path>").
+      action((x, c) => c.copy(ldaPath = x)).
+      text("LDA Path")
+
+    opt[String]("luser").required().valueName("<local path>").
+      action((x, c) => c.copy(localUser = x)).
+      text("Local user path")
+
+    opt[String]("userDomain").required().valueName("<user domain>").
+      action((x, c) => c.copy(userDomain = x)).
+      text("Domain of spot user (example: intel)")
+
+    opt[String]("nodes").required().valueName("<input param>").
+      action((x, c) => c.copy(nodes = x)).
+      text("Node list")
+
     opt[String]("scored").required().valueName("<hdfs path>").
       action((x, c) => c.copy(hdfsScoredConnect = x)).
       text("HDFS path for results")
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
index 2ffe12e1..e4595e1c 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
@@ -15,13 +15,15 @@ import org.apache.spot.utilities.Quantiles
   * @param entropyCuts
   * @param numberPeriodsCuts
   * @param topDomainsBC
+  * @param userDomain
   */
 class DNSWordCreation(frameLengthCuts: Array[Double],
                       timeCuts: Array[Double],
                       subdomainLengthCuts: Array[Double],
                       entropyCuts: Array[Double],
                       numberPeriodsCuts: Array[Double],
-                      topDomainsBC: Broadcast[Set[String]]) extends Serializable {
+                      topDomainsBC: Broadcast[Set[String]],
+                      userDomain: String) extends Serializable {
 
 
   /**
@@ -79,7 +81,7 @@ class DNSWordCreation(frameLengthCuts: Array[Double],
 
 
     val DomainInfo(domain, topDomain, subdomain, subdomainLength, subdomainEntropy, numPeriods) =
-      extractDomainInfo(queryName, topDomainsBC)
+      extractDomainInfo(queryName, topDomainsBC, userDomain)
 
     Seq(topDomain,
       Quantiles.bin(frameLength.toDouble, frameLengthCuts),
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
index 81c214f6..953e1eca 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
@@ -158,6 +158,7 @@ object DNSSuspiciousConnectsModel {
 
     val countryCodesBC = sparkContext.broadcast(CountryCodes.CountryCodes)
     val topDomainsBC = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = config.userDomain
 
     // create quantile cut-offs
 
@@ -180,7 +181,13 @@ object DNSSuspiciousConnectsModel {
 
     // simplify DNS log entries into "words"
 
-    val dnsWordCreator = new DNSWordCreation(frameLengthCuts, timeCuts, subdomainLengthCuts, entropyCuts, numberPeriodsCuts, topDomainsBC)
+    val dnsWordCreator = new DNSWordCreation(frameLengthCuts,
+                                             timeCuts,
+                                             subdomainLengthCuts,
+                                             entropyCuts,
+                                             numberPeriodsCuts,
+                                             topDomainsBC,
+                                             userDomain)
 
     val dataWithWordDF = totalDataDF.withColumn(Word, dnsWordCreator.wordCreationUDF(modelColumns: _*))
 
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
index 28595e46..334ae876 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
@@ -48,9 +48,10 @@ object DomainProcessor extends Serializable {
     * Extract domain info from a url.
     * @param url Incoming url.
     * @param topDomainsBC Broadcast variable containing the top domains set.
+    * @param userDomain Domain of the spot user (example:'intel').
     * @return New [[DomainInfo]] object containing extracted domain information.
     */
-  def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]]): DomainInfo = {
+  def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]], userDomain: String): DomainInfo = {
 
     val spliturl = url.split('.')
     val numParts = spliturl.length
@@ -64,7 +65,7 @@ object DomainProcessor extends Serializable {
       0
     }
 
-    val topDomainClass = if (domain == "intel") {
+    val topDomainClass = if (domain == userDomain) {
       2
     } else if (topDomainsBC.value contains domain) {
       1

From 41ffbc3c3744eadc7baac4633f160188bcad731a Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Tue, 6 Dec 2016 09:03:02 -0800
Subject: [PATCH 02/11] Inserted lines into spot.conf and ml_ops.sh for input
 of user domain string.

---
 spot-ml/ml_ops.sh    | 3 ++-
 spot-setup/spot.conf | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index 244bcc4a..cf1bc311 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -97,6 +97,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --dupfactor ${DUPFACTOR} \
   --feedback ${FEEDBACK_PATH} \
   --ldatopiccount ${TOPIC_COUNT} \
+  --userdomain ${USER_DOMAIN}\
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
@@ -107,4 +108,4 @@ wait
 # move results to hdfs.
 cd ${LPATH}
 hadoop fs -getmerge ${HDFS_SCORED_CONNECTS}/part-* ${DSOURCE}_results.csv && hadoop fs -moveFromLocal \
-    ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv
\ No newline at end of file
+    ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv
diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf
index 8f60eb15..433afa8b 100755
--- a/spot-setup/spot.conf
+++ b/spot-setup/spot.conf
@@ -17,6 +17,7 @@ HPATH=${HUSER}/${DSOURCE}/scored_results/${FDATE}
 #impala config
 IMPALA_DEM='node04'
 
+#kerberos config
 KRB_AUTH=false
 KINITPATH=
 KINITOPTS=
@@ -30,6 +31,9 @@ RPATH=${LUSER}/ipython/user/${FDATE}
 LDAPATH=${LUSER}/ml/oni-lda-c
 LIPATH=${LUSER}/ingest
 
+#domain associated to network data to be analyzed
+USER_DOMAIN='intel'
+
 SPK_EXEC='400'
 SPK_EXEC_MEM='2048m'
 SPK_DRIVER_MEM=''

From d7d6ae07344a9a4308067963cb9b03e493b995d0 Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Tue, 6 Dec 2016 14:45:38 -0800
Subject: [PATCH 03/11] More modifications in order to pass the user domain
 info around to the places that are needed.

---
 .../SuspiciousConnectsArgumentParser.scala    |  2 +-
 .../dns/DNSSuspiciousConnectsAnalysis.scala   |  4 ++-
 .../spot/dns/model/DNSScoreFunction.scala     | 30 ++++++++++++-------
 .../model/DNSSuspiciousConnectsModel.scala    | 16 ++++++----
 .../spot/utilities/DomainProcessor.scala      | 14 ++++-----
 .../spot/utilities/DomainProcessorTest.scala  | 18 +++++++----
 6 files changed, 54 insertions(+), 30 deletions(-)

diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index 632e0d83..be0db305 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -62,7 +62,7 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(localUser = x)).
       text("Local user path")
 
-    opt[String]("userDomain").required().valueName("<user domain>").
+    opt[String]("userdomain").required().valueName("<user domain>").
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")
 
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index d0e6da17..4ef4718e 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@ -54,6 +54,8 @@ object DNSSuspiciousConnectsAnalysis {
 
     logger.info("Loading data")
 
+    val userDomain = config.userDomain
+
     val rawDataDF = sqlContext.read.parquet(config.inputPath)
       .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null")
       .select(inColumns:_*)
@@ -64,7 +66,7 @@ object DNSSuspiciousConnectsAnalysis {
       DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
 
     logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF)
+    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF, userDomain)
 
 
     val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
index 09656f27..728f2693 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
@@ -8,15 +8,16 @@ import org.apache.spot.dns.DNSWordCreation
 /**
   * Estimate the probabilities of network events using a [[DNSSuspiciousConnectsModel]]
   *
-  * @param frameLengthCuts
-  * @param timeCuts
-  * @param subdomainLengthCuts
-  * @param entropyCuts
-  * @param numberPeriodsCuts
-  * @param topicCount
-  * @param ipToTopicMixBC
-  * @param wordToPerTopicProbBC
-  * @param topDomainsBC
+  * @param frameLengthCuts Delimeters used to define binning for frame length field
+  * @param timeCuts Delimeters used to define binning for time field
+  * @param subdomainLengthCuts Delimeters used to define binning for subdomain length field
+  * @param entropyCuts Delimeters used to define binning for entropy field
+  * @param numberPeriodsCuts Delimeters used to define binning for number of periods of subdomain field
+  * @param topicCount Number of topics used for the LDA model
+  * @param ipToTopicMixBC Topic mixes learned by the LDA model for each IP in the data
+  * @param wordToPerTopicProbBC Word mixes for each of the topics learned by the LDA model
+  * @param topDomainsBC Alexa top one million list of domains.
+  * @param userDomain Domain associated to network data (example: 'intel')
   */
 class DNSScoreFunction(frameLengthCuts: Array[Double],
                        timeCuts: Array[Double],
@@ -26,13 +27,20 @@ class DNSScoreFunction(frameLengthCuts: Array[Double],
                        topicCount: Int,
                        ipToTopicMixBC: Broadcast[Map[String, Array[Double]]],
                        wordToPerTopicProbBC: Broadcast[Map[String, Array[Double]]],
-                       topDomainsBC: Broadcast[Set[String]]) extends Serializable {
+                       topDomainsBC: Broadcast[Set[String]],
+                       userDomain: String) extends Serializable {
 
 
   val suspiciousConnectsScoreFunction =
     new SuspiciousConnectsScoreFunction(topicCount, ipToTopicMixBC, wordToPerTopicProbBC)
 
-  val dnsWordCreator = new DNSWordCreation(frameLengthCuts, timeCuts, subdomainLengthCuts, entropyCuts, numberPeriodsCuts, topDomainsBC)
+  val dnsWordCreator = new DNSWordCreation(frameLengthCuts,
+                                           timeCuts,
+                                           subdomainLengthCuts,
+                                           entropyCuts,
+                                           numberPeriodsCuts,
+                                           topDomainsBC,
+                                           userDomain)
 
   def score(timeStamp: String,
             unixTimeStamp: Long,
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
index 953e1eca..047e2620 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
@@ -64,10 +64,11 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int,
     * @param sc         Spark Context
     * @param sqlContext Spark SQL context
     * @param inDF       Dataframe of DNS log events, containing at least the columns of [[DNSSuspiciousConnectsModel.ModelSchema]]
+    * @param userDomain Domain associated to network data (ex: 'intel')
     * @return Dataframe with a column named [[org.apache.spot.dns.DNSSchema.Score]] that contains the
     *         probability estimated for the network event at that row
     */
-  def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame): DataFrame = {
+  def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame, userDomain: String): DataFrame = {
 
     val countryCodesBC = sc.broadcast(CountryCodes.CountryCodes)
     val topDomainsBC = sc.broadcast(TopDomains.TopDomains)
@@ -84,7 +85,8 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int,
         topicCount,
         ipToTopicMixBC,
         wordToPerTopicProbBC,
-        topDomainsBC)
+        topDomainsBC,
+        userDomain)
 
 
     val scoringUDF = udf((timeStamp: String,
@@ -168,7 +170,7 @@ object DNSSuspiciousConnectsModel {
     val frameLengthCuts = Quantiles.computeDeciles(totalDataDF.select(FrameLength).rdd
       .map({ case Row(frameLen: Int) => frameLen.toDouble }))
 
-    val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, totalDataDF)
+    val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, userDomain, totalDataDF)
 
     val subdomainLengthCuts = Quantiles.computeQuintiles(domainStatsDF.filter(SubdomainLength + " > 0")
       .select(SubdomainLength).rdd.map({ case Row(subdomainLength: Int) => subdomainLength.toDouble }))
@@ -238,6 +240,7 @@ object DNSSuspiciousConnectsModel {
     * @param sqlContext     Spark SQL context.
     * @param countryCodesBC Broadcast of the country codes set.
     * @param topDomainsBC   Broadcast of the most-popular domains set.
+    * @param userDomain     Domain associated to network data (ex: 'intel')
     * @param inDF           Incoming dataframe. Schema is expected to provide the field [[QueryName]]
     * @return A new dataframe with the new columns added. The new columns have the schema [[DomainStatsSchema]]
     */
@@ -246,11 +249,12 @@ object DNSSuspiciousConnectsModel {
                           sqlContext: SQLContext,
                           countryCodesBC: Broadcast[Set[String]],
                           topDomainsBC: Broadcast[Set[String]],
+                          userDomain: String,
                           inDF: DataFrame): DataFrame = {
     val queryNameIndex = inDF.schema.fieldNames.indexOf(QueryName)
 
     val domainStatsRDD: RDD[Row] = inDF.rdd.map(row =>
-      Row.fromTuple(createTempFields(countryCodesBC, topDomainsBC, row.getString(queryNameIndex))))
+      Row.fromTuple(createTempFields(countryCodesBC, topDomainsBC, userDomain, row.getString(queryNameIndex))))
 
     sqlContext.createDataFrame(domainStatsRDD, DomainStatsSchema)
   }
@@ -262,15 +266,17 @@ object DNSSuspiciousConnectsModel {
     *
     * @param countryCodesBC Broadcast of the country codes set.
     * @param topDomainsBC   Broadcast of the most-popular domains set.
+    * @param userDomain     Domain associated to network data (ex: 'intel')
     * @param url            URL string to anlayze for domain and subdomain information.
     * @return [[TempFields]]
     */
   def createTempFields(countryCodesBC: Broadcast[Set[String]],
                        topDomainsBC: Broadcast[Set[String]],
+                       userDomain: String,
                        url: String): TempFields = {
 
     val DomainInfo(_, topDomainClass, subdomain, subdomainLength, subdomainEntropy, numPeriods) =
-      DomainProcessor.extractDomainInfo(url, topDomainsBC)
+      DomainProcessor.extractDomainInfo(url, topDomainsBC, userDomain)
 
 
     TempFields(topDomainClass = topDomainClass,
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
index 334ae876..c5f0d73c 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
@@ -28,10 +28,10 @@ object DomainProcessor extends Serializable {
 
   /**
     * Commonly extracted domain features.
-    * @param domain Domain (if any) of a url.
-    * @param topDomain Numerical class of domain: 2 for Intel, 1 for Alexa top domains, 0 for others.
-    * @param subdomain Subdomain (if any) in the url.
-    * @param subdomainLength Length of the subdomain. 0 if there is none.
+    * @param domain           Domain (if any) of a url.
+    * @param topDomain        Numerical class of domain: 2 for Intel, 1 for Alexa top domains, 0 for others.
+    * @param subdomain        Subdomain (if any) in the url.
+    * @param subdomainLength  Length of the subdomain. 0 if there is none.
     * @param subdomainEntropy Entropy of the subdomain viewed as a distribution on its character set.
     *                         0 if there is no subdomain.
     * @param numPeriods Number of periods + 1 in the url. (Number of sub-strings where url is split by periods.)
@@ -46,9 +46,9 @@ object DomainProcessor extends Serializable {
 
   /**
     * Extract domain info from a url.
-    * @param url Incoming url.
-    * @param topDomainsBC Broadcast variable containing the top domains set.
-    * @param userDomain Domain of the spot user (example:'intel').
+    * @param url           Incoming url.
+    * @param topDomainsBC  Broadcast variable containing the top domains set.
+    * @param userDomain    Domain of the spot user (example:'intel').
     * @return New [[DomainInfo]] object containing extracted domain information.
     */
   def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]], userDomain: String): DomainInfo = {
diff --git a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
index c1d93f02..7e2cae67 100644
--- a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
@@ -57,8 +57,10 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
 
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
 
+    val userDomain = "intel"
+
     // case class DerivedFields(topDomain: String, subdomainLength: Double, subdomainEntropy: Double, numPeriods: Double)
-    val result = extractDomainInfo(url, topDomains)
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "None", topDomain = 0, subdomain = "None", subdomainLength = 0, subdomainEntropy = 0, numPeriods = 6)
   }
@@ -69,7 +71,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
 
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
 
-    val result = extractDomainInfo(url, topDomains)
+    val userDomain = "intel"
+
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", topDomain = 1, subdomain = "services",
       subdomainLength = 8, subdomainEntropy = 2.5, numPeriods = 4)
@@ -80,8 +84,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
     val url = "amazon.com.mx"
     val countryCodes = sparkContext.broadcast(countryCodesSet)
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = "intel"
 
-    val result = extractDomainInfo(url, topDomains)
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 3)
   }
@@ -91,8 +96,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
     val url = "services.amazon.com"
     val countryCodes = sparkContext.broadcast(countryCodesSet)
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = "intel"
 
-    val result = extractDomainInfo(url, topDomains)
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "services", topDomain = 1, subdomainLength = 8, subdomainEntropy = 2.5, numPeriods = 3)
   }
@@ -103,7 +109,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
     val url = "amazon.com"
     val countryCodes = sparkContext.broadcast(countryCodesSet)
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
-    val result = extractDomainInfo(url, topDomains)
+    val userDomain = "intel"
+
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 2)
   }

From f3b3652673713fd36152605c64df9a95e140e818 Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Wed, 7 Dec 2016 16:39:18 -0800
Subject: [PATCH 04/11] More editing on SuspiciousConnectsArgumentParser to
 eliminate old (dropped) arguments.

---
 .../spot/SuspiciousConnectsArgumentParser.scala  | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index be0db305..e6f5c1cb 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -11,11 +11,7 @@ object SuspiciousConnectsArgumentParser {
                                       feedbackFile: String = "",
                                       duplicationFactor: Int = 1,
                                       topicCount: Int = 20,
-                                      localPath: String = "",
-                                      localUser: String = "",
                                       userDomain: String = "",
-                                      ldaPath: String = "",
-                                      nodes: String = "",
                                       hdfsScoredConnect: String = "",
                                       threshold: Double = 1.0d,
                                       maxResults: Int = -1,
@@ -50,18 +46,6 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(topicCount = x.toInt)).
       text("topic count")
 
-    opt[String]("lpath").required().valueName("<local path>").
-      action((x, c) => c.copy(localPath = x)).
-      text("Local Path")
-
-    opt[String]("ldapath").required().valueName("<local path>").
-      action((x, c) => c.copy(ldaPath = x)).
-      text("LDA Path")
-
-    opt[String]("luser").required().valueName("<local path>").
-      action((x, c) => c.copy(localUser = x)).
-      text("Local user path")
-
     opt[String]("userdomain").required().valueName("<user domain>").
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")

From f7596ca8f13629e09d88a4d257ecb6f8d10f33de Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Wed, 7 Dec 2016 16:46:41 -0800
Subject: [PATCH 05/11] And yet more editing of
 SuspiciousConnectsArgumentParser.scala to remove old argument references.

---
 .../org/apache/spot/SuspiciousConnectsArgumentParser.scala    | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index e6f5c1cb..4647dbf3 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -50,10 +50,6 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")
 
-    opt[String]("nodes").required().valueName("<input param>").
-      action((x, c) => c.copy(nodes = x)).
-      text("Node list")
-
     opt[String]("scored").required().valueName("<hdfs path>").
       action((x, c) => c.copy(hdfsScoredConnect = x)).
       text("HDFS path for results")

From 55b0497bf3a6dbe87e659cadffcb70784cb9c1af Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Fri, 9 Dec 2016 13:56:44 -0800
Subject: [PATCH 06/11] Changed the usderDomain input to be optional in the
 argument parser and changed the default value of this armgument to be the
 empty string in spot.conf.

---
 spot-ml/INSTALL.md                                            | 1 +
 .../org/apache/spot/SuspiciousConnectsArgumentParser.scala    | 2 +-
 .../scala/org/apache/spot/utilities/DomainProcessor.scala     | 2 +-
 spot-setup/spot.conf                                          | 4 ++--
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/spot-ml/INSTALL.md b/spot-ml/INSTALL.md
index 505f35af..0ca8ef25 100644
--- a/spot-ml/INSTALL.md
+++ b/spot-ml/INSTALL.md
@@ -14,6 +14,7 @@ Names and language that we will use from the configuration variables for Spot (t
 - MLNODE The node from which the spot-ml routines are invoked
 - HUSER An HDFS user path that will be the base path for the solution; this is usually the same user that you created to run the solution
 - HPATH Location for storing intermediate results of the analysis on HDFS.
+- USER_DOMAIN Web domain associated to the user's network (for the DNS suspicous connects analysis). For example: USER_DOMAIN='intel'.
 
 ### Prepare data for input 
 
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index 4647dbf3..9b0ac07c 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -46,7 +46,7 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(topicCount = x.toInt)).
       text("topic count")
 
-    opt[String]("userdomain").required().valueName("<user domain>").
+    opt[String]("userdomain").valueName("<user domain>").
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")
 
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
index c5f0d73c..a60b1fb6 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
@@ -65,7 +65,7 @@ object DomainProcessor extends Serializable {
       0
     }
 
-    val topDomainClass = if (domain == userDomain) {
+    val topDomainClass = if (userDomain != "" && domain == userDomain) {
       2
     } else if (topDomainsBC.value contains domain) {
       1
diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf
index 433afa8b..3b217ed1 100755
--- a/spot-setup/spot.conf
+++ b/spot-setup/spot.conf
@@ -31,8 +31,8 @@ RPATH=${LUSER}/ipython/user/${FDATE}
 LDAPATH=${LUSER}/ml/oni-lda-c
 LIPATH=${LUSER}/ingest
 
-#domain associated to network data to be analyzed
-USER_DOMAIN='intel'
+#dns suspicious connects config
+USER_DOMAIN=''
 
 SPK_EXEC='400'
 SPK_EXEC_MEM='2048m'

From 13fc71863558d8d27512ca7c710fda585812a63a Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Fri, 9 Dec 2016 17:02:50 -0800
Subject: [PATCH 07/11] Added a test to DomainProcessorTest.scala to insure
 that an empty string for userDomain would not be taken as the legitimate user
 domain in the case of an empty string domain.

---
 .../apache/spot/utilities/DomainProcessorTest.scala    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
index 7e2cae67..5d7281ea 100644
--- a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
@@ -115,4 +115,14 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 2)
   }
+  it should "not identify the domain as the users domain when both are empty strings" in {
+    val url = "ab..com"
+    val countryCodes = sparkContext.broadcast(countryCodesSet)
+    val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = ""
+
+    val result = extractDomainInfo(url, topDomains, userDomain)
+
+    result shouldBe DomainInfo(domain = "", subdomain = "ab", topDomain = 0, subdomainLength = 2, subdomainEntropy = 1, numPeriods = 3)
+  }
 }

From b9cc67d1ea4579781a95d86e1d34ea49cf069b68 Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Tue, 13 Dec 2016 12:37:48 -0800
Subject: [PATCH 08/11] Modified spot.conf to contain logic that only tries to
 pass the userdomain option when the spot.conf variable USER_DOMAIN is
 non-empty.

---
 spot-ml/ml_ops.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index cf1bc311..a951fe1b 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -45,6 +45,14 @@ else
     RAWDATA_PATH=${PROXY_PATH}
 fi
 
+# pass the user domain designation if not empty
+
+if [ ! -z $USER_DOMAIN ] ; then
+    USER_DOMAIN_PARSER_CMD="--userdomain $USER_DOMAIN"
+else
+    USER_DOMAIN_PARSER_CMD=''
+fi
+
 FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
 DUPFACTOR=1000
 
@@ -97,7 +105,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --dupfactor ${DUPFACTOR} \
   --feedback ${FEEDBACK_PATH} \
   --ldatopiccount ${TOPIC_COUNT} \
-  --userdomain ${USER_DOMAIN}\
+  $USER_DOMAIN_PARSER_CMD \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \

From 3bf290dd0a33368486abea1d803163bbc7cd7736 Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Tue, 13 Dec 2016 16:21:47 -0800
Subject: [PATCH 09/11] Changed the position of the optional parameter to be at
 the end of all other parameters

---
 spot-ml/ml_ops.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index a951fe1b..c5d6e431 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -105,11 +105,11 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --dupfactor ${DUPFACTOR} \
   --feedback ${FEEDBACK_PATH} \
   --ldatopiccount ${TOPIC_COUNT} \
-  $USER_DOMAIN_PARSER_CMD \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
-  --ldamaxiterations 20
+  --ldamaxiterations 20 \ 
+  $USER_DOMAIN_PARSER_CMD
 
 wait
 

From 452fca35bd33c291b88cd4a664f531db1ba7d7e2 Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Wed, 14 Dec 2016 08:25:30 -0800
Subject: [PATCH 10/11] Omitted an extra space after one of the jar parameter
 entry lines that was causing an error.

---
 spot-ml/ml_ops.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index c5d6e431..ece60cd1 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -108,7 +108,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
-  --ldamaxiterations 20 \ 
+  --ldamaxiterations 20 \
   $USER_DOMAIN_PARSER_CMD
 
 wait

From 1a0269f2e0f9a1297e1084c01b1991362f0a74ae Mon Sep 17 00:00:00 2001
From: Brandon Edwards <brandon.edwards@intel.com>
Date: Wed, 14 Dec 2016 08:35:14 -0800
Subject: [PATCH 11/11] Changed USER_DOMAIN_PARSER_CMD to USER_DOMAIN_CMD

---
 spot-ml/ml_ops.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index ece60cd1..02a9c1d2 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -48,9 +48,9 @@ fi
 # pass the user domain designation if not empty
 
 if [ ! -z $USER_DOMAIN ] ; then
-    USER_DOMAIN_PARSER_CMD="--userdomain $USER_DOMAIN"
+    USER_DOMAIN_CMD="--userdomain $USER_DOMAIN"
 else
-    USER_DOMAIN_PARSER_CMD=''
+    USER_DOMAIN_CMD=''
 fi
 
 FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
@@ -109,7 +109,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
   --ldamaxiterations 20 \
-  $USER_DOMAIN_PARSER_CMD
+  $USER_DOMAIN_CMD
 
 wait