From bc5744f6f5e81521a3905ae072177218fdd6b05c Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Mon, 5 Dec 2016 14:59:38 -0800 Subject: [PATCH 01/11] Eddited DNSWordCreation.scala DomainProcessor.scala DNSSuspiciousConnectsModel.scala, and SuspicousConnectsArgumentParser to all allow in a variable user domain designation. More work to do in order to have this information come in from the spot.conf. --- .../SuspiciousConnectsArgumentParser.scala | 25 +++++++++++++++++++ .../org/apache/spot/dns/DNSWordCreation.scala | 6 +++-- .../model/DNSSuspiciousConnectsModel.scala | 9 ++++++- .../spot/utilities/DomainProcessor.scala | 5 ++-- 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala index 33186c58..632e0d83 100644 --- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala +++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala @@ -11,6 +11,11 @@ object SuspiciousConnectsArgumentParser { feedbackFile: String = "", duplicationFactor: Int = 1, topicCount: Int = 20, + localPath: String = "", + localUser: String = "", + userDomain: String = "", + ldaPath: String = "", + nodes: String = "", hdfsScoredConnect: String = "", threshold: Double = 1.0d, maxResults: Int = -1, @@ -45,6 +50,26 @@ object SuspiciousConnectsArgumentParser { action((x, c) => c.copy(topicCount = x.toInt)). text("topic count") + opt[String]("lpath").required().valueName(""). + action((x, c) => c.copy(localPath = x)). + text("Local Path") + + opt[String]("ldapath").required().valueName(""). + action((x, c) => c.copy(ldaPath = x)). + text("LDA Path") + + opt[String]("luser").required().valueName(""). + action((x, c) => c.copy(localUser = x)). + text("Local user path") + + opt[String]("userDomain").required().valueName(""). + action((x, c) => c.copy(userDomain = x)). + text("Domain of spot user (example: intel)") + + opt[String]("nodes").required().valueName(""). + action((x, c) => c.copy(nodes = x)). + text("Node list") + opt[String]("scored").required().valueName(""). action((x, c) => c.copy(hdfsScoredConnect = x)). text("HDFS path for results") diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala index 2ffe12e1..e4595e1c 100644 --- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala +++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala @@ -15,13 +15,15 @@ import org.apache.spot.utilities.Quantiles * @param entropyCuts * @param numberPeriodsCuts * @param topDomainsBC + * @param userDomain */ class DNSWordCreation(frameLengthCuts: Array[Double], timeCuts: Array[Double], subdomainLengthCuts: Array[Double], entropyCuts: Array[Double], numberPeriodsCuts: Array[Double], - topDomainsBC: Broadcast[Set[String]]) extends Serializable { + topDomainsBC: Broadcast[Set[String]], + userDomain: String) extends Serializable { /** @@ -79,7 +81,7 @@ class DNSWordCreation(frameLengthCuts: Array[Double], val DomainInfo(domain, topDomain, subdomain, subdomainLength, subdomainEntropy, numPeriods) = - extractDomainInfo(queryName, topDomainsBC) + extractDomainInfo(queryName, topDomainsBC, userDomain) Seq(topDomain, Quantiles.bin(frameLength.toDouble, frameLengthCuts), diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala index 81c214f6..953e1eca 100644 --- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala +++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala @@ -158,6 +158,7 @@ object DNSSuspiciousConnectsModel { val countryCodesBC = sparkContext.broadcast(CountryCodes.CountryCodes) val topDomainsBC = sparkContext.broadcast(TopDomains.TopDomains) + val userDomain = config.userDomain // create quantile cut-offs @@ -180,7 +181,13 @@ object DNSSuspiciousConnectsModel { // simplify DNS log entries into "words" - val dnsWordCreator = new DNSWordCreation(frameLengthCuts, timeCuts, subdomainLengthCuts, entropyCuts, numberPeriodsCuts, topDomainsBC) + val dnsWordCreator = new DNSWordCreation(frameLengthCuts, + timeCuts, + subdomainLengthCuts, + entropyCuts, + numberPeriodsCuts, + topDomainsBC, + userDomain) val dataWithWordDF = totalDataDF.withColumn(Word, dnsWordCreator.wordCreationUDF(modelColumns: _*)) diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala index 28595e46..334ae876 100644 --- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala +++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala @@ -48,9 +48,10 @@ object DomainProcessor extends Serializable { * Extract domain info from a url. * @param url Incoming url. * @param topDomainsBC Broadcast variable containing the top domains set. + * @param userDomain Domain of the spot user (example:'intel'). * @return New [[DomainInfo]] object containing extracted domain information. */ - def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]]): DomainInfo = { + def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]], userDomain: String): DomainInfo = { val spliturl = url.split('.') val numParts = spliturl.length @@ -64,7 +65,7 @@ object DomainProcessor extends Serializable { 0 } - val topDomainClass = if (domain == "intel") { + val topDomainClass = if (domain == userDomain) { 2 } else if (topDomainsBC.value contains domain) { 1 From 41ffbc3c3744eadc7baac4633f160188bcad731a Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Tue, 6 Dec 2016 09:03:02 -0800 Subject: [PATCH 02/11] Inserted lines into spot.conf and ml_ops.sh for input of user domain string. --- spot-ml/ml_ops.sh | 3 ++- spot-setup/spot.conf | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh index 244bcc4a..cf1bc311 100755 --- a/spot-ml/ml_ops.sh +++ b/spot-ml/ml_ops.sh @@ -97,6 +97,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \ --dupfactor ${DUPFACTOR} \ --feedback ${FEEDBACK_PATH} \ --ldatopiccount ${TOPIC_COUNT} \ + --userdomain ${USER_DOMAIN}\ --scored ${HDFS_SCORED_CONNECTS} \ --threshold ${TOL} \ --maxresults ${MAXRESULTS} \ @@ -107,4 +108,4 @@ wait # move results to hdfs. cd ${LPATH} hadoop fs -getmerge ${HDFS_SCORED_CONNECTS}/part-* ${DSOURCE}_results.csv && hadoop fs -moveFromLocal \ - ${DSOURCE}_results.csv ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv \ No newline at end of file + ${DSOURCE}_results.csv ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf index 8f60eb15..433afa8b 100755 --- a/spot-setup/spot.conf +++ b/spot-setup/spot.conf @@ -17,6 +17,7 @@ HPATH=${HUSER}/${DSOURCE}/scored_results/${FDATE} #impala config IMPALA_DEM='node04' +#kerberos config KRB_AUTH=false KINITPATH= KINITOPTS= @@ -30,6 +31,9 @@ RPATH=${LUSER}/ipython/user/${FDATE} LDAPATH=${LUSER}/ml/oni-lda-c LIPATH=${LUSER}/ingest +#domain associated to network data to be analyzed +USER_DOMAIN='intel' + SPK_EXEC='400' SPK_EXEC_MEM='2048m' SPK_DRIVER_MEM='' From d7d6ae07344a9a4308067963cb9b03e493b995d0 Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Tue, 6 Dec 2016 14:45:38 -0800 Subject: [PATCH 03/11] More modifications in order to pass the user domain info around to the places that are needed. --- .../SuspiciousConnectsArgumentParser.scala | 2 +- .../dns/DNSSuspiciousConnectsAnalysis.scala | 4 ++- .../spot/dns/model/DNSScoreFunction.scala | 30 ++++++++++++------- .../model/DNSSuspiciousConnectsModel.scala | 16 ++++++---- .../spot/utilities/DomainProcessor.scala | 14 ++++----- .../spot/utilities/DomainProcessorTest.scala | 18 +++++++---- 6 files changed, 54 insertions(+), 30 deletions(-) diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala index 632e0d83..be0db305 100644 --- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala +++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala @@ -62,7 +62,7 @@ object SuspiciousConnectsArgumentParser { action((x, c) => c.copy(localUser = x)). text("Local user path") - opt[String]("userDomain").required().valueName(""). + opt[String]("userdomain").required().valueName(""). action((x, c) => c.copy(userDomain = x)). text("Domain of spot user (example: intel)") diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala index d0e6da17..4ef4718e 100644 --- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala +++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala @@ -54,6 +54,8 @@ object DNSSuspiciousConnectsAnalysis { logger.info("Loading data") + val userDomain = config.userDomain + val rawDataDF = sqlContext.read.parquet(config.inputPath) .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null") .select(inColumns:_*) @@ -64,7 +66,7 @@ object DNSSuspiciousConnectsAnalysis { DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount) logger.info("Scoring") - val scoredDF = model.score(sparkContext, sqlContext, rawDataDF) + val scoredDF = model.score(sparkContext, sqlContext, rawDataDF, userDomain) val filteredDF = scoredDF.filter(Score + " <= " + config.threshold) diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala index 09656f27..728f2693 100644 --- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala +++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala @@ -8,15 +8,16 @@ import org.apache.spot.dns.DNSWordCreation /** * Estimate the probabilities of network events using a [[DNSSuspiciousConnectsModel]] * - * @param frameLengthCuts - * @param timeCuts - * @param subdomainLengthCuts - * @param entropyCuts - * @param numberPeriodsCuts - * @param topicCount - * @param ipToTopicMixBC - * @param wordToPerTopicProbBC - * @param topDomainsBC + * @param frameLengthCuts Delimeters used to define binning for frame length field + * @param timeCuts Delimeters used to define binning for time field + * @param subdomainLengthCuts Delimeters used to define binning for subdomain length field + * @param entropyCuts Delimeters used to define binning for entropy field + * @param numberPeriodsCuts Delimeters used to define binning for number of periods of subdomain field + * @param topicCount Number of topics used for the LDA model + * @param ipToTopicMixBC Topic mixes learned by the LDA model for each IP in the data + * @param wordToPerTopicProbBC Word mixes for each of the topics learned by the LDA model + * @param topDomainsBC Alexa top one million list of domains. + * @param userDomain Domain associated to network data (example: 'intel') */ class DNSScoreFunction(frameLengthCuts: Array[Double], timeCuts: Array[Double], @@ -26,13 +27,20 @@ class DNSScoreFunction(frameLengthCuts: Array[Double], topicCount: Int, ipToTopicMixBC: Broadcast[Map[String, Array[Double]]], wordToPerTopicProbBC: Broadcast[Map[String, Array[Double]]], - topDomainsBC: Broadcast[Set[String]]) extends Serializable { + topDomainsBC: Broadcast[Set[String]], + userDomain: String) extends Serializable { val suspiciousConnectsScoreFunction = new SuspiciousConnectsScoreFunction(topicCount, ipToTopicMixBC, wordToPerTopicProbBC) - val dnsWordCreator = new DNSWordCreation(frameLengthCuts, timeCuts, subdomainLengthCuts, entropyCuts, numberPeriodsCuts, topDomainsBC) + val dnsWordCreator = new DNSWordCreation(frameLengthCuts, + timeCuts, + subdomainLengthCuts, + entropyCuts, + numberPeriodsCuts, + topDomainsBC, + userDomain) def score(timeStamp: String, unixTimeStamp: Long, diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala index 953e1eca..047e2620 100644 --- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala +++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala @@ -64,10 +64,11 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int, * @param sc Spark Context * @param sqlContext Spark SQL context * @param inDF Dataframe of DNS log events, containing at least the columns of [[DNSSuspiciousConnectsModel.ModelSchema]] + * @param userDomain Domain associated to network data (ex: 'intel') * @return Dataframe with a column named [[org.apache.spot.dns.DNSSchema.Score]] that contains the * probability estimated for the network event at that row */ - def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame): DataFrame = { + def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame, userDomain: String): DataFrame = { val countryCodesBC = sc.broadcast(CountryCodes.CountryCodes) val topDomainsBC = sc.broadcast(TopDomains.TopDomains) @@ -84,7 +85,8 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int, topicCount, ipToTopicMixBC, wordToPerTopicProbBC, - topDomainsBC) + topDomainsBC, + userDomain) val scoringUDF = udf((timeStamp: String, @@ -168,7 +170,7 @@ object DNSSuspiciousConnectsModel { val frameLengthCuts = Quantiles.computeDeciles(totalDataDF.select(FrameLength).rdd .map({ case Row(frameLen: Int) => frameLen.toDouble })) - val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, totalDataDF) + val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, userDomain, totalDataDF) val subdomainLengthCuts = Quantiles.computeQuintiles(domainStatsDF.filter(SubdomainLength + " > 0") .select(SubdomainLength).rdd.map({ case Row(subdomainLength: Int) => subdomainLength.toDouble })) @@ -238,6 +240,7 @@ object DNSSuspiciousConnectsModel { * @param sqlContext Spark SQL context. * @param countryCodesBC Broadcast of the country codes set. * @param topDomainsBC Broadcast of the most-popular domains set. + * @param userDomain Domain associated to network data (ex: 'intel') * @param inDF Incoming dataframe. Schema is expected to provide the field [[QueryName]] * @return A new dataframe with the new columns added. The new columns have the schema [[DomainStatsSchema]] */ @@ -246,11 +249,12 @@ object DNSSuspiciousConnectsModel { sqlContext: SQLContext, countryCodesBC: Broadcast[Set[String]], topDomainsBC: Broadcast[Set[String]], + userDomain: String, inDF: DataFrame): DataFrame = { val queryNameIndex = inDF.schema.fieldNames.indexOf(QueryName) val domainStatsRDD: RDD[Row] = inDF.rdd.map(row => - Row.fromTuple(createTempFields(countryCodesBC, topDomainsBC, row.getString(queryNameIndex)))) + Row.fromTuple(createTempFields(countryCodesBC, topDomainsBC, userDomain, row.getString(queryNameIndex)))) sqlContext.createDataFrame(domainStatsRDD, DomainStatsSchema) } @@ -262,15 +266,17 @@ object DNSSuspiciousConnectsModel { * * @param countryCodesBC Broadcast of the country codes set. * @param topDomainsBC Broadcast of the most-popular domains set. + * @param userDomain Domain associated to network data (ex: 'intel') * @param url URL string to anlayze for domain and subdomain information. * @return [[TempFields]] */ def createTempFields(countryCodesBC: Broadcast[Set[String]], topDomainsBC: Broadcast[Set[String]], + userDomain: String, url: String): TempFields = { val DomainInfo(_, topDomainClass, subdomain, subdomainLength, subdomainEntropy, numPeriods) = - DomainProcessor.extractDomainInfo(url, topDomainsBC) + DomainProcessor.extractDomainInfo(url, topDomainsBC, userDomain) TempFields(topDomainClass = topDomainClass, diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala index 334ae876..c5f0d73c 100644 --- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala +++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala @@ -28,10 +28,10 @@ object DomainProcessor extends Serializable { /** * Commonly extracted domain features. - * @param domain Domain (if any) of a url. - * @param topDomain Numerical class of domain: 2 for Intel, 1 for Alexa top domains, 0 for others. - * @param subdomain Subdomain (if any) in the url. - * @param subdomainLength Length of the subdomain. 0 if there is none. + * @param domain Domain (if any) of a url. + * @param topDomain Numerical class of domain: 2 for Intel, 1 for Alexa top domains, 0 for others. + * @param subdomain Subdomain (if any) in the url. + * @param subdomainLength Length of the subdomain. 0 if there is none. * @param subdomainEntropy Entropy of the subdomain viewed as a distribution on its character set. * 0 if there is no subdomain. * @param numPeriods Number of periods + 1 in the url. (Number of sub-strings where url is split by periods.) @@ -46,9 +46,9 @@ object DomainProcessor extends Serializable { /** * Extract domain info from a url. - * @param url Incoming url. - * @param topDomainsBC Broadcast variable containing the top domains set. - * @param userDomain Domain of the spot user (example:'intel'). + * @param url Incoming url. + * @param topDomainsBC Broadcast variable containing the top domains set. + * @param userDomain Domain of the spot user (example:'intel'). * @return New [[DomainInfo]] object containing extracted domain information. */ def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]], userDomain: String): DomainInfo = { diff --git a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala index c1d93f02..7e2cae67 100644 --- a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala +++ b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala @@ -57,8 +57,10 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { val topDomains = sparkContext.broadcast(TopDomains.TopDomains) + val userDomain = "intel" + // case class DerivedFields(topDomain: String, subdomainLength: Double, subdomainEntropy: Double, numPeriods: Double) - val result = extractDomainInfo(url, topDomains) + val result = extractDomainInfo(url, topDomains, userDomain) result shouldBe DomainInfo(domain = "None", topDomain = 0, subdomain = "None", subdomainLength = 0, subdomainEntropy = 0, numPeriods = 6) } @@ -69,7 +71,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { val topDomains = sparkContext.broadcast(TopDomains.TopDomains) - val result = extractDomainInfo(url, topDomains) + val userDomain = "intel" + + val result = extractDomainInfo(url, topDomains, userDomain) result shouldBe DomainInfo(domain = "amazon", topDomain = 1, subdomain = "services", subdomainLength = 8, subdomainEntropy = 2.5, numPeriods = 4) @@ -80,8 +84,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { val url = "amazon.com.mx" val countryCodes = sparkContext.broadcast(countryCodesSet) val topDomains = sparkContext.broadcast(TopDomains.TopDomains) + val userDomain = "intel" - val result = extractDomainInfo(url, topDomains) + val result = extractDomainInfo(url, topDomains, userDomain) result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 3) } @@ -91,8 +96,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { val url = "services.amazon.com" val countryCodes = sparkContext.broadcast(countryCodesSet) val topDomains = sparkContext.broadcast(TopDomains.TopDomains) + val userDomain = "intel" - val result = extractDomainInfo(url, topDomains) + val result = extractDomainInfo(url, topDomains, userDomain) result shouldBe DomainInfo(domain = "amazon", subdomain = "services", topDomain = 1, subdomainLength = 8, subdomainEntropy = 2.5, numPeriods = 3) } @@ -103,7 +109,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { val url = "amazon.com" val countryCodes = sparkContext.broadcast(countryCodesSet) val topDomains = sparkContext.broadcast(TopDomains.TopDomains) - val result = extractDomainInfo(url, topDomains) + val userDomain = "intel" + + val result = extractDomainInfo(url, topDomains, userDomain) result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 2) } From f3b3652673713fd36152605c64df9a95e140e818 Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Wed, 7 Dec 2016 16:39:18 -0800 Subject: [PATCH 04/11] More editing on SuspiciousConnectsArgumentParser to eliminate old (dropped) arguments. --- .../spot/SuspiciousConnectsArgumentParser.scala | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala index be0db305..e6f5c1cb 100644 --- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala +++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala @@ -11,11 +11,7 @@ object SuspiciousConnectsArgumentParser { feedbackFile: String = "", duplicationFactor: Int = 1, topicCount: Int = 20, - localPath: String = "", - localUser: String = "", userDomain: String = "", - ldaPath: String = "", - nodes: String = "", hdfsScoredConnect: String = "", threshold: Double = 1.0d, maxResults: Int = -1, @@ -50,18 +46,6 @@ object SuspiciousConnectsArgumentParser { action((x, c) => c.copy(topicCount = x.toInt)). text("topic count") - opt[String]("lpath").required().valueName(""). - action((x, c) => c.copy(localPath = x)). - text("Local Path") - - opt[String]("ldapath").required().valueName(""). - action((x, c) => c.copy(ldaPath = x)). - text("LDA Path") - - opt[String]("luser").required().valueName(""). - action((x, c) => c.copy(localUser = x)). - text("Local user path") - opt[String]("userdomain").required().valueName(""). action((x, c) => c.copy(userDomain = x)). text("Domain of spot user (example: intel)") From f7596ca8f13629e09d88a4d257ecb6f8d10f33de Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Wed, 7 Dec 2016 16:46:41 -0800 Subject: [PATCH 05/11] And yet more editing of SuspiciousConnectsArgumentParser.scala to remove old argument references. --- .../org/apache/spot/SuspiciousConnectsArgumentParser.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala index e6f5c1cb..4647dbf3 100644 --- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala +++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala @@ -50,10 +50,6 @@ object SuspiciousConnectsArgumentParser { action((x, c) => c.copy(userDomain = x)). text("Domain of spot user (example: intel)") - opt[String]("nodes").required().valueName(""). - action((x, c) => c.copy(nodes = x)). - text("Node list") - opt[String]("scored").required().valueName(""). action((x, c) => c.copy(hdfsScoredConnect = x)). text("HDFS path for results") From 55b0497bf3a6dbe87e659cadffcb70784cb9c1af Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Fri, 9 Dec 2016 13:56:44 -0800 Subject: [PATCH 06/11] Changed the usderDomain input to be optional in the argument parser and changed the default value of this armgument to be the empty string in spot.conf. --- spot-ml/INSTALL.md | 1 + .../org/apache/spot/SuspiciousConnectsArgumentParser.scala | 2 +- .../scala/org/apache/spot/utilities/DomainProcessor.scala | 2 +- spot-setup/spot.conf | 4 ++-- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spot-ml/INSTALL.md b/spot-ml/INSTALL.md index 505f35af..0ca8ef25 100644 --- a/spot-ml/INSTALL.md +++ b/spot-ml/INSTALL.md @@ -14,6 +14,7 @@ Names and language that we will use from the configuration variables for Spot (t - MLNODE The node from which the spot-ml routines are invoked - HUSER An HDFS user path that will be the base path for the solution; this is usually the same user that you created to run the solution - HPATH Location for storing intermediate results of the analysis on HDFS. +- USER_DOMAIN Web domain associated to the user's network (for the DNS suspicous connects analysis). For example: USER_DOMAIN='intel'. ### Prepare data for input diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala index 4647dbf3..9b0ac07c 100644 --- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala +++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala @@ -46,7 +46,7 @@ object SuspiciousConnectsArgumentParser { action((x, c) => c.copy(topicCount = x.toInt)). text("topic count") - opt[String]("userdomain").required().valueName(""). + opt[String]("userdomain").valueName(""). action((x, c) => c.copy(userDomain = x)). text("Domain of spot user (example: intel)") diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala index c5f0d73c..a60b1fb6 100644 --- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala +++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala @@ -65,7 +65,7 @@ object DomainProcessor extends Serializable { 0 } - val topDomainClass = if (domain == userDomain) { + val topDomainClass = if (userDomain != "" && domain == userDomain) { 2 } else if (topDomainsBC.value contains domain) { 1 diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf index 433afa8b..3b217ed1 100755 --- a/spot-setup/spot.conf +++ b/spot-setup/spot.conf @@ -31,8 +31,8 @@ RPATH=${LUSER}/ipython/user/${FDATE} LDAPATH=${LUSER}/ml/oni-lda-c LIPATH=${LUSER}/ingest -#domain associated to network data to be analyzed -USER_DOMAIN='intel' +#dns suspicious connects config +USER_DOMAIN='' SPK_EXEC='400' SPK_EXEC_MEM='2048m' From 13fc71863558d8d27512ca7c710fda585812a63a Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Fri, 9 Dec 2016 17:02:50 -0800 Subject: [PATCH 07/11] Added a test to DomainProcessorTest.scala to insure that an empty string for userDomain would not be taken as the legitimate user domain in the case of an empty string domain. --- .../apache/spot/utilities/DomainProcessorTest.scala | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala index 7e2cae67..5d7281ea 100644 --- a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala +++ b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala @@ -115,4 +115,14 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers { result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 2) } + it should "not identify the domain as the users domain when both are empty strings" in { + val url = "ab..com" + val countryCodes = sparkContext.broadcast(countryCodesSet) + val topDomains = sparkContext.broadcast(TopDomains.TopDomains) + val userDomain = "" + + val result = extractDomainInfo(url, topDomains, userDomain) + + result shouldBe DomainInfo(domain = "", subdomain = "ab", topDomain = 0, subdomainLength = 2, subdomainEntropy = 1, numPeriods = 3) + } } From b9cc67d1ea4579781a95d86e1d34ea49cf069b68 Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Tue, 13 Dec 2016 12:37:48 -0800 Subject: [PATCH 08/11] Modified spot.conf to contain logic that only tries to pass the userdomain option when the spot.conf variable USER_DOMAIN is non-empty. --- spot-ml/ml_ops.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh index cf1bc311..a951fe1b 100755 --- a/spot-ml/ml_ops.sh +++ b/spot-ml/ml_ops.sh @@ -45,6 +45,14 @@ else RAWDATA_PATH=${PROXY_PATH} fi +# pass the user domain designation if not empty + +if [ ! -z $USER_DOMAIN ] ; then + USER_DOMAIN_PARSER_CMD="--userdomain $USER_DOMAIN" +else + USER_DOMAIN_PARSER_CMD='' +fi + FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv DUPFACTOR=1000 @@ -97,7 +105,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \ --dupfactor ${DUPFACTOR} \ --feedback ${FEEDBACK_PATH} \ --ldatopiccount ${TOPIC_COUNT} \ - --userdomain ${USER_DOMAIN}\ + $USER_DOMAIN_PARSER_CMD \ --scored ${HDFS_SCORED_CONNECTS} \ --threshold ${TOL} \ --maxresults ${MAXRESULTS} \ From 3bf290dd0a33368486abea1d803163bbc7cd7736 Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Tue, 13 Dec 2016 16:21:47 -0800 Subject: [PATCH 09/11] Changed the position of the optional parameter to be at the end of all other parameters --- spot-ml/ml_ops.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh index a951fe1b..c5d6e431 100755 --- a/spot-ml/ml_ops.sh +++ b/spot-ml/ml_ops.sh @@ -105,11 +105,11 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \ --dupfactor ${DUPFACTOR} \ --feedback ${FEEDBACK_PATH} \ --ldatopiccount ${TOPIC_COUNT} \ - $USER_DOMAIN_PARSER_CMD \ --scored ${HDFS_SCORED_CONNECTS} \ --threshold ${TOL} \ --maxresults ${MAXRESULTS} \ - --ldamaxiterations 20 + --ldamaxiterations 20 \ + $USER_DOMAIN_PARSER_CMD wait From 452fca35bd33c291b88cd4a664f531db1ba7d7e2 Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Wed, 14 Dec 2016 08:25:30 -0800 Subject: [PATCH 10/11] Omitted an extra space after one of the jar parameter entry lines that was causing an error. --- spot-ml/ml_ops.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh index c5d6e431..ece60cd1 100755 --- a/spot-ml/ml_ops.sh +++ b/spot-ml/ml_ops.sh @@ -108,7 +108,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \ --scored ${HDFS_SCORED_CONNECTS} \ --threshold ${TOL} \ --maxresults ${MAXRESULTS} \ - --ldamaxiterations 20 \ + --ldamaxiterations 20 \ $USER_DOMAIN_PARSER_CMD wait From 1a0269f2e0f9a1297e1084c01b1991362f0a74ae Mon Sep 17 00:00:00 2001 From: Brandon Edwards Date: Wed, 14 Dec 2016 08:35:14 -0800 Subject: [PATCH 11/11] Changed USER_DOMAIN_PARSER_CMD to USER_DOMAIN_CMD --- spot-ml/ml_ops.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh index ece60cd1..02a9c1d2 100755 --- a/spot-ml/ml_ops.sh +++ b/spot-ml/ml_ops.sh @@ -48,9 +48,9 @@ fi # pass the user domain designation if not empty if [ ! -z $USER_DOMAIN ] ; then - USER_DOMAIN_PARSER_CMD="--userdomain $USER_DOMAIN" + USER_DOMAIN_CMD="--userdomain $USER_DOMAIN" else - USER_DOMAIN_PARSER_CMD='' + USER_DOMAIN_CMD='' fi FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv @@ -109,7 +109,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \ --threshold ${TOL} \ --maxresults ${MAXRESULTS} \ --ldamaxiterations 20 \ - $USER_DOMAIN_PARSER_CMD + $USER_DOMAIN_CMD wait