In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.utils import AnalysisException
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.functions import udf, mean, col
from pyspark.sql.types import StringType, StructType, StructField, FloatType
import pandas as pd
import numpy as np
import os, math, time
import itertools
import csv, sys

In [2]:
# Collects every answer (plain strings or (label, values) tuples) in order;
# the entries are written out one per line at the end of the notebook.
result = list()

In [3]:
# SAVE selects the destination of the answers: result.txt when True, stdout otherwise.
SAVE = True
if SAVE:
    file = open('result.txt', 'w', encoding='utf-8', newline='\n')
else:
    file = sys.stdout

In [4]:
# Wall-clock start of the run; the last cell subtracts it from the current
# time to report the total duration in minutes.
t0 = time.time()

In [5]:
# Spark configuration for the "quiz" application.
# Cluster alternative for the master URL: 'spark://10.100.5.182:7077'
spark_master = "local[4]"
conf = SparkConf().setMaster(spark_master).set("spark.executor.memory", "8g").setAppName("quiz")

In [6]:
# getOrCreate() returns the already-running SparkContext when this cell is
# executed again and creates one otherwise.  This replaces the former
# `except ValueError: pass`, which hid every ValueError and could leave
# `sc` / `sql_sc` undefined for the cells below.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used by read_csv() to load DataFrames.
sql_sc = SQLContext(sc)

In [7]:
def read_csv(file_name):
    """Read a header-less, comma-separated file into a Spark DataFrame.

    Malformed rows are dropped (mode='DROPMALFORMED').  The path is first
    tried exactly as given; if that read raises AnalysisException, the same
    file name is read from 'hdfs:///bdm/quiz/' instead.

    Args:
        file_name: path (or bare file name) of the CSV file.

    Returns:
        The DataFrame produced by ``sql_sc.read.csv``.

    Raises:
        AnalysisException: if the HDFS fallback read fails as well.
    """
    reader_options = dict(sep=',', header=False, mode='DROPMALFORMED')
    try:
        return sql_sc.read.csv(file_name, **reader_options)
    except AnalysisException:
        hdfs_path = 'hdfs:///bdm/quiz/{}'.format(file_name)
        return sql_sc.read.csv(hdfs_path, **reader_options)

In [8]:
# Load the raw records.  read_csv() tries this path as given and switches to
# the copy under hdfs:///bdm/quiz/ when the first read raises AnalysisException.
data = read_csv('kddcup.data_10_percent')

In [9]:
# Names for the 42 positional CSV columns, in file order.  The (misspelt)
# variable name is kept because the renaming cell below refers to it.
cloumnnames = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins logged_in
    num_compromised root_shell su_attempted num_root
    num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate
    dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate
    intrusion_type
""".split()

In [10]:
# The file is read without a header, so its columns arrive as _c0 ... _c41;
# give each one its descriptive name.
for position, column_name in zip(range(42), cloumnnames):
    data = data.withColumnRenamed('_c%d' % position, column_name)

# Columns used numerically by the questions below are cast to float.
float_columns = ('duration', 'src_bytes', 'dst_bytes', 'num_failed_logins',
                 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                 'dst_host_count', 'dst_host_srv_count',
                 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate')
for column_name in float_columns:
    data = data.withColumn(column_name, data[column_name].cast('float'))

In [11]:
# Discard every row that contains a null in any column.
data = data.dropna()

In [12]:
#data = data.sample(False, 0.001, 42)

### (1) For continuous attributes ‘duration’, ‘src_bytes’, ‘dst_bytes’, ‘num_failed_logins’, please calculate their mean, median, mode, standard deviation, respectively

In [13]:
# def find_median(values_list):
#     try:
#         median = np.median(values_list) #get the median of values in a list in each row
#         return round(float(median),2)
#     except Exception:
#         return None #if there is anything wrong with the given values

# median_finder = udf(find_median, FloatType())

In [14]:
# Continuous attributes analysed in question 1.  The (misspelt) variable
# name is kept because the median and mode cells refer to it.
q1_feature_nemaes = 'duration src_bytes dst_bytes num_failed_logins'.split()

In [15]:
# Narrow the frame to the four continuous attributes of question 1.
q1 = data.select('duration', 'src_bytes', 'dst_bytes', 'num_failed_logins')

In [16]:
#print("%s"%(q1.describe().show()), file=file)

In [17]:
# describe() yields one row per summary statistic (count, mean, stddev, min,
# max per the Spark documentation); collect() brings those rows to the driver.
temp = q1.describe().collect()

In [18]:
# Header line naming the columns of the summary rows appended below.
result.append('|summary|         duration|         src_bytes|         dst_bytes|   num_failed_logins|')
# Rows 1 and 2 of the describe() output are added as comma-separated lines.
# A dedicated loop variable is used so that `temp` (the collected rows) is
# not overwritten with a string, which would break a second run of this cell.
for summary_row in temp[1:3]:
    result.append(', '.join(summary_row))

In [19]:
# Median of each continuous attribute: the column is collected to the driver
# and handed to NumPy.
for feature in q1_feature_nemaes:
    column_rows = data.select(feature).rdd.collect()
    median_line = '[Q1] %s median:%f' % (feature, np.median(column_rows))
    result.append(median_line)

In [20]:
# Mode of each continuous attribute: count the occurrences of every distinct
# value (keyed by its string form) and keep the most frequent one.
for feature in q1_feature_nemaes:
    occurrences = (q1.select(feature).rdd
                   .map(lambda row: (str(row[0]), 1))
                   .reduceByKey(lambda left, right: left + right))
    most_frequent = occurrences.sortBy(lambda pair: pair[1], ascending=False).take(1)
    result.append(('[Q1] %s mode is' % (feature), most_frequent))

### (2) For symbolic attributes ‘protocol_type’, ‘service’, ‘flag’, ‘logged_in’, ‘intrusion_type’, output the list of each value and the corresponding frequency count, sorted in descending order of the count


In [21]:
# Symbolic attributes whose value frequencies are listed in question 2.
q2_feature_names = [
    'protocol_type',
    'service',
    'flag',
    'logged_in',
    'intrusion_type',
]

In [22]:
# (data.select('protocol_type').rdd
# .map(lambda x: (x[0], 1))
# .reduceByKey(lambda a,b:a+b)
# .sortBy(lambda w: w[1], ascending=False)
# .collect())

In [23]:
# For every symbolic attribute, list each distinct value with its number of
# occurrences, most frequent first.
for feature in q2_feature_names:
    value_counts = (data.select(feature).rdd
                    .map(lambda row: (row[0], 1))
                    .reduceByKey(lambda left, right: left + right))
    ranked_counts = value_counts.sortBy(lambda pair: pair[1], ascending=False).collect()
    result.append(('[Q2] column:%s value and the corresponding frequency count' % (feature), ranked_counts))

### (3) Output the list of the most frequently used ‘service’ for each ‘intrusion_type’, sorted in descending order of the occurrence frequency

In [24]:
# Most frequently used service per intrusion type, listed in descending order
# of that service's occurrence count.
q3_result = (data.select('service', 'intrusion_type').rdd
             # count every (service, intrusion_type) combination
             .map(lambda row: ((row[0], row[1]), 1))
             .reduceByKey(lambda a, b: a + b)
             # re-key by intrusion_type -> (service, count)
             .map(lambda pair: (pair[0][1], (pair[0][0], pair[1])))
             # keep only the highest-count service of each intrusion type
             # (no need to materialise and sort the whole group)
             .reduceByKey(lambda best, other: best if best[1] >= other[1] else other)
             # order the intrusion types by that count, largest first, as the
             # question requires; previously the rows came back unsorted
             .sortBy(lambda pair: pair[1][1], ascending=False)
             .collect())
for intrusion_type, r in q3_result:
    # r is the (service, count) pair selected above
    temp = ('[Q3]intrusion_type:[%15s],  most frequently used service is:%20s'%(intrusion_type, str(r)))
    result.append(temp)

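### (4) If we regard the values of ‘src_bytes’ and ‘num_failed_logins’ as two attributes $A$ and $B$, calculate the correlation coefficient of ‘src_bytes’ and ‘num_failed_logins’ by the following formula:

$$r_{A,B} = \frac{\sum_{i=1}^{n} a_i b_i - n\,\bar{A}\,\bar{B}}{(n-1)\,\sigma_A\,\sigma_B}$$

where $n$ is the number of records, $\bar{A}$ and $\bar{B}$ are the means of the two attributes, and $\sigma_A$, $\sigma_B$ are their standard deviations.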


In [25]:
# The two attributes whose correlation is computed in question 4.
q4 = data.select('src_bytes', 'num_failed_logins')

In [26]:
# Bring both columns to the driver as a NumPy array with one row per record:
# column 0 is src_bytes, column 1 is num_failed_logins (the select order of q4).
corr_coe = np.array(q4.collect())

In [27]:
#q4_r = np.corrcoef(corr_coe[:,0], corr_coe[:, 1])

In [28]:
# Statistics needed by the correlation formula, derived from the rows that
# were actually loaded rather than hard-coded, so the coefficient stays
# correct for whatever file or sample `data` holds.
# NOTE(review): the former literals (n = 4898431, mean 1834.62..., etc.) do not
# obviously belong to the 'kddcup.data_10_percent' file read above - confirm.
n = corr_coe.shape[0]                               # number of records
A_ = float(np.mean(corr_coe[:, 0]))                 # mean of src_bytes
B_ = float(np.mean(corr_coe[:, 1]))                 # mean of num_failed_logins
# ddof=1 gives the sample standard deviation, matching the (n - 1) factor in
# the formula's denominator.
stddev_A = float(np.std(corr_coe[:, 0], ddof=1))    # std-dev of src_bytes
stddev_B = float(np.std(corr_coe[:, 1], ddof=1))    # std-dev of num_failed_logins

In [29]:
# Correlation coefficient: (sum of a_i * b_i  -  n * mean_A * mean_B)
# divided by ((n - 1) * stddev_B * stddev_A).
src_bytes_values = corr_coe[:, 0]
failed_login_values = corr_coe[:, 1]
cross_product_sum = np.sum(src_bytes_values * failed_login_values)
q4_r = (cross_product_sum - (n * A_ * B_)) / ((n - 1) * stddev_B * stddev_A)

In [30]:
# Record the question 4 answer as one formatted line.
q4_line = '[Q4] correlation coefficient is %f' % q4_r
result.append(q4_line)

### (5) bonus
(5) Which ‘intrusion type’ has the highest value for each of the following fields: 

same_srv_rate

diff_srv_rate

srv_diff_host_rate

dst_host_count

dst_host_srv_count

dst_host_same_srv_rate

dst_host_diff_srv_rate


In [31]:
# Fields for which the record holding the largest value is looked up.
bonus_fields = ['same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                'dst_host_count', 'dst_host_srv_count',
                'dst_host_same_srv_rate', 'dst_host_diff_srv_rate']
for f in bonus_fields:
    # first() asks Spark for the top row only.  The previous collect()[0]
    # transferred the entire sorted dataset to the driver for every field
    # just to read its first element.
    # NOTE(review): when several rows share the maximum value, which of them
    # is returned is not determined by this sort - confirm whether ties matter.
    r = data.sort(col(f), ascending=False).first()
    temp = ('[Bonus] %s\'s %s has the highest value'%(f, r['intrusion_type']))
    result.append(temp)

In [37]:
# Emit every collected answer on its own line (tuples are written via str()).
file.writelines(str(entry) + '\n' for entry in result)

In [38]:
# Close only the handle this notebook opened itself.  With SAVE = False,
# `file` is sys.stdout, and closing it would break the print() in the final cell.
if file is not sys.stdout:
    file.close()

In [39]:
# Shut down the SparkContext now that all questions have been answered.
sc.stop()

In [40]:
# Report the total wall-clock time since t0 was taken, in minutes.
elapsed_minutes = (time.time() - t0) / 60
print('cost {:.3f} minutes'.format(elapsed_minutes))

cost 7.976 minutes
