# 本例只是作为算法示例，生产环境中不建议使用，因为这种排序的代价极大，特别是在大数据环境下。

In [1]:
import pyspark,random,datetime

In [2]:
# Base timestamp: midnight on 1 January 2017.
d = datetime.datetime(2017, 1, 1, 0, 0, 0)

In [3]:
# Generate 10000 synthetic (level, timestamp) records: a random level
# paired with the base time shifted by up to 1000000 seconds.
ls = [
    (
        random.choice(["info", "warning", "error"]),
        d + datetime.timedelta(seconds=random.randint(0, 1000000)),
    )
    for _ in range(10000)
]

In [4]:
# Peek at the generated records at indices 1 through 4.
ls[1:5]

[('warning', datetime.datetime(2017, 1, 11, 6, 5, 27)),
 ('info', datetime.datetime(2017, 1, 6, 16, 35, 31)),
 ('info', datetime.datetime(2017, 1, 4, 18, 29, 55)),
 ('info', datetime.datetime(2017, 1, 4, 21, 14, 8))]

In [5]:
# Reuse the running SparkContext if there is one; constructing a second
# context (for example when this cell is re-run) raises an error.
sc = pyspark.SparkContext.getOrCreate()

In [6]:
# Turn the local list into an RDD and mark it for caching, since it is
# reused by several of the following cells.
rdd = sc.parallelize(ls).cache()

In [7]:
# Fetch the first five records of the RDD for a quick look.
rdd.take(5)

[('error', datetime.datetime(2017, 1, 1, 18, 20, 56)),
 ('warning', datetime.datetime(2017, 1, 11, 6, 5, 27)),
 ('info', datetime.datetime(2017, 1, 6, 16, 35, 31)),
 ('info', datetime.datetime(2017, 1, 4, 18, 29, 55)),
 ('info', datetime.datetime(2017, 1, 4, 21, 14, 8))]

In [8]:
# Group the (level, timestamp) pairs by level: one entry per level whose
# value is an iterable of that level's timestamps.
rdd2 = rdd.groupByKey()

## 分组，数据如下：

In [9]:
# Print each level followed by the first ten timestamps of its group.
for level, stamps in rdd2.collect():
    print(level)
    print(list(stamps)[:10])

info
[datetime.datetime(2017, 1, 6, 16, 35, 31), datetime.datetime(2017, 1, 4, 18, 29, 55), datetime.datetime(2017, 1, 4, 21, 14, 8), datetime.datetime(2017, 1, 6, 23, 47, 1), datetime.datetime(2017, 1, 4, 19, 57, 31), datetime.datetime(2017, 1, 12, 6, 11, 57), datetime.datetime(2017, 1, 7, 13, 32, 47), datetime.datetime(2017, 1, 6, 12, 47, 49), datetime.datetime(2017, 1, 10, 16, 41, 53), datetime.datetime(2017, 1, 6, 0, 53, 26)]
error
[datetime.datetime(2017, 1, 1, 18, 20, 56), datetime.datetime(2017, 1, 5, 8, 47, 43), datetime.datetime(2017, 1, 11, 5, 0, 27), datetime.datetime(2017, 1, 4, 9, 9, 10), datetime.datetime(2017, 1, 4, 18, 31, 10), datetime.datetime(2017, 1, 10, 16, 55, 1), datetime.datetime(2017, 1, 11, 20, 0, 36), datetime.datetime(2017, 1, 1, 17, 56, 23), datetime.datetime(2017, 1, 2, 5, 45, 22), datetime.datetime(2017, 1, 8, 6, 8, 42)]
warning
[datetime.datetime(2017, 1, 11, 6, 5, 27), datetime.datetime(2017, 1, 2, 13, 35, 8), datetime.datetime(2017, 1, 8, 12, 37, 24), datetime

## 定义一个map方法，进行排序，并且取出前10条

### 再次强调，在大数据环境下，这种方式无论是性能还是效果都很不实用

In [10]:
def mymap(x, n=10):
    """Return the ``n`` smallest values of one grouped record.

    Args:
        x: A ``(key, values)`` pair; ``values`` may be any iterable of
            mutually comparable items.
        n: How many of the smallest values to keep. Defaults to 10.

    Returns:
        A ``(key, values)`` tuple whose ``values`` is a list of at most
        ``n`` items in ascending order.
    """
    # Imported locally so the function does not rely on a module-level
    # import being present wherever it runs.
    import heapq

    key, values = x[0], x[1]
    # Equivalent to sorted(values)[:n], without first copying the group
    # into a list and sorting all of it.
    return (key, heapq.nsmallest(n, values))

In [11]:
# Apply mymap to every group and bring the results back to the driver.
rdd2.map(mymap).collect()

[('info',
  [datetime.datetime(2017, 1, 1, 0, 4, 56),
   datetime.datetime(2017, 1, 1, 0, 11, 43),
   datetime.datetime(2017, 1, 1, 0, 12, 36),
   datetime.datetime(2017, 1, 1, 0, 13, 35),
   datetime.datetime(2017, 1, 1, 0, 22, 40),
   datetime.datetime(2017, 1, 1, 0, 24, 20),
   datetime.datetime(2017, 1, 1, 0, 30, 31),
   datetime.datetime(2017, 1, 1, 0, 40, 23),
   datetime.datetime(2017, 1, 1, 0, 46, 8),
   datetime.datetime(2017, 1, 1, 0, 46, 50)]),
 ('error',
  [datetime.datetime(2017, 1, 1, 0, 1, 31),
   datetime.datetime(2017, 1, 1, 0, 7, 21),
   datetime.datetime(2017, 1, 1, 0, 9, 19),
   datetime.datetime(2017, 1, 1, 0, 11, 56),
   datetime.datetime(2017, 1, 1, 0, 23, 38),
   datetime.datetime(2017, 1, 1, 0, 39, 26),
   datetime.datetime(2017, 1, 1, 0, 44, 17),
   datetime.datetime(2017, 1, 1, 0, 47, 15),
   datetime.datetime(2017, 1, 1, 0, 53, 58),
   datetime.datetime(2017, 1, 1, 0, 54, 9)]),
 ('warning',
  [datetime.datetime(2017, 1, 1, 0, 0, 37),
   datetime.datetime(2017, 1, 1, 0, 1

## 以上场景在海量数据情况下，应使用 filter + sortBy 算子

In [13]:
# Keep only the "info" records, order them by timestamp and fetch the ten
# earliest. The timestamp alone is a sufficient sort key: every record
# left after the filter has the same level, so using the whole record as
# a tie-breaker added nothing.
# NOTE(review): rdd.takeOrdered(10, key=lambda x: x[1]) should return the
# same rows without sorting the whole RDD - worth confirming and adopting.
rdd.filter(lambda x: x[0] == "info") \
    .sortBy(lambda x: x[1]) \
    .take(10)

[('info', datetime.datetime(2017, 1, 1, 0, 4, 56)),
 ('info', datetime.datetime(2017, 1, 1, 0, 11, 43)),
 ('info', datetime.datetime(2017, 1, 1, 0, 12, 36)),
 ('info', datetime.datetime(2017, 1, 1, 0, 13, 35)),
 ('info', datetime.datetime(2017, 1, 1, 0, 22, 40)),
 ('info', datetime.datetime(2017, 1, 1, 0, 24, 20)),
 ('info', datetime.datetime(2017, 1, 1, 0, 30, 31)),
 ('info', datetime.datetime(2017, 1, 1, 0, 40, 23)),
 ('info', datetime.datetime(2017, 1, 1, 0, 46, 8)),
 ('info', datetime.datetime(2017, 1, 1, 0, 46, 50))]