In [4]:
from pyspark import SparkContext, SparkConf

# Configuration for a local-mode Spark application named 'textApp'.
conf = SparkConf().setMaster('local').setAppName('textApp')
# getOrCreate reuses an already-running context, so re-executing this cell
# in a live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Read the web access log from a text file; each RDD element is one log line.
rdd_text = sc.textFile('data/web_access_demo.log')

In [13]:
# Preview the first five raw log lines.
rdd_text.take(5)

['27.19.74.143 - 30/May/2013:17:38:20 +0800 "GET /static/image/common/faq.gif HTTP/1.1" 200 1127',
 '110.52.250.126 - 30/May/2013:17:38:20 +0800 "GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1" 200 1292',
 '27.19.74.143 - 30/May/2013:17:38:20 +0800 "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680',
 '27.19.74.143 - 30/May/2013:17:38:20 +0800 "GET /static/image/common/hot_2.gif HTTP/1.1" 200 682',
 '27.19.74.143 - 30/May/2013:17:38:20 +0800 "GET /static/image/filetype/common.gif HTTP/1.1" 200 90']

In [18]:
# RDD transformation: keep only the log records whose client-IP field
# (the text before the first space) is exactly the address of interest.
# A plain substring search would also accept other addresses that merely
# contain it, such as '158.210.235.70' or '58.210.235.701'.
new_rdd = rdd_text.filter(lambda line: line.split(' ', 1)[0] == '58.210.235.70')

In [19]:
# Preview up to ten of the records kept by the IP filter.
new_rdd.take(10)

['58.210.235.70 - 30/May/2013:17:38:44 +0800 "GET /forum-60-1.html HTTP/1.1" 200 94527',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /uc_server/avatar.php?uid=24427&size=middle HTTP/1.1" 301 -',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /static/image/common/none.gif HTTP/1.1" 200 43',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /static/image/common/arw.gif HTTP/1.1" 200 940',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /uc_server/avatar.php?uid=22790&size=small HTTP/1.1" 301 -',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /static/image/common/arrow_top.gif HTTP/1.1" 200 51',
 '58.210.235.70 - 30/May/2013:17:39:07 +0800 "GET /thread-8248-1-1.html HTTP/1.1" 200 125651',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /uc_server/avatar.php?uid=11&size=middle HTTP/1.1" 301 -',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /uc_server/avatar.php?uid=10997&size=middle HTTP/1.1" 301 -',
 '58.210.235.70 - 30/May/2013:17:39:09 +0800 "GET /uc_server/data/avat

In [20]:
# Tokenise every record on whitespace.
# Result: one list of fields per log line, i.e. [[...], [...], [...]]
split_rdd = new_rdd.map(lambda record: record.split())

In [44]:
# Show the field list of the first record.
split_rdd.first()

['58.210.235.70',
 '-',
 '30/May/2013:17:38:44',
 '+0800',
 '"GET',
 '/forum-60-1.html',
 'HTTP/1.1"',
 '200',
 '94527']

In [21]:
# Show the field lists of the first three records (one nested list per line).
split_rdd.take(3)

[['58.210.235.70',
  '-',
  '30/May/2013:17:38:44',
  '+0800',
  '"GET',
  '/forum-60-1.html',
  'HTTP/1.1"',
  '200',
  '94527'],
 ['58.210.235.70',
  '-',
  '30/May/2013:17:39:09',
  '+0800',
  '"GET',
  '/uc_server/avatar.php?uid=24427&size=middle',
  'HTTP/1.1"',
  '301',
  '-'],
 ['58.210.235.70',
  '-',
  '30/May/2013:17:39:09',
  '+0800',
  '"GET',
  '/static/image/common/none.gif',
  'HTTP/1.1"',
  '200',
  '43']]

In [23]:
# flatMap: split each record on whitespace and merge all resulting tokens
# into a single flat RDD of strings (no per-line nesting).
flat_split_rdd = new_rdd.flatMap(lambda record: record.split())

In [26]:
# Show the first 18 tokens: the fields of consecutive records back to back.
flat_split_rdd.take(18)

['58.210.235.70',
 '-',
 '30/May/2013:17:38:44',
 '+0800',
 '"GET',
 '/forum-60-1.html',
 'HTTP/1.1"',
 '200',
 '94527',
 '58.210.235.70',
 '-',
 '30/May/2013:17:39:09',
 '+0800',
 '"GET',
 '/uc_server/avatar.php?uid=24427&size=middle',
 'HTTP/1.1"',
 '301',
 '-']

In [27]:
# Keep only the tokens that start with a dotted-quad (IPv4-looking) prefix.
import re
# Raw string: '\d' and '\.' are regex escapes, not Python string escapes.
# Without the r-prefix, newer Python versions warn about invalid escape sequences.
ip_rdd = flat_split_rdd.filter(lambda x: re.match(r"^(\d+\.){3}\d+", x))

In [39]:
# Count the number of visits per IP, this time over the whole log:
# flatten every line into whitespace-separated tokens, then keep the tokens
# that start with a dotted-quad (IPv4-looking) prefix.
flat_split_new_rdd = rdd_text.flatMap(lambda line: line.split())
# Raw string so that '\d' and '\.' reach the regex engine without triggering
# Python's invalid-escape-sequence warning.
new_ip_rdd = flat_split_new_rdd.filter(lambda x: re.match(r"^(\d+\.){3}\d+", x))

In [40]:
# Tally how often each IP token occurs, then order the (ip, count) pairs
# from most to least frequent.
res = new_ip_rdd.countByValue()
sort_res = list(res.items())
sort_res.sort(key=lambda pair: pair[1], reverse=True)

In [41]:
# Five most frequent IPs with their hit counts.
sort_res[:5]

[('222.133.189.179', 29948),
 ('61.50.141.7', 22836),
 ('123.147.245.79', 9999),
 ('49.72.74.77', 8876),
 ('60.10.5.65', 6341)]

In [43]:
# Save the RDD of IP tokens as text output under 'data/web_ip.log'.
# NOTE(review): Spark normally refuses to write to an output path that already
# exists, so re-running this cell may fail — confirm and clear the path first if so.
new_ip_rdd.saveAsTextFile('data/web_ip.log')

In [45]:
# Select the request-path tokens, i.e. the tokens that begin with '/'.
# str.startswith accepts exactly the same tokens as the former regex
# "^(\/.*?)" (its lazy group can match the empty string) and avoids the
# invalid '\/' string escape that newer Python versions warn about.
path_rdd = flat_split_new_rdd.filter(lambda x: x.startswith('/'))

In [46]:
# Tally how often each request path occurs, then order the (path, count)
# pairs from most to least frequent.
res_path = path_rdd.countByValue()
sort_path = list(res_path.items())
sort_path.sort(key=lambda pair: pair[1], reverse=True)

In [47]:
# Five most requested paths with their hit counts.
sort_path[:5]

[('/api.php?mod=js&bid=94', 5330),
 ('/static/js/common.js?y7a', 3886),
 ('/static/image/common/logo.png', 3362),
 ('/static/image/common/security.png', 3321),
 ('/api.php?mod=js&bid=65', 3293)]

In [48]:
# Select the access-time tokens: those starting with '<digits>/<word>/',
# e.g. '30/May/2013:17:38:20'.
# Raw string avoids Python's invalid-escape-sequence warning for '\d', '\w'
# and '\/'. The trailing lazy group of the old pattern could match the empty
# string, so dropping it leaves the set of accepted tokens unchanged.
time_rdd = flat_split_new_rdd.filter(lambda x: re.match(r"\d+/\w+/", x))

In [49]:
# Tally how many requests share each timestamp token, then order the
# (timestamp, count) pairs from busiest to quietest.
res_time = time_rdd.countByValue()
sort_time = list(res_time.items())
sort_time.sort(key=lambda pair: pair[1], reverse=True)

In [50]:
# Ten busiest timestamps (one-second resolution) with their request counts.
sort_time[:10]

[('30/May/2013:19:57:05', 182),
 ('30/May/2013:23:32:18', 175),
 ('30/May/2013:17:48:18', 165),
 ('30/May/2013:18:41:32', 157),
 ('30/May/2013:19:58:38', 155),
 ('30/May/2013:17:54:08', 154),
 ('30/May/2013:19:14:05', 151),
 ('30/May/2013:19:19:14', 151),
 ('30/May/2013:20:07:22', 151),
 ('30/May/2013:18:38:28', 149)]