In [2]:
import pyspark
sc = pyspark.SparkContext('local[*]')  # 按CPU个数的本地运行模式

In [3]:
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)  # 无放回抽样

[638, 307, 697, 701, 605]

In [4]:
rdd

PythonRDD[3] at RDD at PythonRDD.scala:53

In [5]:
numbers = range(16)

In [6]:
rdd1 = sc.parallelize(numbers)

In [7]:
rdd1.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [8]:
rdd1.take(3)

[0, 1, 2]

In [9]:
import urllib.request
url = 'http://www.gutenberg.org/files/100/100-0.txt'
urllib.request.urlretrieve(url, 'shakespeare_all.txt')

('shakespeare_all.txt', <http.client.HTTPMessage at 0x22ad45d8548>)

In [20]:
sc.textFile(
    'file:///D:/Projects/python_projects/big_data/shakespeare_all.txt').take(
        10)  # 无法读取则用绝对路经

['Project Gutenberg’s The Complete Works of William Shakespeare, by William Shakespeare',
 '',
 'This eBook is for the use of anyone anywhere in the United States and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever.  You may copy it, give it away or re-use it under the terms',
 'of the Project Gutenberg License included with this eBook or online at',
 'www.gutenberg.org.  If you are not located in the United States, you’ll',
 'have to check the laws of the country where you are located before using',
 'this ebook.',
 '']

In [26]:
rdd1.saveAsTextFile('file:///D:/Projects/python_projects/big_data/numbers16.txt'
                   )  # 保存文件夹，按分区分成文本

In [27]:
def sq(x):
    return x**2


rdd1.map(sq).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225]

In [28]:
rdd1.map(lambda x: x**3).collect()

[0, 1, 8, 27, 64, 125, 216, 343, 512, 729, 1000, 1331, 1728, 2197, 2744, 3375]

In [29]:
rdd1.map(lambda x: x*10).reduce(lambda a, b: a+b)

1200

In [30]:
rdd1.map(lambda x: x*2).sum()

240

In [35]:
def tag(x):
    return 'even' if x%2==0 else 'odd'

rdd1.map(lambda x:(tag(x), x)).collect()

[('even', 0),
 ('odd', 1),
 ('even', 2),
 ('odd', 3),
 ('even', 4),
 ('odd', 5),
 ('even', 6),
 ('odd', 7),
 ('even', 8),
 ('odd', 9),
 ('even', 10),
 ('odd', 11),
 ('even', 12),
 ('odd', 13),
 ('even', 14),
 ('odd', 15)]

In [36]:
rdd1.map(lambda x:(tag(x), x)).reduceByKey(lambda a, b: a+b).collect()

[('even', 56), ('odd', 64)]

In [37]:
def emit_feats(line):
    return [('char', len(line)), ('words', len(line.split())), ('line', 1)]


sc.textFile('file:///D:/Projects/python_projects/big_data/shakespeare_all.txt'
           ).flatMap(emit_feats).reduceByKey(lambda a, b: a + b).collectAsMap()

{'line': 169442, 'char': 5389109, 'words': 961565}

In [40]:
import re
WORD_RE=re.compile(r"[\w']+")
sc.textFile('file:///D:/Projects/python_projects/big_data/shakespeare_all.txt'
           ).flatMap(lambda line: [(word.lower(), 1) for word in WORD_RE.findall(line)]).reduceByKey(lambda a, b: a+b).takeOrdered(100, key=lambda x: -x[1])

[('the', 30193),
 ('and', 28456),
 ('i', 23041),
 ('to', 21067),
 ('of', 18842),
 ('a', 16263),
 ('you', 14593),
 ('my', 13184),
 ('in', 12339),
 ('that', 12081),
 ('is', 9859),
 ('not', 9088),
 ('with', 8536),
 ('me', 8284),
 ('for', 8263),
 ('it', 8215),
 ('his', 7583),
 ('be', 7404),
 ('this', 7177),
 ('he', 7089),
 ('your', 7077),
 ('but', 6765),
 ('have', 6286),
 ('as', 6197),
 ('thou', 5878),
 ('him', 5562),
 ('so', 5466),
 ('will', 5308),
 ('what', 4992),
 ('s', 4758),
 ('her', 4603),
 ('d', 4594),
 ('thy', 4359),
 ('all', 4273),
 ('by', 4123),
 ('no', 4075),
 ('do', 3976),
 ('shall', 3848),
 ('if', 3810),
 ('we', 3762),
 ('are', 3742),
 ('on', 3438),
 ('thee', 3394),
 ('our', 3295),
 ('lord', 3190),
 ('king', 3035),
 ('now', 3026),
 ('sir', 3002),
 ('good', 2982),
 ('from', 2908),
 ('o', 2903),
 ('she', 2876),
 ('at', 2759),
 ('they', 2746),
 ('or', 2729),
 ('come', 2643),
 ('which', 2577),
 ('more', 2501),
 ('would', 2484),
 ('then', 2479),
 ('enter', 2475),
 ('was', 2450),
 (