In [3]:
# findspark must run before any pyspark import: init() locates the local Spark
# installation and makes the pyspark package importable in this kernel.
import findspark
findspark.init()

In [4]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [5]:
# Reuse the running context/session when there is one. Calling SparkContext()
# directly raises "Cannot run multiple SparkContexts at once" if this cell is
# executed a second time in the same kernel.
sc = SparkContext.getOrCreate()
# The builder attaches to the active SparkContext created (or reused) above.
spark = SparkSession.builder.getOrCreate()

In [5]:
# Distribute a local Python list as an RDD.
rdd = sc.parallelize([1,2,3])

In [7]:
# collect() brings every element back to the driver as a Python list.
rdd.collect()

[1, 2, 3]

In [9]:
# RDD elements can be any Python objects, here strings.
rdd2 = sc.parallelize(['cat','dog','wolf'])

In [None]:
# Return all elements of rdd2 to the driver as a list.
rdd2.collect()

In [13]:
# first() returns only the first element of the RDD.
rdd2.first()

'cat'

In [15]:
# The set literal collapses the repeated strings before Spark sees them, so the
# RDD holds three distinct elements. A set is unordered, so the collected order
# need not match the order written here.
s = {"cat","cat","dog","cat","wolf","wolf"}
rdd3 = sc.parallelize(s)
rdd3.collect()

['dog', 'cat', 'wolf']

In [16]:
# Iterating a dict yields its keys, so only the keys end up in the RDD;
# the values 100/200/300 are not distributed.
d = {
    'a':100,
    'b':200,
    "c":300
}
rdd4 = sc.parallelize(d)
rdd4.collect()

['a', 'b', 'c']

External File

In [6]:
# Read the CSV as an RDD of raw text lines (one string per line, header included).
# The data directory can be overridden with LEARNING_SPARK_DATA so the notebook
# is not tied to one machine; the default is the original location.
text = sc.textFile(os.path.join(
    os.environ.get("LEARNING_SPARK_DATA", "/Users/liziwei/Desktop/bigdata/learningSpark/data"),
    "mtcars.csv",
))

In [7]:
# Peek at the first five lines without collecting the whole file.
text.take(5)

[',mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4',
 'Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1',
 'Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1']

In [8]:
# Read the tweet file as an RDD of raw text lines.
# The data directory can be overridden with LEARNING_SPARK_DATA so the notebook
# is not tied to one machine; the default is the original location.
twit = sc.textFile(os.path.join(
    os.environ.get("LEARNING_SPARK_DATA", "/Users/liziwei/Desktop/bigdata/learningSpark/data"),
    "twitter.txt",
))

In [10]:
# Peek at the first five lines; each line is one tab-separated record.
twit.take(5)

['Fresh install of XP on new computer. Sweet relief! fuck vista\t1018769417\t1.0',
 'Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl\t10284216536\t1.0',
 '"Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting."\t10298589026\t1.0',
 'Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever!\t109017669432377344\t1.0',
 "'Cheap Eats in SLP' - http://t.co/4w8gRp7\t109642968603963392\t1.0"]

# Loading Data
## Parallelized Collections

In [12]:
# Sample RDDs used by the sections below. Note: these rebind rdd..rdd4, which
# earlier cells bound to different data.
rdd = sc.parallelize([
    ("a", 7),
    ("a", 2),
    ("b", 2),
])
rdd2 = sc.parallelize([
    ("a", 2),
    ("d", 1),
    ("b", 1),
])
rdd3 = sc.parallelize(range(100))  # the integers 0..99
rdd4 = sc.parallelize([
    ("a", ['x', 'y', 'z']),
    ("b", ['p', 'r']),
])

# Basic Information

In [13]:
# Number of partitions the RDD is split into.
rdd.getNumPartitions()

4

In [14]:
# Total number of elements in the RDD.
rdd.count()

3

In [15]:
# Count how many pairs share each key; the result is a dict on the driver.
rdd.countByKey()

defaultdict(int, {'a': 2, 'b': 1})

In [16]:
# Count occurrences of each distinct element; here an element is the whole (key, value) tuple.
rdd.countByValue()

defaultdict(int, {('a', 7): 1, ('a', 2): 1, ('b', 2): 1})

In [17]:
# Collect the pairs into a dict. Keys must be unique in a dict, so for the
# duplicated key 'a' only one of its values is kept.
rdd.collectAsMap()

{'a': 2, 'b': 2}

In [18]:
# Return all (key, value) pairs to the driver as a list.
rdd.collect()

[('a', 7), ('a', 2), ('b', 2)]

In [19]:
# Sum of all elements (0 + 1 + ... + 99).
rdd3.sum()

4950

In [21]:
# isEmpty() reports whether the RDD has no elements at all.
sc.parallelize([]).isEmpty()

True

# Summary

In [23]:
# Largest element.
rdd3.max()

99

In [24]:
# Smallest element.
rdd3.min()

0

In [25]:
# Arithmetic mean of the elements.
rdd3.mean()

49.5

In [27]:
# Variance of the elements; this is the population variance (divides by N,
# not N - 1), which is why 0..99 gives 833.25.
rdd3.variance()

833.25

In [29]:
# One-pass summary: count, mean, stdev, max and min together.
rdd3.stats()

(count: 100, mean: 49.5, stdev: 28.86607004772212, max: 99.0, min: 0.0)

# Applying Functions

In [31]:
# map() yields exactly one output per input: each (key, value) tuple is
# extended with its own elements swapped, giving a 4-tuple.
rdd.map(lambda pair: pair + (pair[1], pair[0])).collect()

[('a', 7, 7, 'a'), ('a', 2, 2, 'a'), ('b', 2, 2, 'b')]

In [33]:
# flatMap() builds the same 4-tuples as the map() example but flattens them,
# so the result is one flat sequence of individual items.
rdd5 = rdd.flatMap(lambda pair: pair + (pair[1], pair[0]))
rdd5.collect()

['a', 7, 7, 'a', 'a', 2, 2, 'a', 'b', 2, 2, 'b']

In [34]:
# Expand each key's list into one (key, item) pair per item; the key is kept.
rdd4.flatMapValues(lambda values: values).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

# Selecting Data
### Getting

In [35]:
# Every element, returned to the driver.
rdd.collect()

[('a', 7), ('a', 2), ('b', 2)]

In [36]:
# Only the first two elements.
rdd.take(2)

[('a', 7), ('a', 2)]

In [37]:
# Only the first element.
rdd.first()

('a', 7)

In [40]:
# The two largest elements in descending order. Tuples compare element by
# element, so ('b', 2) ranks above ('a', 7).
rdd.top(2)

[('b', 2), ('a', 7)]

### Sampling

In [41]:
# Draw a random sample without replacement. `fraction` is the expected share
# of elements kept (not an exact size); the fixed seed makes the draw repeatable.
rdd3.sample(withReplacement=False, fraction=0.15, seed=81).collect()

[3, 4, 26, 30, 39, 40, 41, 42, 52, 63, 76, 79, 80, 86, 97]

### Filter

In [43]:
# Keep the pairs that contain "a". The membership test looks at the whole
# tuple, so it would match "a" as a key or as a value.
rdd.filter(lambda pair: "a" in pair).collect()

[('a', 7), ('a', 2)]

In [44]:
# Remove duplicate elements; the result order is not the input order.
rdd5.distinct().collect()

['b', 'a', 2, 7]

In [45]:
# Just the key of every pair (duplicates are kept).
rdd.keys().collect()

['a', 'a', 'b']

# Reshaping Data

### Reducing

In [49]:
# Merge the values of each key with addition, producing one (key, sum) pair per key.
rdd.reduceByKey(lambda total, value: total + value).collect()

[('b', 2), ('a', 9)]

In [50]:
# reduce() folds whole elements together; the elements are tuples, so `+`
# concatenates them into one long tuple instead of adding numbers.
rdd.reduce(lambda left, right: left + right)

('a', 7, 'a', 2, 'b', 2)

### Grouping by

In [51]:
# Group the integers by parity (key 0 = even, key 1 = odd). mapValues(list)
# turns each group's iterable into a plain list so it can be displayed.
rdd3.groupBy(lambda n: n % 2).mapValues(list).collect()

[(0,
  [0,
   2,
   4,
   6,
   8,
   10,
   12,
   14,
   16,
   18,
   20,
   22,
   24,
   26,
   28,
   30,
   32,
   34,
   36,
   38,
   40,
   42,
   44,
   46,
   48,
   50,
   52,
   54,
   56,
   58,
   60,
   62,
   64,
   66,
   68,
   70,
   72,
   74,
   76,
   78,
   80,
   82,
   84,
   86,
   88,
   90,
   92,
   94,
   96,
   98]),
 (1,
  [1,
   3,
   5,
   7,
   9,
   11,
   13,
   15,
   17,
   19,
   21,
   23,
   25,
   27,
   29,
   31,
   33,
   35,
   37,
   39,
   41,
   43,
   45,
   47,
   49,
   51,
   53,
   55,
   57,
   59,
   61,
   63,
   65,
   67,
   69,
   71,
   73,
   75,
   77,
   79,
   81,
   83,
   85,
   87,
   89,
   91,
   93,
   95,
   97,
   99])]

In [52]:
# Gather all values that share a key; mapValues(list) materialises each
# group's iterable as a list for display.
rdd.groupByKey().mapValues(list).collect()

[('b', [2]), ('a', [7, 2])]

### Aggregating

In [53]:
def seqOp(x, y):
    """Fold one element ``y`` into the running ``(sum, count)`` pair ``x``."""
    running_sum = x[0] + y
    running_count = x[1] + 1
    return (running_sum, running_count)

In [55]:
def combOp(x, y):
    """Merge two partial ``(sum, count)`` pairs by adding them component-wise."""
    merged_sum = x[0] + y[0]
    merged_count = x[1] + y[1]
    return (merged_sum, merged_count)

In [57]:
# Compute (sum, count) over the whole RDD: (0,0) is the starting accumulator,
# seqOp folds each element into it, and combOp merges the partial accumulators.
rdd3.aggregate((0,0),seqOp,combOp)

(4950, 100)

In [59]:
# Same (sum, count) aggregation as above, but kept separately for each key.
rdd.aggregateByKey((0,0),seqOp,combOp).collect()

[('b', (2, 1)), ('a', (9, 2))]

In [66]:
# Turn each element n into a (key, n) pair where the key is n doubled.
rdd3.keyBy(lambda n: n + n).collect()

[(0, 0),
 (2, 1),
 (4, 2),
 (6, 3),
 (8, 4),
 (10, 5),
 (12, 6),
 (14, 7),
 (16, 8),
 (18, 9),
 (20, 10),
 (22, 11),
 (24, 12),
 (26, 13),
 (28, 14),
 (30, 15),
 (32, 16),
 (34, 17),
 (36, 18),
 (38, 19),
 (40, 20),
 (42, 21),
 (44, 22),
 (46, 23),
 (48, 24),
 (50, 25),
 (52, 26),
 (54, 27),
 (56, 28),
 (58, 29),
 (60, 30),
 (62, 31),
 (64, 32),
 (66, 33),
 (68, 34),
 (70, 35),
 (72, 36),
 (74, 37),
 (76, 38),
 (78, 39),
 (80, 40),
 (82, 41),
 (84, 42),
 (86, 43),
 (88, 44),
 (90, 45),
 (92, 46),
 (94, 47),
 (96, 48),
 (98, 49),
 (100, 50),
 (102, 51),
 (104, 52),
 (106, 53),
 (108, 54),
 (110, 55),
 (112, 56),
 (114, 57),
 (116, 58),
 (118, 59),
 (120, 60),
 (122, 61),
 (124, 62),
 (126, 63),
 (128, 64),
 (130, 65),
 (132, 66),
 (134, 67),
 (136, 68),
 (138, 69),
 (140, 70),
 (142, 71),
 (144, 72),
 (146, 73),
 (148, 74),
 (150, 75),
 (152, 76),
 (154, 77),
 (156, 78),
 (158, 79),
 (160, 80),
 (162, 81),
 (164, 82),
 (166, 83),
 (168, 84),
 (170, 85),
 (172, 86),
 (174, 87),
 (176, 88

# Mathematical Operations

In [68]:
# Elements of rdd that do not appear in rdd2; whole (key, value) tuples are
# compared, so ('a', 2), present in both, is dropped.
rdd.subtract(rdd2).collect()

[('b', 2), ('a', 7)]

In [70]:
# Pairs of rdd2 whose key does not occur in rdd; values are ignored for the comparison.
rdd2.subtractByKey(rdd).collect()

[('d', 1)]

In [71]:
# Every combination of one element from rdd with one element from rdd2 (3 x 3 = 9 pairs).
rdd.cartesian(rdd2).collect()

[(('a', 7), ('a', 2)),
 (('a', 7), ('d', 1)),
 (('a', 7), ('b', 1)),
 (('a', 2), ('a', 2)),
 (('a', 2), ('d', 1)),
 (('a', 2), ('b', 1)),
 (('b', 2), ('a', 2)),
 (('b', 2), ('d', 1)),
 (('b', 2), ('b', 1))]

# Sort

In [73]:
# Sort the pairs in ascending order of their value (second tuple item).
rdd2.sortBy(lambda pair: pair[1]).collect()

[('d', 1), ('b', 1), ('a', 2)]

In [74]:
# Sort the pairs in ascending order of their key.
rdd2.sortByKey().collect()

[('a', 2), ('b', 1), ('d', 1)]