In [1]:
sc

In [2]:
spark

### 1. Create new Spark session.

In [4]:
sc.stop()

In [5]:
from pyspark import SparkContext,SparkConf
# 2 in local means the number of cores allocated
# setMaster() used to set Spark Context Manager which is local [core_of_cpu]
config =  SparkConf().setMaster("local[2]").setAppName("RDDSession")

sc = SparkContext(conf = config)

In [6]:
sc

In [10]:
from pyspark.sql import SparkSession
# Here spark searches for the spark session and if found get it else creates one.
spark = SparkSession.builder.appName("SQLSession").getOrCreate()

In [11]:
spark

### 2. Create RDD
    * create RDD using sc.parallelize() using collections like numpy array, lists, tuples, etc.
   

In [13]:
rdd1 = sc.parallelize([10,20,30,40,50,60,70,80,90])

In [14]:
# sc.parallelize created spark rdd object
print(rdd1)

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195


In [17]:
rdd1.collect()

[10, 20, 30, 40, 50, 60, 70, 80, 90]

In [18]:
rdd1.take(5) # take(num of values as per the input count)

[10]

In [19]:
rdd1.count() # return the total count of values present in rdd

9

In [22]:
rdd1.distinct()

PythonRDD[9] at RDD at PythonRDD.scala:53

In [263]:
rdd1.getNumPartitions() # Number of RDD partitions = Num of Workers

2

### 3. Narrow Transformation

In [23]:
# Map transformation which applies to all individual values
rdd2 = rdd1.map(lambda val : val * 2)

In [24]:
rdd2.collect()

[20, 40, 60, 80, 100, 120, 140, 160, 180]

In [27]:
rdd3 = rdd1.map(lambda val : str(val) + " Number")

In [28]:
rdd3.collect()

['10 Number',
 '20 Number',
 '30 Number',
 '40 Number',
 '50 Number',
 '60 Number',
 '70 Number',
 '80 Number',
 '90 Number']

In [35]:
# Filter applies condition to each element of RDD
# All tranformation method like map() or filter() expect a function as an input.
rdd4 = rdd1.filter(lambda x: x >= 30)
rdd4.collect()

[30, 40, 50, 60, 70, 80, 90]

In [36]:
# Create an RDD using range()
rdd5 = sc.parallelize(range(1,30))
print(rdd5.take(10))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [39]:
rdd6 = rdd5.filter(lambda x:x%2 == 0).map(lambda x:str(x) + " is even.")
rdd6.collect()

['2 is even.',
 '4 is even.',
 '6 is even.',
 '8 is even.',
 '10 is even.',
 '12 is even.',
 '14 is even.',
 '16 is even.',
 '18 is even.',
 '20 is even.',
 '22 is even.',
 '24 is even.',
 '26 is even.',
 '28 is even.']

In [42]:
city_rdd = sc.parallelize(["Delhi,Kochi,Kolkata,Vizag,Varkala",
                          "Chennai,Bangalore,Pune,Dubai,New York",
                          "Berlin,Venice,Mumbai,Kochi,Kottayam"])

In [43]:
city_rdd.collect()

['Delhi,Kochi,Kolkata,Vizag,Varkala',
 'Chennai,Bangalore,Pune,Dubai,New York',
 'Berlin,Venice,Mumbai,Kochi,Kottayam']

In [49]:
city_names = city_rdd.map(lambda val:val.split(','))
city_names.collect()

[['Delhi', 'Kochi', 'Kolkata', 'Vizag', 'Varkala'],
 ['Chennai', 'Bangalore', 'Pune', 'Dubai', 'New York'],
 ['Berlin', 'Venice', 'Mumbai', 'Kochi', 'Kottayam']]

In [56]:
city_names_2 = city_names.flatMap(lambda val: val)
city_names_2.collect()

['Delhi',
 'Kochi',
 'Kolkata',
 'Vizag',
 'Varkala',
 'Chennai',
 'Bangalore',
 'Pune',
 'Dubai',
 'New York',
 'Berlin',
 'Venice',
 'Mumbai',
 'Kochi',
 'Kottayam']

In [62]:
# Union - Union of two RDDs
rdd7 = rdd1.filter(lambda val : val > 20 and val < 80)

# Union of two rdds
unionRDD = rdd1.union(rdd7)

In [63]:
unionRDD.collect()

[10, 20, 30, 40, 50, 60, 70, 80, 90, 30, 40, 50, 60, 70]

#### Task : Create RDD using city_names and generate city_names starting with 'B' or 'K

In [70]:
city_names_3 = city_names_2.filter(lambda x: x.lower().startswith('b') or x.lower().startswith('k'))

In [71]:
city_names_3.collect()

['Kochi', 'Kolkata', 'Bangalore', 'Berlin', 'Kochi', 'Kottayam']

['Kochi',
 'Chennai',
 'Dubai',
 'Mumbai',
 'Kottayam',
 'Delhi',
 'Kolkata',
 'Vizag',
 'Varkala',
 'Bangalore',
 'Pune',
 'New York',
 'Berlin',
 'Venice']

In [96]:
# Sample(withReplacement, fraction, seed)
# Samples a fraction of data drom RDD with replacement
sample_rdd = city_names_2.sample(True,0.5)
sample_rdd.collect()

['Kolkata',
 'Kolkata',
 'Varkala',
 'Chennai',
 'Bangalore',
 'Bangalore',
 'Pune',
 'Dubai',
 'Berlin',
 'Kochi',
 'Kochi',
 'Kochi']

In [107]:
sample_rdd = city_names_2.sample(False,0.5,seed=42)
sample_rdd.collect()

['Delhi',
 'Kochi',
 'Vizag',
 'Varkala',
 'Berlin',
 'Venice',
 'Mumbai',
 'Kochi',
 'Kottayam']

### Wide Transformation
    * Uses multiple rdds

In [240]:
distinct_city = city_names_2.distinct()
distinct_city.collect()

['Kochi',
 'Chennai',
 'Dubai',
 'Mumbai',
 'Kottayam',
 'Delhi',
 'Kolkata',
 'Vizag',
 'Varkala',
 'Bangalore',
 'Pune',
 'New York',
 'Berlin',
 'Venice']

In [108]:
city_names_3.getNumPartitions()

2

In [109]:
pairs = [('Trivandrum',24),('Chennai',30),('Mumbai',21),('Kochi',12),
         ('Chennai',20),('Trivandrum',15),('Delhi',18),('Delhi',18)]

pairs_rdd = sc.parallelize(pairs)
pairs_rdd.collect()

[('Trivandrum', 24),
 ('Chennai', 30),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Chennai', 20),
 ('Trivandrum', 15),
 ('Delhi', 18),
 ('Delhi', 18)]

In [110]:
pairs_rdd.getNumPartitions()

2

In [112]:
# repartition() will be used to create new more number of partitions to the given RDD.
new_pair_rdd = pairs_rdd.repartition(4)
new_pair_rdd.getNumPartitions()

4

In [113]:
# coalesce() Combines existing RDD partitions in to less number of partitions and generate a new partition.
new_pair_rdd_1 = new_pair_rdd.coalesce(2)
new_pair_rdd_1.getNumPartitions()

2

In [114]:
# ReduceByKey() Aggregate each value of RDD by key. It does involves shuffling of values to ensure that all
# values with same key are bought together
pairs_rdd_2 = pairs_rdd.reduceByKey(lambda x,y : x + y)

In [115]:
pairs_rdd_2.collect()

[('Chennai', 50),
 ('Mumbai', 21),
 ('Kochi', 12),
 ('Trivandrum', 39),
 ('Delhi', 36)]

In [116]:
orders = [('Fries',2),('Nuggets',3),('PaniPuri',5),('Fries',5),('Nuggets',6),
          ('PaniPuri',4),('Dosa',3),('Vada',1),('Shawai',4)]

orders_rdd = sc.parallelize(orders)

In [118]:
orders_rdd.reduceByKey(lambda x,y: x + y).collect()

[('Nuggets', 9),
 ('Dosa', 3),
 ('Vada', 1),
 ('Fries', 7),
 ('PaniPuri', 9),
 ('Shawai', 4)]

In [119]:
# Intersection : Method to scan all rdd and find all common among all partitions of RDD.
rdd8 = sc.parallelize(['Delhi','Kolkata','Kochi','Vizag','Varkala','Chennai'])
rdd9 = sc.parallelize(['Bangalore','Pune','Delhi','Vizag','Mumbai','Trivandrum'])

In [120]:
rdd8.intersection(rdd9).collect()

['Delhi', 'Vizag']

In [121]:
# sc.parallelize(collection, number of partition)
rdd10 = sc.parallelize(range(1,50),4)
rdd10.getNumPartitions()

4

In [126]:
print(rdd10.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [123]:
def partition_funct(iterator):
    return [sum(iterator)]

In [124]:
partition_rdd = rdd10.mapPartitions(partition_funct)
partition_rdd.collect()

[78, 222, 366, 559]

In [127]:
pairs = [('Trivandrum',24),('Chennai',30),('Mumbai',21),('Kochi',12),
         ('Chennai',20),('Trivandrum',15),('Delhi',18),('Delhi',18)]

pairs_rdd = sc.parallelize(pairs)

In [134]:
# Groups data by key which involves shuffling all values in each key.
grouped_rdd = pairs_rdd.groupByKey().mapValues(tuple)
grouped_rdd.collect()

[('Chennai', (30, 20)),
 ('Mumbai', (21,)),
 ('Kochi', (12,)),
 ('Trivandrum', (24, 15)),
 ('Delhi', (18, 18))]

In [135]:
# Len() functions returns the count of city names.
pairs_rdd.groupByKey().mapValues(len).collect()

[('Chennai', 2), ('Mumbai', 1), ('Kochi', 1), ('Trivandrum', 2), ('Delhi', 2)]

In [139]:
# Joins to RDD based on their keys which require shuffling the data to ensure all keys are correctly placed.

order1 = sc.parallelize([('Fries',2),('Nuggets',3),('PaniPuri',5),('Fries',5),('Nuggets',6)])
order2 = sc.parallelize([('PaniPuri',4),('Dosa',3),('Fries',1),('Shawaii',4),('Sandwich',2)])

In [140]:
# Join will scan all partitioned RDDs to find common keys
order1.join(order2).collect()

[('PaniPuri', (5, 4)), ('Fries', (2, 1)), ('Fries', (5, 1))]

In [141]:
order1.leftOuterJoin(order2).collect()

[('PaniPuri', (5, 4)),
 ('Nuggets', (3, None)),
 ('Nuggets', (6, None)),
 ('Fries', (2, 1)),
 ('Fries', (5, 1))]

In [142]:
order1.rightOuterJoin(order2).collect()

[('Dosa', (None, 3)),
 ('PaniPuri', (5, 4)),
 ('Shawaii', (None, 4)),
 ('Sandwich', (None, 2)),
 ('Fries', (2, 1)),
 ('Fries', (5, 1))]

In [143]:
order1.fullOuterJoin(order2).collect()

[('Dosa', (None, 3)),
 ('PaniPuri', (5, 4)),
 ('Nuggets', (3, None)),
 ('Nuggets', (6, None)),
 ('Shawaii', (None, 4)),
 ('Sandwich', (None, 2)),
 ('Fries', (2, 1)),
 ('Fries', (5, 1))]

In [160]:
# cogroup() : groups data from two rdd by key resulting into new RDD with keys and values grouped from both RDD.
test_rdd = order1.cogroup(order2).collect()

for val,(k,v) in test_rdd:
    print(val,list(k),list(v))

Dosa [] [3]
PaniPuri [5] [4]
Nuggets [3, 6] []
Shawaii [] [4]
Sandwich [] [2]
Fries [2, 5] [1]


### RDD USING FILENAME()

In [3]:
# Load local FS use "file://path to filename"
emp_RDD = sc.textFile('file:///home/hadoop/Downloads/Employee_Advance.csv')
emp_RDD.take(6)

['1,Binnie,Hovee,bhovee0@hubpages.com,Genderqueer,Support,1419125,Lithuanian,Paris 11,France',
 '2,Devondra,Rosingdall,drosingdall1@yellowbook.com,Non-binary,Support,1365025,Tsonga,Seminole,United States',
 '3,Siward,Struijs,sstruijs2@wikia.com,Male,Business Development,432657,Armenian,London,United States',
 '4,Jaime,Camoys,jcamoys3@wsj.com,Bigender,Accounting,1122884,Armenian,Stockton,United States',
 '5,Nahum,Macieiczyk,nmacieiczyk4@reference.com,Bigender,Human Resources,1026666,Burmese,Toulouse,France',
 '6,Tova,Manclark,tmanclark5@army.mil,Female,Business Development,659039,Hiri Motu,Le Mans,France']

In [4]:
emp_RDD1 = emp_RDD.map(lambda x : x.split(','))
print(emp_RDD1.collect())

[['1', 'Binnie', 'Hovee', 'bhovee0@hubpages.com', 'Genderqueer', 'Support', '1419125', 'Lithuanian', 'Paris 11', 'France'], ['2', 'Devondra', 'Rosingdall', 'drosingdall1@yellowbook.com', 'Non-binary', 'Support', '1365025', 'Tsonga', 'Seminole', 'United States'], ['3', 'Siward', 'Struijs', 'sstruijs2@wikia.com', 'Male', 'Business Development', '432657', 'Armenian', 'London', 'United States'], ['4', 'Jaime', 'Camoys', 'jcamoys3@wsj.com', 'Bigender', 'Accounting', '1122884', 'Armenian', 'Stockton', 'United States'], ['5', 'Nahum', 'Macieiczyk', 'nmacieiczyk4@reference.com', 'Bigender', 'Human Resources', '1026666', 'Burmese', 'Toulouse', 'France'], ['6', 'Tova', 'Manclark', 'tmanclark5@army.mil', 'Female', 'Business Development', '659039', 'Hiri Motu', 'Le Mans', 'France'], ['7', 'Terrill', 'Feander', 'tfeander6@unc.edu', 'Genderqueer', 'Legal', '226156', 'Greek', 'Rouen', 'France'], ['8', 'Milicent', 'Mossdale', 'mmossdale7@chicagotribune.com', 'Male', 'Research and Development', '545708

#### 1. Show all employee working in Department = "Business Development"

In [167]:
emp_BD = emp_RDD1.filter(lambda val : val[5] == 'Business Development')
emp_BD.take(5)

[['3',
  'Siward',
  'Struijs',
  'sstruijs2@wikia.com',
  'Male',
  'Business Development',
  '432657',
  'Armenian',
  'London',
  'United States'],
 ['6',
  'Tova',
  'Manclark',
  'tmanclark5@army.mil',
  'Female',
  'Business Development',
  '659039',
  'Hiri Motu',
  'Le Mans',
  'France'],
 ['9',
  'Warde',
  'Stenett',
  'wstenett8@altervista.org',
  'Genderqueer',
  'Business Development',
  '807769',
  'Moldovan',
  'Bastia',
  'France'],
 ['17',
  'Bern',
  'Lafond',
  'blafondg@amazon.de',
  'Non-binary',
  'Business Development',
  '933737',
  'Greek',
  'San Antonio',
  'United States'],
 ['19',
  'Hillery',
  'Costall',
  'hcostalli@rakuten.co.jp',
  'Genderfluid',
  'Business Development',
  '736463',
  'Norwegian',
  'Clermont-Ferrand',
  'France']]

#### 2. Find average Income of Each Department.

In [232]:
emp_AVG = emp_RDD1.map(lambda row : (row[5],int(row[6]))).reduceByKey(lambda x,y : x + y)


[('Support', 1419125),
 ('Support', 1365025),
 ('Business Development', 432657),
 ('Accounting', 1122884),
 ('Human Resources', 1026666),
 ('Business Development', 659039),
 ('Legal', 226156),
 ('Research and Development', 545708),
 ('Business Development', 807769),
 ('Human Resources', 235352),
 ('Training', 83107),
 ('Support', 1008502),
 ('Legal', 685235),
 ('Training', 374701),
 ('Human Resources', 1261986),
 ('Services', 393009),
 ('Business Development', 933737),
 ('Services', 758842),
 ('Business Development', 736463),
 ('Human Resources', 978985),
 ('Accounting', 866039),
 ('Business Development', 1208531),
 ('Sales', 569996),
 ('Marketing', 895390),
 ('Marketing', 137077),
 ('Human Resources', 446416),
 ('Support', 44465),
 ('Legal', 273729),
 ('Training', 1316779),
 ('Marketing', 392291),
 ('Research and Development', 1148022),
 ('Legal', 1197458),
 ('Engineering', 438302),
 ('Training', 1409971),
 ('Training', 724047),
 ('Marketing', 953321),
 ('Training', 790775),
 ('Resear

#### 3. Count the number of employees working in each Department.

In [228]:
emp_RDD1.count()

1000

#### 4. Count Total Number of Employee working in each Department

In [191]:
emp_counter = emp_RDD1.groupBy(lambda val : val[5]).mapValues(len)
emp_counter.collect()

[('Accounting', 93),
 ('Human Resources', 94),
 ('Legal', 80),
 ('Services', 72),
 ('Sales', 76),
 ('Product Management', 67),
 ('Support', 95),
 ('Business Development', 107),
 ('Research and Development', 74),
 ('Training', 84),
 ('Marketing', 83),
 ('Engineering', 75)]

In [215]:
emp_AVG.join(emp_counter).mapValues(lambda x : x[0]/x[1]).collect()

[('Accounting', 814474.3333333334),
 ('Human Resources', 772824.3829787234),
 ('Legal', 754482.0),
 ('Services', 765440.4444444445),
 ('Sales', 745501.5921052631),
 ('Product Management', 696304.2985074627),
 ('Support', 825695.0947368421),
 ('Business Development', 783081.2710280374),
 ('Research and Development', 789849.1486486486),
 ('Training', 757752.380952381),
 ('Marketing', 744170.7831325302),
 ('Engineering', 757094.5466666666)]

##### Alternative approach for the average

In [239]:
emp_income = emp_RDD1.map(lambda x: (x[5],int(x[6]))).groupByKey().mapValues(list).collect()
for i,v in emp_income:
    print(i," => ",sum(v)/len(v))

Accounting  =>  814474.3333333334
Human Resources  =>  772824.3829787234
Legal  =>  754482.0
Services  =>  765440.4444444445
Sales  =>  745501.5921052631
Product Management  =>  696304.2985074627
Support  =>  825695.0947368421
Business Development  =>  783081.2710280374
Research and Development  =>  789849.1486486486
Training  =>  757752.380952381
Marketing  =>  744170.7831325302
Engineering  =>  757094.5466666666


#### 5. List out unique job titles in dataset


In [231]:
emp_job = emp_RDD1.map(lambda x : x[5]).distinct()
emp_job.collect()

['Accounting',
 'Human Resources',
 'Legal',
 'Services',
 'Sales',
 'Product Management',
 'Support',
 'Business Development',
 'Research and Development',
 'Training',
 'Marketing',
 'Engineering']

#### 6. count all emp starts with 'A'

In [226]:
emp_unq = emp_RDD1.filter(lambda x : x[1].lower().startswith('a'))
emp_unq.count()

81

#### 7. Word Count Example Using Spark RDD

In [242]:
!hdfs dfs -ls /

Found 3 items
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:25 /hbase
drwxrwxrwx   - hadoop supergroup          0 2022-11-21 15:12 /tmp
drwxr-xr-x   - hadoop supergroup          0 2022-11-21 15:11 /user


In [244]:
!hdfs dfs -mkdir /wordcount

In [249]:
!hdfs dfs -put /home/hadoop/Downloads/HarryPotterandtheDeathlyHallows.txt /wordcount

In [250]:
rdd = sc.textFile('/wordcount/HarryPotterandtheDeathlyHallows.txt')

In [256]:
rdd_word = rdd.flatMap(lambda line : line.split(' '))
rdd_word1 = rdd_word.map(lambda word : (word.lower(),1))
rdd_word2 = rdd_word1.reduceByKey(lambda x,y : x + y)
rdd_word3 = rdd_word2.filter(lambda val:val[1] > 10).sortBy(lambda val:val[1], ascending=False)
rdd_word3.collect()

[('the', 10280),
 ('and', 5283),
 ('to', 4843),
 ('of', 4121),
 ('he', 3845),
 ('a', 3523),
 ('was', 2681),
 ('his', 2602),
 ('in', 2159),
 ('had', 1990),
 ('harry', 1867),
 ('it', 1859),
 ('that', 1813),
 ('said', 1676),
 ('you', 1485),
 ('as', 1409),
 ('at', 1401),
 ('i', 1275),
 ('with', 1127),
 ('they', 1091),
 ('not', 1083),
 ('on', 984),
 ('for', 980),
 ('but', 975),
 ('she', 918),
 ('her', 864),
 ('—', 841),
 ('from', 836),
 ('have', 784),
 ('…', 756),
 ('were', 738),
 ('be', 737),
 ('him', 666),
 ('into', 655),
 ('hermione', 653),
 ('out', 650),
 ('could', 627),
 ('all', 583),
 ('ron', 578),
 ('what', 571),
 ('been', 567),
 ('“i', 535),
 ('up', 510),
 ('we', 498),
 ('there', 493),
 ('who', 490),
 ('is', 484),
 ('—”', 472),
 ('their', 465),
 ('did', 451),
 ('would', 443),
 ('them', 442),
 ('like', 424),
 ('if', 424),
 ('looked', 419),
 ('by', 419),
 ('harry,', 414),
 ('back', 409),
 ('so', 403),
 ('over', 383),
 ('this', 382),
 ('an', 381),
 ('then', 365),
 ('one', 364),
 ('wand

#### Count the number of employees working in each city and sort the city by employees count in desending order

In [262]:
emp_count = emp_RDD1.map(lambda e : (e[-2],1)).reduceByKey(lambda x,y : x+y)
emp_count.sortBy(lambda x: x[1], ascending=False).collect()

[('Washington', 13),
 ('Berlin', 13),
 ('Lyon', 13),
 ('Cincinnati', 9),
 ('Dallas', 9),
 ('Cergy-Pontoise', 9),
 ('Sacramento', 8),
 ('Strasbourg', 8),
 ('Lille', 7),
 ('Dijon', 7),
 ('Los Angeles', 7),
 ('Pittsburgh', 7),
 ('Bordeaux', 7),
 ('San Jose', 7),
 ('Denver', 7),
 ('El Paso', 7),
 ('Montpellier', 7),
 ('Chicago', 6),
 ('Tulsa', 6),
 ('Quimper', 6),
 ('Marseille', 6),
 ('New York City', 6),
 ('Pau', 6),
 ('Poitiers', 6),
 ('Montgomery', 6),
 ('Le Mans', 5),
 ('Avignon', 5),
 ('Seattle', 5),
 ('Marne-la-Vallée', 5),
 ('Limoges', 5),
 ('Oakland', 5),
 ('Fort Worth', 5),
 ('Houston', 5),
 ('San Diego', 5),
 ('Philadelphia', 5),
 ('Paris La Défense', 5),
 ('Rungis', 5),
 ('Perpignan', 5),
 ('Rouen', 5),
 ('Caen', 5),
 ('Minneapolis', 5),
 ('Nantes', 5),
 ('Orléans', 5),
 ('Nürnberg', 4),
 ('Mobile', 4),
 ('Aix-en-Provence', 4),
 ('Angoulême', 4),
 ('Charlotte', 4),
 ('Düsseldorf', 4),
 ('Topeka', 4),
 ('Kansas City', 4),
 ('Roissy Charles-de-Gaulle', 4),
 ('Saint Petersburg', 4)

### Storage : Saving data using RDD 

In [264]:
rdd_word3.getNumPartitions()

2

In [267]:
# Saving RDD data to the local file system.
rdd_word3.saveAsTextFile('file:///home/hadoop/Downloads/output/')

Py4JJavaError: An error occurred while calling o4238.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/home/hadoop/Downloads/output already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:289)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1544)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1523)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1523)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1523)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [268]:
rdd_word3.saveAsTextFile('/wordcount/output/')

('the', 10280)
('and', 5283)
('to', 4843)
('of', 4121)
('he', 3845)
('a', 3523)
('was', 2681)
('his', 2602)
('in', 2159)
('had', 1990)
('harry', 1867)
('it', 1859)
('that', 1813)
('said', 1676)
('you', 1485)
('as', 1409)
('at', 1401)
('i', 1275)
('with', 1127)
('they', 1091)
('not', 1083)
('on', 984)
('for', 980)
('but', 975)
('she', 918)
('her', 864)
('—', 841)
('from', 836)
('have', 784)
('…', 756)
('were', 738)
('be', 737)
('him', 666)
('into', 655)
('hermione', 653)
('out', 650)
('could', 627)
('all', 583)
('ron', 578)
('what', 571)
('been', 567)
('“i', 535)
('up', 510)
('we', 498)
('there', 493)
('who', 490)
('is', 484)
('—”', 472)
('their', 465)
('did', 451)
('would', 443)
('them', 442)
('like', 424)
('if', 424)
('looked', 419)
('by', 419)
('harry,', 414)
('back', 409)
('so', 403)
('over', 383)
('this', 382)
('an', 381)
('then', 365)
('one', 364)
('wand', 352)
('do', 342)
('about', 339)
('know', 338)
('my', 334)

### RDD - statistical function

In [275]:
rdd_city = emp_RDD1.map(lambda x : (x[5],int(x[6])))
rdd_city.min(key=lambda x: x[1])

('Training', 12727)

In [276]:
rdd_city.max(key=lambda x: x[1])

('Support', 1496924)

In [281]:
rdd_11 = sc.parallelize(range(1,100)).sample(False,0.3,seed=2)
rdd_11.min()

16

In [282]:
rdd_11.max()

95

In [291]:
# standard deviation : how much data is deviated from the mean
rdd_city.map(lambda x: x[1]).stdev()

424405.95187845733

In [292]:
# square of standard deviation is variance
rdd_city.map(lambda x: x[1]).variance()

180120411989.85944

In [293]:
rdd_city.map(lambda x: x[1]).mean()

770051.4190000002

In [294]:
rdd_city.map(lambda x: x[1]).sum()

770051419

In [296]:
rdd_city.map(lambda x: x[1]).stats()

(count: 1000, mean: 770051.4190000002, stdev: 424405.95187845733, max: 1496924, min: 12727)

### RDD Persistence and its Storage Levels

In [5]:
emp_RDD.cache()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [7]:
from pyspark.storagelevel import StorageLevel
# Persist te RDD in memory with default storage level (MEMORY ONLY)
emp_RDD = emp_RDD.persist(StorageLevel.MEMORY_ONLY)

In [8]:
emp_RDD1 = emp_RDD.map(lambda row : row.split(','))
emp_Map = emp_RDD.map(lambda row : row.split(','))
emp_Map.reduceByKey(lambda x,y, : x + y)

PythonRDD[8] at RDD at PythonRDD.scala:53

In [11]:
emp_RDD = emp_RDD.persist(StorageLevel.MEMORY_AND_DISK)

In [12]:
# free the persist
emp_RDD.unpersist()

file:///home/hadoop/Downloads/Employee_Advance.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0