In [1]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session against the "local[1]" master;
# getOrCreate() hands back the session stored in `spark` for all later cells.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [9]:
# Word list used as input for the RDD map() example.
data = [
    "Project", "Gutenberg’s", "Alice’s", "Adventures",
    "in", "Wonderland", "Project", "Gutenberg’s", "Adventures",
    "in", "Wonderland", "Project", "Gutenberg’s",
]

# Turn the Python list into an RDD via the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)
print("Original RDD")
# Bare last expression so the notebook displays the collected list.
rdd.collect()

Original RDD


['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s']

In [10]:
# Pair every word with the constant 1, producing (word, 1) tuples.
rdd2 = rdd.map(lambda word: (word, 1))

# Bring the pairs back to the driver and print them one per line.
for element in rdd2.collect():
    print(element)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)


In [13]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["firstname", "lastname", "gender", "salary"]
data = [
    ("James", "Smith", "M", 30),
    ("Anna", "Rose", "F", 41),
    ("Robert", "Williams", "M", 62),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+



In [14]:
# Positional access: each Row field is read by index. Builds
# ("<field0>,<field1>", field2, field3 * 2) for every row of df.
rdd2 = df.rdd.map(
    lambda row: (row[0] + "," + row[1], row[2], row[3] * 2)
)

In [15]:
# Variant 1: read Row fields by column name.
rdd2 = df.rdd.map(
    lambda row: (
        row["firstname"] + "," + row["lastname"],
        row["gender"],
        row["salary"] * 2,
    )
)
# Variant 2: read Row fields as attributes. This assignment replaces the
# rdd2 created just above.
rdd2 = df.rdd.map(
    lambda row: (
        row.firstname + "," + row.lastname,
        row.gender,
        row.salary * 2,
    )
)
def func1(x):
    """Turn one row-like record into a ``(name, gender, salary)`` tuple.

    Args:
        x: Object exposing ``firstname``, ``lastname``, ``gender`` and
            ``salary`` attributes.

    Returns:
        tuple: ``"<firstname>,<lastname>"``, the gender lower-cased, and the
        salary multiplied by two.
    """
    return (
        x.firstname + "," + x.lastname,
        x.gender.lower(),
        x.salary * 2,
    )
# Apply func1 to every row of df. The function is passed directly; wrapping
# it in `lambda x: func1(x)` only adds an extra call per row.
rdd2 = df.rdd.map(func1)

In [16]:
# Convert the RDD of 3-tuples back to a DataFrame, naming the tuple positions.
df2 = rdd2.toDF(schema=["name", "gender", "new_salary"])
df2.show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     m|        60|
|      Anna,Rose|     f|        82|
|Robert,Williams|     m|       124|
+---------------+------+----------+



In [17]:
# Space-separated phrases used as input for the flatMap() example.
data = [
    "Project Gutenberg’s",
    "Alice’s Adventures in Wonderland",
    "Project Gutenberg’s",
    "Adventures in Wonderland",
    "Project Gutenberg’s",
]
rdd = spark.sparkContext.parallelize(data)

# Print each phrase on its own line.
for element in rdd.collect():
    print(element)

Project Gutenberg’s
Alice’s Adventures in Wonderland
Project Gutenberg’s
Adventures in Wonderland
Project Gutenberg’s


In [18]:
# Split every phrase on single spaces; flatMap merges the per-phrase word
# lists into one RDD of words.
rdd2 = rdd.flatMap(lambda line: line.split(" "))

for element in rdd2.collect():
    print(element)

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
Project
Gutenberg’s
Adventures
in
Wonderland
Project
Gutenberg’s


In [19]:
from pyspark.sql.functions import explode

# Rows of (name, list of languages, dict of properties); the list and the
# dict may be None or contain None / empty-string entries.
arrayData = [
    ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
    ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
    ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('Washington', None, None),
    ('Jefferson', ['1', '2'], {}),
]
df = spark.createDataFrame(
    data=arrayData,
    schema=['name', 'knownLanguages', 'properties'],
)
df.show(truncate=False)

# DataFrame route: explode() the knownLanguages array next to name/properties.
df2 = df.select(df.name, explode(df.knownLanguages), df.properties)
df2.printSchema()
df2.show()

# RDD route: emit one (name, language) tuple per list entry; rows whose
# knownLanguages is None or empty contribute nothing.
rdd = df.rdd
rdd2 = rdd.flatMap(
    lambda row: [(row['name'], lang) for lang in (row['knownLanguages'] or [])]
)
for element in rdd2.collect():
    print(element)

+----------+-------------------+-----------------------------+
|name      |knownLanguages     |properties                   |
+----------+-------------------+-----------------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|
|Michael   |[Spark, Java, NULL]|{eye -> NULL, hair -> brown} |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |
|Washington|NULL               |NULL                         |
|Jefferson |[1, 2]             |{}                           |
+----------+-------------------+-----------------------------+

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+---------+------+--------------------+
|     name|   col|          properties|
+---------+------+--------------------+
|    James|  Java|{eye -> brown, ha...|
|    James| Scala|{eye -> brown, ha...|
|  Michael| Spark|{eye -> NULL, hai...|
|  

In [20]:
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]
columns = ["Seqno", "Name"]
df = spark.createDataFrame(data=data, schema=columns)
df.show()

# foreach()
# Sum the integer value of every Seqno into an accumulator that starts at 0,
# then print the total on the driver.
accum = spark.sparkContext.accumulator(0)
df.foreach(lambda row: accum.add(int(row.Seqno)))
print(accum.value)

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+

6
