In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Create the notebook's Spark session (or reuse one that already exists),
# running in local mode under the application name "test".
spark = (
    SparkSession.builder
    .appName("test")
    .master("local[*]")
    .getOrCreate()
)

In [6]:

# Build a DataFrame from an RDD of (name, age) tuples and try a few basic
# column operations on it.
sc = spark.sparkContext

my_list = [('hello', 2), ('tony', 5)]
schema = ['name', 'age']

# Distribute the local list as an RDD. createDataFrame consumes the RDD
# directly, so no collect() back to the driver is needed before converting.
rdd = sc.parallelize(my_list)
df = spark.createDataFrame(rdd, schema)

# Full frame, then a single-column projection, then a derived column
# holding age multiplied by 10.
df.show()
df.select("name").show()
df.withColumn("age10", col("age") * 10).show()

+-----+---+
| name|age|
+-----+---+
|hello|  2|
| tony|  5|
+-----+---+

+-----+
| name|
+-----+
|hello|
| tony|
+-----+

+-----+---+-----+
| name|age|age10|
+-----+---+-----+
|hello|  2|   20|
| tony|  5|   50|
+-----+---+-----+



In [3]:
# Build a DataFrame straight from a list of dicts; the dict keys become
# the column names.
d = [
    {"name": "Alice", "age": 12},
    {"name": "John", "age": 14},
]
df_d = spark.createDataFrame(d)
df_d.show()

+---+-----+
|age| name|
+---+-----+
| 12|Alice|
| 14| John|
+---+-----+



In [6]:
from pyspark.sql import Row

# Row factory with named fields: Person(name, age).
Person = Row('name', 'age')

my_list2 = [('matt', 2), ('ron', 5)]
rdd2 = sc.parallelize(my_list2)

# Turn every (name, age) tuple into a Person row.
person = rdd2.map(lambda pair: Person(*pair))

# show() only prints and returns None, so keep the DataFrame in df_r first
# and display it in a separate statement; df_r then stays usable afterwards.
df_r = spark.createDataFrame(person)
df_r.show()

+----+---+
|name|age|
+----+---+
|matt|  2|
| ron|  5|
+----+---+



In [7]:
# Disabled example: read a ';'-separated CSV with a header row and an inferred schema.
# df_text = spark.read.format("csv").option("inferSchema", "true").option("sep", ";").option("header", "true").load("/Users/tech/codes/SparkJourney/data/books.csv")
# df_text.show(10, False)
from operator import add  # not used in this cell

# Small keyed dataset in which the key "a" appears twice.
my_list_to_be_reduced = [
    ("a", 1),
    ("b", 1),
    ("a", 1),
]
rdd = sc.parallelize(my_list_to_be_reduced)

# Bring the pairs back to the driver and sort them so the displayed order
# is stable.
sorted(rdd.collect())

[('a', 1), ('a', 1), ('b', 1)]

In [8]:
# Gather the values that share a key, materialise each group as a plain
# list, and sort the collected pairs for a stable display order.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(
    rdd.groupByKey()
       .mapValues(list)
       .collect()
)



[('a', [1, 1]), ('b', [1])]

In [9]:
# Tiny single-column test DataFrame built from Python tuples; "apple"
# appears twice on purpose so that grouping has something to count.
schema = ["fruits"]
wordsDF = spark.createDataFrame(
    [("apple",), ("banana",), ("mango",), ("apple",)],
    schema,
)
wordsDF.show()

+------+
|fruits|
+------+
| apple|
|banana|
| mango|
| apple|
+------+



In [10]:
from pyspark.sql.functions import length

# Character count of every fruit name, exposed as a column named "length".
(
    wordsDF
    .select(length("fruits").alias("length"))
    .show()
)





+------+
|length|
+------+
|     5|
|     6|
|     5|
|     5|
+------+



In [11]:
# Number of rows per distinct fruit.
(
    wordsDF
    .groupBy("fruits")
    .count()
    .show()
)

+------+-----+
|fruits|count|
+------+-----+
| apple|    2|
| mango|    1|
|banana|    1|
+------+-----+



In [8]:
import os

# Location of the book text file. The default is the path this notebook was
# written against; set SPARK_JOURNEY_BOOK_PATH to run it on another machine
# without editing the cell.
BOOK_PATH = os.environ.get(
    "SPARK_JOURNEY_BOOK_PATH",
    "/Users/tech/codes/SparkJourney/data/PrideandPrejudice.txt",
)

# Read the file as text and rename the resulting "value" column to "sentence".
booksDF = spark.read.text(BOOK_PATH).select(col("value").alias("sentence"))

# Show the first 10 rows without truncating long lines.
booksDF.show(10, truncate=False)

+------------------------------------------------------------------------+
|sentence                                                                |
+------------------------------------------------------------------------+
|The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen      |
|                                                                        |
|This eBook is for the use of anyone anywhere in the United States and   |
|most other parts of the world at no cost and with almost no restrictions|
|whatsoever. You may copy it, give it away or re-use it under the terms  |
|of the Project Gutenberg License included with this eBook or online at  |
|www.gutenberg.org. If you are not located in the United States, you     |
|will have to check the laws of the country where you are located before |
|using this eBook.                                                       |
|                                                                        |
+------------------------

In [9]:
from pyspark.sql.functions import split, explode

# Break every sentence into an array of tokens, splitting on a single space.
bookWordsSplitDF = booksDF.select(
    split(booksDF["sentence"], ' ').alias('split')
)
bookWordsSplitDF.show(15)

# Flatten the arrays so that each token gets its own row.
bookWordsSingleDF = bookWordsSplitDF.select(
    explode(bookWordsSplitDF["split"]).alias('word')
)

# Discard the empty-string tokens.
bookWordsDF = bookWordsSingleDF.filter(bookWordsSingleDF["word"] != '')
bookWordsDF.show()

# Total number of non-empty tokens in the book.
bookWordsDFCount = bookWordsDF.count()
print(bookWordsDFCount)

+--------------------+
|               split|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[most, other, par...|
|[whatsoever., You...|
|[of, the, Project...|
|[www.gutenberg.or...|
|[will, have, to, ...|
|[using, this, eBo...|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Release, Date:, ...|
+--------------------+
only showing top 15 rows

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     eBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
|  anywhere|
+----------+
only showing top 20 rows

124749
