In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("wordcount").master("local[*]").getOrCreate()
sc = spark.sparkContext

In [2]:
my_list = [('tony', 5), ('john', 2)]
schema = ['name', 'age']
rdd = sc.parallelize(my_list)
rdd.collect()

[('tony', 5), ('john', 2)]

In [3]:
dffromlist = spark.createDataFrame(rdd, schema)
dffromlist.show()


+----+---+
|name|age|
+----+---+
|tony|  5|
|john|  2|
+----+---+



In [4]:
from pyspark.sql.functions import col
dffromlist.select("name").show()
dffromlist.withColumn("age10", col("age")*10).show()

+----+
|name|
+----+
|tony|
|john|
+----+

+----+---+-----+
|name|age|age10|
+----+---+-----+
|tony|  5|   50|
|john|  2|   20|
+----+---+-----+



In [5]:
d = [{"name":"Alice", "age" : 12}, {"name":"John", "age" : 14}]
df_d = spark.createDataFrame(d)
df_d.show()


+---+-----+
|age| name|
+---+-----+
| 12|Alice|
| 14| John|
+---+-----+



In [6]:
df_d2 = df_d.select(df_d.name, df_d.age)
df_d2.show()

+-----+---+
| name|age|
+-----+---+
|Alice| 12|
| John| 14|
+-----+---+



In [8]:
from pyspark.sql import Row
list_persons = [('matt',5),('john',2)]
person_object = Row('name','age')
rdd_list_person = sc.parallelize(list_persons)
mapped_person = rdd_list_person.map(lambda rdd_item: person_object(*rdd_item))
spark.createDataFrame(mapped_person).show()

+----+---+
|name|age|
+----+---+
|matt|  5|
|john|  2|
+----+---+



In [9]:
wordsDF = spark.createDataFrame([('look',), ('spark',), ('tutorial',), ('spark',), ('look',), ('python',)], ['word'])
wordsDF.show()
print(type(wordsDF))
wordsDF.printSchema()

+--------+
|    word|
+--------+
|    look|
|   spark|
|tutorial|
|   spark|
|    look|
|  python|
+--------+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- word: string (nullable = true)



In [10]:
from pyspark.sql.functions import length
wordsLengthsDF = wordsDF.select(wordsDF.word, length('word').alias('lengths'))  # transformation
wordsLengthsDF.show()

+--------+-------+
|    word|lengths|
+--------+-------+
|    look|      4|
|   spark|      5|
|tutorial|      8|
|   spark|      5|
|    look|      4|
|  python|      6|
+--------+-------+



In [11]:
wordCountsDF = wordsDF.groupBy('word').count()
wordCountsDF.show()
wordCountsDF.orderBy(col("count").desc()).show()

+--------+-----+
|    word|count|
+--------+-----+
|tutorial|    1|
|   spark|    2|
|    look|    2|
|  python|    1|
+--------+-----+

+--------+-----+
|    word|count|
+--------+-----+
|    look|    2|
|   spark|    2|
|  python|    1|
|tutorial|    1|
+--------+-----+



In [12]:
words2df = spark.createDataFrame([("spark",),("spark",),("hello",),("hello",),("hi",)],["name"])
countdf = words2df.groupby(words2df.name).count()
countdf.orderBy(col("count").asc()).show()

+-----+-----+
| name|count|
+-----+-----+
|   hi|    1|
|hello|    2|
|spark|    2|
+-----+-----+



In [13]:
from pyspark.sql.functions import split,explode
fileName = "/Users/tech/codes/SparkJourney/data/PrideandPrejudice.txt"
bookDF = spark.read.text(fileName).select(col("value").alias("lines"))
bookWordsSplitDF = bookDF.select(split(bookDF.lines, ' ').alias("split"))
bookWordsSingleDF = bookWordsSplitDF.select(explode(bookWordsSplitDF.split).alias("word"))
bookWordCountDF = bookWordsSingleDF.groupby(bookWordsSingleDF.word).count().orderBy(col("count").desc())
bookWordCountDF.show()

# bookWordsSplitDF = bookDF.select(split(bookDF.lines, ' ').alias('split'))
# bookWordsSingleDF = bookWordsSplitDF.select(explode(bookWordsSplitDF.split).alias("word"))
# bookWordsSingleDF.groupby(bookWordsSingleDF.word).count().orderBy(col("count").desc()).show()

+----+-----+
|word|count|
+----+-----+
|    |72884|
| the| 4218|
|  to| 4123|
|  of| 3666|
| and| 3314|
|   a| 1944|
| her| 1855|
|  in| 1816|
| was| 1798|
|   I| 1724|
|that| 1417|
| not| 1365|
| she| 1304|
|  be| 1206|
| his| 1167|
| had| 1126|
|  as| 1121|
|with| 1040|
|  he| 1039|
| for| 1004|
+----+-----+
only showing top 20 rows

