**Creating spark session form a pyspark API**

In [21]:
import findspark
findspark.init()

In [22]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

Converting txt file (that contains pages of the book "The Body a guide for occupants by Bill Bryson") 
to a rdd (resilient distributed dataset)

In [23]:
text_rdd = spark.sparkContext.textFile('text_file.txt')

The words separated by a tab or \t

In [24]:
text_rdd.collect()

['ALSO\tBY\tBILL\tBRYSON',
 ' The\tLost\tContinent',
 ' The\tMother\tTongue',
 ' Neither\tHere\tNor\tThere',
 ' Made\tin\tAmerica',
 ' Notes\tfrom\ta\tSmall\tIsland',
 ' A\tWalk\tin\tthe\tWoods',
 ' I’m\ta\tStranger\tHere\tMyself',
 ' In\ta\tSunburned\tCountry',
 ' Bryson’s\tDictionary\tof\tTroublesome\tWords',
 ' Bill\tBryson’s\tAfrican\tDiary',
 ' A\tShort\tHistory\tof\tNearly\tEverything',
 ' A\tShort\tHistory\tof\tNearly\tEverything:\tSpecial\tIllustrated\tEdition',
 ' The\tLife\tand\tTimes\tof\tthe\tThunderbolt\tKid',
 ' Shakespeare:\tThe\tWorld\tas\tStage',
 ' Bryson’s\tDictionary\tfor\tWriters\tand\tEditors',
 ' At\tHome:\tA\tShort\tHistory\tof\tPrivate\tLife',
 ' At\tHome:\tA\tShort\tHistory\tof\tPrivate\tLife:\tIllustrated\tEdition',
 ' One\tSummer',
 ' The\tRoad\tto\tLittle\tDribbling:\tAdventures\tof\tan\tAmerican\tin\tBritain',
 '',
 'Copyright\t©\t2019\tby\tBill\tBryson',
 ' All\trights\treserved.\tPublished\tin\tthe\tUnited\tStates\tby\tDoubleday,\ta\tdivision\tof\tPengui

**Splitting lines of the text file to a words (from an array of lines to an array of words)**

In [25]:
text_rdd1 = text_rdd.flatMap(lambda x: x.split("\t"))

In [26]:
text_rdd1.collect()

['ALSO',
 'BY',
 'BILL',
 'BRYSON',
 ' The',
 'Lost',
 'Continent',
 ' The',
 'Mother',
 'Tongue',
 ' Neither',
 'Here',
 'Nor',
 'There',
 ' Made',
 'in',
 'America',
 ' Notes',
 'from',
 'a',
 'Small',
 'Island',
 ' A',
 'Walk',
 'in',
 'the',
 'Woods',
 ' I’m',
 'a',
 'Stranger',
 'Here',
 'Myself',
 ' In',
 'a',
 'Sunburned',
 'Country',
 ' Bryson’s',
 'Dictionary',
 'of',
 'Troublesome',
 'Words',
 ' Bill',
 'Bryson’s',
 'African',
 'Diary',
 ' A',
 'Short',
 'History',
 'of',
 'Nearly',
 'Everything',
 ' A',
 'Short',
 'History',
 'of',
 'Nearly',
 'Everything:',
 'Special',
 'Illustrated',
 'Edition',
 ' The',
 'Life',
 'and',
 'Times',
 'of',
 'the',
 'Thunderbolt',
 'Kid',
 ' Shakespeare:',
 'The',
 'World',
 'as',
 'Stage',
 ' Bryson’s',
 'Dictionary',
 'for',
 'Writers',
 'and',
 'Editors',
 ' At',
 'Home:',
 'A',
 'Short',
 'History',
 'of',
 'Private',
 'Life',
 ' At',
 'Home:',
 'A',
 'Short',
 'History',
 'of',
 'Private',
 'Life:',
 'Illustrated',
 'Edition',
 ' One',
 

**Total number of words in the file**

In [27]:
print("Total number of words: ", text_rdd1.count())

Total number of words:  149182


**Adding number of 1 to each word, lower and strip the word and then convert them into a tuple**

In [28]:
text_rdd2 = text_rdd1.map(lambda x: (x.lower().strip(), 1))

In [29]:
text_rdd2.collect()

[('also', 1),
 ('by', 1),
 ('bill', 1),
 ('bryson', 1),
 ('the', 1),
 ('lost', 1),
 ('continent', 1),
 ('the', 1),
 ('mother', 1),
 ('tongue', 1),
 ('neither', 1),
 ('here', 1),
 ('nor', 1),
 ('there', 1),
 ('made', 1),
 ('in', 1),
 ('america', 1),
 ('notes', 1),
 ('from', 1),
 ('a', 1),
 ('small', 1),
 ('island', 1),
 ('a', 1),
 ('walk', 1),
 ('in', 1),
 ('the', 1),
 ('woods', 1),
 ('i’m', 1),
 ('a', 1),
 ('stranger', 1),
 ('here', 1),
 ('myself', 1),
 ('in', 1),
 ('a', 1),
 ('sunburned', 1),
 ('country', 1),
 ('bryson’s', 1),
 ('dictionary', 1),
 ('of', 1),
 ('troublesome', 1),
 ('words', 1),
 ('bill', 1),
 ('bryson’s', 1),
 ('african', 1),
 ('diary', 1),
 ('a', 1),
 ('short', 1),
 ('history', 1),
 ('of', 1),
 ('nearly', 1),
 ('everything', 1),
 ('a', 1),
 ('short', 1),
 ('history', 1),
 ('of', 1),
 ('nearly', 1),
 ('everything:', 1),
 ('special', 1),
 ('illustrated', 1),
 ('edition', 1),
 ('the', 1),
 ('life', 1),
 ('and', 1),
 ('times', 1),
 ('of', 1),
 ('the', 1),
 ('thunderbolt',

**Summarizing occurences number of each word by the key (the word itself)**

In [30]:
text_rdd3 = text_rdd2.reduceByKey(lambda a,b: a + b)

In [31]:
text_rdd3.collect()

[('bryson', 4),
 ('lost', 32),
 ('continent', 1),
 ('neither', 5),
 ('nor', 8),
 ('there', 220),
 ('in', 3411),
 ('america', 42),
 ('notes', 16),
 ('walk', 11),
 ('i’m', 4),
 ('myself', 1),
 ('sunburned', 2),
 ('country', 11),
 ('dictionary', 9),
 ('of', 4927),
 ('african', 3),
 ('short', 22),
 ('everything', 29),
 ('illustrated', 5),
 ('edition', 2),
 ('thunderbolt', 2),
 ('kid', 2),
 ('world', 122),
 ('as', 1215),
 ('stage', 4),
 ('at', 833),
 ('home:', 3),
 ('private', 8),
 ('summer', 13),
 ('road', 1),
 ('dribbling:', 1),
 ('adventures', 6),
 ('an', 593),
 ('britain', 21),
 ('', 12),
 ('copyright', 8),
 ('©', 7),
 ('2019', 3),
 ('published', 20),
 ('united', 94),
 ('house', 11),
 ('llc,', 1),
 ('new', 287),
 ('york.', 3),
 ('imprint', 2),
 ('london,', 20),
 ('2019.', 4),
 ('www.doubleday.com', 1),
 ('portrayal', 1),
 ('dolphin', 1),
 ('are', 1122),
 ('registered', 1),
 ('trademarks', 1),
 ('llc.', 1),
 ('grateful', 2),
 ('is', 2212),
 ('university', 195),
 ('excerpts', 1),
 ('poems

**Convert the rdd to a spark dataframe with the Word and Column attributes**

In [32]:
text_rdd3_df = text_rdd3.toDF(["Word", "Count"])

**Sorting the df by descending order of the count**

In [33]:
from pyspark.sql.functions import desc

text_rdd3_desc = text_rdd3_df.orderBy(desc('Count'))

**Cleaning the dataset from an english stop words**

In [34]:
from pyspark.ml.feature import StopWordsRemover

stop_words = StopWordsRemover().getStopWords()
result_df = text_rdd3_desc.filter(~text_rdd3_desc.Word.isin(stop_words))

In [35]:
ten_common_words = result_df.limit(10)

**Ten the most common words out of the text file**

In [36]:
ten_common_words.show()

+-------+-----+
|   Word|Count|
+-------+-----+
|    one|  610|
| people|  373|
|   much|  308|
|    new|  287|
|percent|  286|
|    two|  270|
|   even|  260|
|  blood|  225|
|   many|  218|
|     us|  215|
+-------+-----+



**Checking on another txt file that contains of lorem ipsum text**

In [37]:
second_rdd = spark.sparkContext.textFile('short.txt')
second_rdd.collect()
second_rdd1 = second_rdd.flatMap(lambda x: x.split(" "))
second_rdd1.collect()
print("Total number of words: ", second_rdd1.count())
second_rdd2 = second_rdd1.map(lambda x: (x.lower().strip(), 1))
second_rdd2.collect()

second_rdd3 = second_rdd2.reduceByKey(lambda a,b: a + b)
second_rdd3.collect()

second_rdd3_df = second_rdd3.toDF(["Word", "Count"])

from pyspark.sql.functions import desc
second_rdd3_desc = second_rdd3_df.orderBy(desc('Count'))

from pyspark.ml.feature import StopWordsRemover
stop_words = StopWordsRemover().getStopWords()
result_df = second_rdd3_desc.filter(~second_rdd3_desc.Word.isin(stop_words))
result_df.show()

Total number of words:  554
+------------+-----+
|        Word|Count|
+------------+-----+
|        anim|   16|
|       irure|   13|
|     aliquip|   13|
|        nisi|   13|
|   cupidatat|   12|
|            |   12|
|         qui|   12|
|          ad|   12|
|        quis|   11|
|     eiusmod|   11|
|exercitation|   11|
|          et|   11|
|     ullamco|   11|
|      dolore|   11|
|         est|   10|
|    proident|   10|
| adipisicing|   10|
|       culpa|    9|
|       ipsum|    9|
|         non|    9|
+------------+-----+
only showing top 20 rows



**Ten the most common words of a second text file**

In [38]:
result_df.limit(10).show()

+---------+-----+
|     Word|Count|
+---------+-----+
|     anim|   16|
|    irure|   13|
|  aliquip|   13|
|     nisi|   13|
|cupidatat|   12|
|         |   12|
|      qui|   12|
|       ad|   12|
|     quis|   11|
|  eiusmod|   11|
+---------+-----+



In [39]:
spark.stop()