# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

In [4]:
from pyspark.sql import functions as func
bucket_name = "BUCKET_NAME"
inputDf = spark.read.text(bucket_name)
inputDf.show()

+--------------------+
|               value|
+--------------------+
|Self-Employment: ...|
|Achieving Financi...|
|       By Frank Kane|
|                    |
|                    |
|                    |
|Copyright � 2015 ...|
|All rights reserv...|
|                    |
|                    |
|            CONTENTS|
|          Disclaimer|
|             Preface|
|Part I: Making th...|
|  Overcoming Inertia|
|     Fear of Failure|
|Career Indoctrina...|
|The Carrot on a S...|
|      Ego Protection|
|Your Employer as ...|
+--------------------+
only showing top 20 rows


In [7]:
words = inputDf.select(func.explode(func.split(inputDf.value,"\\W+")).alias("word")) # split the data to words and create new rows using explode function
words.filter(words.word != "")
words.show()

+----------+
|      word|
+----------+
|      Self|
|Employment|
|  Building|
|        an|
|  Internet|
|  Business|
|        of|
|       One|
| Achieving|
| Financial|
|       and|
|  Personal|
|   Freedom|
|   through|
|         a|
| Lifestyle|
|Technology|
|  Business|
|        By|
|     Frank|
+----------+
only showing top 20 rows


In [10]:
lowercase_words = words.select(func.lower(words.word).alias("word"))
lowercase_words.show()

+----------+
|      word|
+----------+
|      self|
|employment|
|  building|
|        an|
|  internet|
|  business|
|        of|
|       one|
| achieving|
| financial|
|       and|
|  personal|
|   freedom|
|   through|
|         a|
| lifestyle|
|technology|
|  business|
|        by|
|     frank|
+----------+
only showing top 20 rows


In [15]:
word_counts = lowercase_words.groupBy("word").count().sort("count",ascending=False)
word_counts.show()

+--------+-----+
|    word|count|
+--------+-----+
|     you| 1878|
|      to| 1828|
|    your| 1420|
|     the| 1292|
|       a| 1191|
|      of|  970|
|     and|  934|
|        |  772|
|    that|  747|
|      it|  649|
|      in|  616|
|      is|  560|
|     for|  537|
|      on|  428|
|     are|  424|
|      if|  411|
|       s|  391|
|       i|  387|
|business|  383|
|     can|  376|
+--------+-----+
only showing top 20 rows


## We can also do this with RDDs

In [18]:
import re
def normalizeWords(lines):
    return re.compile(r'\W+', re.UNICODE).split(lines.lower())




In [19]:
lines = sc.textFile(bucket_name)
words_rdd = lines.flatMap(normalizeWords)
words_rdd.take(5)

['self', 'employment', 'building', 'an', 'internet']


In [20]:
result = words_rdd.map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y)
result.take(5)

[('', 772), ('book', 39), ('4', 9), ('months', 23), ('once', 59)]


In [22]:
result_sorted = result.map(lambda x : (x[1], x[0])).sortByKey(ascending = False)
result_sorted.take(5)

[(1878, 'you'), (1828, 'to'), (1420, 'your'), (1292, 'the'), (1191, 'a')]


## We can also combine RDDs with DF to get the best of both worlds

In [23]:
import re
def normalizeWords(lines):
    return re.compile(r'\W+', re.UNICODE).split(lines.lower())




In [24]:
lines = sc.textFile(bucket_name)
words_rdd = lines.flatMap(normalizeWords)
words_rdd.take(5)

['self', 'employment', 'building', 'an', 'internet']


In [25]:
result = words_rdd.map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y)
result.take(5)

[('self', 111), ('', 772), ('s', 391), ('unlimited', 6), ('where', 53)]


In [30]:
from pyspark.sql import Row
def createSchema(lines):
    return Row(word=str(lines[0]),
                       count=int(lines[1]))




In [31]:
words_mapped_for_df = result.map(createSchema)
words_df_from_rdd = spark.createDataFrame(words_mapped_for_df)
words_df_from_rdd.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|       self|  111|
|           |  772|
|          s|  391|
|  unlimited|    6|
|      where|   53|
|       work|  144|
|      other|   78|
|    results|   25|
|     engine|   13|
|   avoiding|    5|
|       book|   39|
|   improved|    2|
|   quitting|   10|
|        but|  242|
|     matter|   13|
|        out|  161|
|responsibly|    3|
|possibility|    3|
|       good|   72|
|    devoted|    1|
+-----------+-----+
only showing top 20 rows


In [33]:
words_df_from_rdd.sort("count",ascending = False).show()

+--------+-----+
|    word|count|
+--------+-----+
|     you| 1878|
|      to| 1828|
|    your| 1420|
|     the| 1292|
|       a| 1191|
|      of|  970|
|     and|  934|
|        |  772|
|    that|  747|
|      it|  649|
|      in|  616|
|      is|  560|
|     for|  537|
|      on|  428|
|     are|  424|
|      if|  411|
|       s|  391|
|       i|  387|
|business|  383|
|     can|  376|
+--------+-----+
only showing top 20 rows
