In [1]:
from pyspark.sql import SparkSession

# SparkSession is an entry point to PySpark's functionality within a program. This entry point lets you access the features of Spark through Python. 

In [2]:
spark = (SparkSession.builder.appName("Converting articles into BoW Vectors").getOrCreate())

In [3]:
spark

In [4]:
data = spark.read.csv("all-the-news-2-1.csv")

# The older version of PySpark used to store the data in special kind of Data Structure called Resilient Distributed Dataset (RDD). The special feature of this data structure is that it is Row Major. 

# The new version of PySpark has also introduced a column major data structure called PySpark DataFrames. 

In [5]:
print(data)

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string]


In [6]:
data.show(5)

+--------------------+--------------------+-------------------+----------------+-----+----+-----------+--------------------+--------------------+--------------------+--------------------+------------+
|                 _c0|                 _c1|                _c2|             _c3|  _c4| _c5|        _c6|                 _c7|                 _c8|                 _c9|                _c10|        _c11|
+--------------------+--------------------+-------------------+----------------+-----+----+-----------+--------------------+--------------------+--------------------+--------------------+------------+
|                null|          Unnamed: 0|               date|            year|month| day|     author|               title|             article|                 url|             section| publication|
|                   0|                   0|2016-12-09 18:31:00|            2016| 12.0|   9|Lee Drutman|We should take co...|"This post is par...|             however| several critics ...| for exam

In [7]:
data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [8]:
print(data.dtypes)

[('_c0', 'string'), ('_c1', 'string'), ('_c2', 'string'), ('_c3', 'string'), ('_c4', 'string'), ('_c5', 'string'), ('_c6', 'string'), ('_c7', 'string'), ('_c8', 'string'), ('_c9', 'string'), ('_c10', 'string'), ('_c11', 'string')]


In [9]:
data.show()

+--------------------+--------------------+-------------------+----------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _c0|                 _c1|                _c2|             _c3|  _c4| _c5|                 _c6|                 _c7|                 _c8|                 _c9|                _c10|                _c11|
+--------------------+--------------------+-------------------+----------------+-----+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                null|          Unnamed: 0|               date|            year|month| day|              author|               title|             article|                 url|             section|         publication|
|                   0|                   0|2016-12-09 18:31:00|            2016| 12.0|   9|         Lee Drutman|We should take c

In [10]:
from pyspark.sql.functions import col

In [11]:
text_column = data.select("_c8")

In [12]:
text_column.show()

+--------------------+
|                 _c8|
+--------------------+
|             article|
|"This post is par...|
| The Indianapolis...|
|                null|
|DAVOS, Switzerlan...|
|PARIS (Reuters) -...|
|"Paris Hilton arr...|
|BERLIN, June 17 (...|
|                null|
|CARACAS (Reuters)...|
|"If only every da...|
|Google I/O, the c...|
|China is dismissi...|
|Elizabeth Warren ...|
|(Reuters) - The s...|
|Joakim Noah's ﻿mo...|
|"Jermaine Jackson...|
|LONDON (Reuters) ...|
|"Nancy Pelosi is ...|
|The nonpartisan d...|
+--------------------+
only showing top 20 rows



In [13]:
text_column = data.select(data["_c8"])

In [14]:
text_column.show()

+--------------------+
|                 _c8|
+--------------------+
|             article|
|"This post is par...|
| The Indianapolis...|
|                null|
|DAVOS, Switzerlan...|
|PARIS (Reuters) -...|
|"Paris Hilton arr...|
|BERLIN, June 17 (...|
|                null|
|CARACAS (Reuters)...|
|"If only every da...|
|Google I/O, the c...|
|China is dismissi...|
|Elizabeth Warren ...|
|(Reuters) - The s...|
|Joakim Noah's ﻿mo...|
|"Jermaine Jackson...|
|LONDON (Reuters) ...|
|"Nancy Pelosi is ...|
|The nonpartisan d...|
+--------------------+
only showing top 20 rows



In [15]:
text_column = data.select(data._c8)

In [16]:
text_column.show()

+--------------------+
|                 _c8|
+--------------------+
|             article|
|"This post is par...|
| The Indianapolis...|
|                null|
|DAVOS, Switzerlan...|
|PARIS (Reuters) -...|
|"Paris Hilton arr...|
|BERLIN, June 17 (...|
|                null|
|CARACAS (Reuters)...|
|"If only every da...|
|Google I/O, the c...|
|China is dismissi...|
|Elizabeth Warren ...|
|(Reuters) - The s...|
|Joakim Noah's ﻿mo...|
|"Jermaine Jackson...|
|LONDON (Reuters) ...|
|"Nancy Pelosi is ...|
|The nonpartisan d...|
+--------------------+
only showing top 20 rows



In [17]:
text_column = data.select(col("_c8"))

In [18]:
text_column.show()

+--------------------+
|                 _c8|
+--------------------+
|             article|
|"This post is par...|
| The Indianapolis...|
|                null|
|DAVOS, Switzerlan...|
|PARIS (Reuters) -...|
|"Paris Hilton arr...|
|BERLIN, June 17 (...|
|                null|
|CARACAS (Reuters)...|
|"If only every da...|
|Google I/O, the c...|
|China is dismissi...|
|Elizabeth Warren ...|
|(Reuters) - The s...|
|Joakim Noah's ﻿mo...|
|"Jermaine Jackson...|
|LONDON (Reuters) ...|
|"Nancy Pelosi is ...|
|The nonpartisan d...|
+--------------------+
only showing top 20 rows



In [19]:
from pyspark.sql.functions import split

In [20]:
text_column = data.select(split(col("_c8")," ").alias("article"))

In [21]:
text_column.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                             article|
+----------------------------------------------------------------------------------------------------+
|                                                                                           [article]|
|["This, post, is, part, of, Polyarchy,, an, independent, blog, produced, by, the, political, refo...|
|[, The, Indianapolis, Colts, made, Andrew, Luck, the, highest-paid, player, in, NFL, history, thi...|
|                                                                                                null|
|[DAVOS,, Switzerland, (Reuters), -, U.S., President, Donald, Trump, denied, a, report, on, Friday...|
|[PARIS, (Reuters), -, Former, French, president, Nicolas, Sarkozy, published, a, new, memoir, on,...|
|["Paris, Hilton, arrived, at, LAX, Wednesday, dressed, to, pay, her, las

In [22]:
from pyspark.sql.functions import explode

In [23]:
tokens = text_column.select(explode(col("article")).alias("tokens"))

In [24]:
tokens.show()

+-----------+
|     tokens|
+-----------+
|    article|
|      "This|
|       post|
|         is|
|       part|
|         of|
| Polyarchy,|
|         an|
|independent|
|       blog|
|   produced|
|         by|
|        the|
|  political|
|     reform|
|    program|
|         at|
|        New|
|   America,|
|          a|
+-----------+
only showing top 20 rows



In [25]:
from pyspark.sql.functions import lower

In [26]:
normalized_tokens = tokens.select(lower(col("tokens")).alias("normalized tokens"))

In [27]:
normalized_tokens.show()

+-----------------+
|normalized tokens|
+-----------------+
|          article|
|            "this|
|             post|
|               is|
|             part|
|               of|
|       polyarchy,|
|               an|
|      independent|
|             blog|
|         produced|
|               by|
|              the|
|        political|
|           reform|
|          program|
|               at|
|              new|
|         america,|
|                a|
+-----------------+
only showing top 20 rows



In [28]:
from pyspark.sql.functions import regexp_extract

In [29]:
clean_tokens = normalized_tokens.select(regexp_extract(col("normalized tokens"),"[a-z]*",0).
                                        alias("Cleaned Tokens"))

In [30]:
clean_tokens.show()

+--------------+
|Cleaned Tokens|
+--------------+
|       article|
|              |
|          post|
|            is|
|          part|
|            of|
|     polyarchy|
|            an|
|   independent|
|          blog|
|      produced|
|            by|
|           the|
|     political|
|        reform|
|       program|
|            at|
|           new|
|       america|
|             a|
+--------------+
only showing top 20 rows



In [31]:
non_null_cleaned_tokens = clean_tokens.filter(col("Cleaned Tokens") != "")

In [32]:
non_null_cleaned_tokens.show()

+--------------+
|Cleaned Tokens|
+--------------+
|       article|
|          post|
|            is|
|          part|
|            of|
|     polyarchy|
|            an|
|   independent|
|          blog|
|      produced|
|            by|
|           the|
|     political|
|        reform|
|       program|
|            at|
|           new|
|       america|
|             a|
|    washington|
+--------------+
only showing top 20 rows



In [33]:
token_groups = non_null_cleaned_tokens.groupby(col("Cleaned Tokens"))

In [34]:
token_groups

<pyspark.sql.group.GroupedData at 0x28ef4cc86d0>

In [35]:
token_counts = token_groups.count()

In [36]:
token_counts.show()

+--------------+-------+
|Cleaned Tokens|  count|
+--------------+-------+
|          some|1348511|
|    likelihood|  12071|
|         still| 682599|
|         those| 741766|
|      tortured|   5220|
|        online| 200524|
|           few| 380259|
|   transaction|  31312|
|     indicator|   8984|
|     involving|  42441|
|      incoming|  12798|
|     connected|  40889|
|       jewelry|  15130|
|        bazaar|   2039|
|  safeguarding|   1675|
|        filing|  58015|
|        brands|  55095|
|       flashed|   2282|
|   interaction|   9840|
|           art| 183475|
+--------------+-------+
only showing top 20 rows



In [37]:
token_counts.orderBy("count",ascending=False).show(100)

+--------------+--------+
|Cleaned Tokens|   count|
+--------------+--------+
|           the|49614097|
|            to|25278753|
|            of|22664966|
|             a|22242784|
|           and|21699171|
|            in|18907734|
|          that|10654902|
|            on| 9087425|
|           for| 9066347|
|            is| 7811476|
|            it| 7043139|
|          with| 6416271|
|            as| 5385847|
|            by| 5363492|
|           was| 5249367|
|          said| 4944904|
|            at| 4684935|
|            he| 4379023|
|          from| 4267604|
|           are| 3800689|
|            be| 3782725|
|           has| 3751435|
|          have| 3703778|
|           but| 3560702|
|            an| 3538650|
|             i| 3406538|
|          this| 3401500|
|           his| 3371237|
|           not| 3152063|
|           you| 2872337|
|          they| 2779378|
|          more| 2692541|
|           its| 2616173|
|           who| 2578406|
|          will| 2493764|
|         th

# We are leaving this task of finding top 100 most frequently occurring tokens as it is at this point of time and we will be back to improve this code after learning about some advancd topics in pyspark dataframes. For time being, we are going to take a diversion and learn about some more basic operations which will lead us to advanced topics. 

In [39]:
data.select(*data.columns[:3]).show()

+--------------------+--------------------+-------------------+
|                 _c0|                 _c1|                _c2|
+--------------------+--------------------+-------------------+
|                null|          Unnamed: 0|               date|
|                   0|                   0|2016-12-09 18:31:00|
|                   1|                   1|2016-10-07 21:26:46|
|The highest-paid ...|https://www.busin...|               null|
|                   2|                   2|2018-01-26 00:00:00|
|                   3|                   3|2019-06-27 00:00:00|
|                   4|                   4|2016-01-27 00:00:00|
|                   5|                   5|2019-06-17 00:00:00|
|Editing by Tassil...|https://www.reute...|         Financials|
|                   6|                   6|2019-06-23 00:00:00|
|                   7|                   7|2018-05-02 17:09:00|
|                   8|                   8|2016-05-18 13:00:06|
|                   9|                  

In [46]:
data.select(*data.columns[7:12]).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _c7|                 _c8|                 _c9|                _c10|                _c11|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|             article|                 url|             section|         publication|
|We should take co...|"This post is par...|             however| several critics ...|         for example|
|Colts GM Ryan Gri...| The Indianapolis...|                null|                null|                null|
|                null|                null|                null|                null|                null|
|Trump denies repo...|DAVOS, Switzerlan...|https://www.reute...|               Davos|             Reuters|
|France's Sarkozy ...|PARIS (Reuters) -...|https://www.reute...|          World News|             Reuters|
|Paris Hilton: Wom...|"Paris Hilton a

In [47]:
data.select("_c7","_c10","_c11").show()

+--------------------+--------------------+--------------------+
|                 _c7|                _c10|                _c11|
+--------------------+--------------------+--------------------+
|               title|             section|         publication|
|We should take co...| several critics ...|         for example|
|Colts GM Ryan Gri...|                null|                null|
|                null|                null|                null|
|Trump denies repo...|               Davos|             Reuters|
|France's Sarkozy ...|          World News|             Reuters|
|Paris Hilton: Wom...|                null|                 TMZ|
|ECB's Coeure: If ...|                null|                null|
|                null|                null|                null|
|Venezuela detains...|          World News|             Reuters|
|You Can Trick You...| paying attention...| it makes it easi...|
|How to watch the ...|                null|                 Vox|
|China is dismissi...|   

In [48]:
data.select(col("_c7"),col("_c10"),col("_c11")).show()

+--------------------+--------------------+--------------------+
|                 _c7|                _c10|                _c11|
+--------------------+--------------------+--------------------+
|               title|             section|         publication|
|We should take co...| several critics ...|         for example|
|Colts GM Ryan Gri...|                null|                null|
|                null|                null|                null|
|Trump denies repo...|               Davos|             Reuters|
|France's Sarkozy ...|          World News|             Reuters|
|Paris Hilton: Wom...|                null|                 TMZ|
|ECB's Coeure: If ...|                null|                null|
|                null|                null|                null|
|Venezuela detains...|          World News|             Reuters|
|You Can Trick You...| paying attention...| it makes it easi...|
|How to watch the ...|                null|                 Vox|
|China is dismissi...|   

In [49]:
import numpy as np

In [50]:
column_chunks = np.array_split(np.array(data.columns),len(data.columns)//3)

In [51]:
column_chunks

[array(['_c0', '_c1', '_c2'], dtype='<U4'),
 array(['_c3', '_c4', '_c5'], dtype='<U4'),
 array(['_c6', '_c7', '_c8'], dtype='<U4'),
 array(['_c9', '_c10', '_c11'], dtype='<U4')]

In [52]:
for x in column_chunks:
    data.select(*x).show()

+--------------------+--------------------+-------------------+
|                 _c0|                 _c1|                _c2|
+--------------------+--------------------+-------------------+
|                null|          Unnamed: 0|               date|
|                   0|                   0|2016-12-09 18:31:00|
|                   1|                   1|2016-10-07 21:26:46|
|The highest-paid ...|https://www.busin...|               null|
|                   2|                   2|2018-01-26 00:00:00|
|                   3|                   3|2019-06-27 00:00:00|
|                   4|                   4|2016-01-27 00:00:00|
|                   5|                   5|2019-06-17 00:00:00|
|Editing by Tassil...|https://www.reute...|         Financials|
|                   6|                   6|2019-06-23 00:00:00|
|                   7|                   7|2018-05-02 17:09:00|
|                   8|                   8|2016-05-18 13:00:06|
|                   9|                  

In [53]:
dropped_data = data.drop(*data.columns[:7])

In [55]:
dropped_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _c7|                 _c8|                 _c9|                _c10|                _c11|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|             article|                 url|             section|         publication|
|We should take co...|"This post is par...|             however| several critics ...|         for example|
|Colts GM Ryan Gri...| The Indianapolis...|                null|                null|                null|
|                null|                null|                null|                null|                null|
|Trump denies repo...|DAVOS, Switzerlan...|https://www.reute...|               Davos|             Reuters|
|France's Sarkozy ...|PARIS (Reuters) -...|https://www.reute...|          World News|             Reuters|
|Paris Hilton: Wom...|"Paris Hilton a

In [56]:
D = dict(zip(dropped_data.columns,['title','article','url','section','publication']))

In [57]:
D

{'_c7': 'title',
 '_c8': 'article',
 '_c9': 'url',
 '_c10': 'section',
 '_c11': 'publication'}

In [58]:
for old_name,new_name in D.items():
    dropped_data = dropped_data.withColumnRenamed(old_name,new_name)

In [60]:
dropped_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|             article|                 url|             section|         publication|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|             article|                 url|             section|         publication|
|We should take co...|"This post is par...|             however| several critics ...|         for example|
|Colts GM Ryan Gri...| The Indianapolis...|                null|                null|                null|
|                null|                null|                null|                null|                null|
|Trump denies repo...|DAVOS, Switzerlan...|https://www.reute...|               Davos|             Reuters|
|France's Sarkozy ...|PARIS (Reuters) -...|https://www.reute...|          World News|             Reuters|
|Paris Hilton: Wom...|"Paris Hilton a

In [61]:
data_copy = spark.read.csv("all-the-news-2-1.csv",inferSchema=True)