# Topic Model Notebook
Author: Andrew  

This notebook will outline the steps used when cleaning the raw articles from Towards Data Science. 

In [1]:
# Load Libraries
import pyspark

In [2]:
# start SparkSession
spark = pyspark.sql.SparkSession.builder.config("spark.driver.memory", "15g").getOrCreate()
spark.getActiveSession()

## Clean Data

In [3]:
# Load CSV into Spark

from pyspark.sql.types import *
import pandas as pd

df = pd.read_csv('../src/TDS_articles.csv', index_col=0)

mySchema = StructType([ StructField("title", StringType(), True)\
                       ,StructField("subtitle", StringType(), True)\
                       ,StructField("author", StringType(), True)\
                       ,StructField("date", StringType(), True)\
                       ,StructField("body", StringType(), True)\
                       ,StructField("link", StringType(), True)\
                       ,StructField("article_id", IntegerType(), True)])

articles = spark.createDataFrame(df, schema=mySchema)

In [4]:
# Delete the pandas df to save memory
del df

In [5]:
articles.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+
|               title|            subtitle|              author|      date|                body|                link|article_id|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+
|Lessons Learned F...| Be prepared to code|     John Wittenauer|2014-11-25|This content orig...|https://towardsda...|         1|
|   The Next Big Wave|                 NaN|       Salman Naseer|2017-03-03|IoT, Big Data, M2...|https://towardsda...|      1580|
|Thinking about Da...|What might DSaaP ...|       Chris Dowsett|2016-05-29|The “usability of...|https://towardsda...|      1657|
|The Science / Eng...|                 NaN|          Jenny Kwan|2015-07-13|As I wrote about ...|https://towardsda...|      1710|
|So You Want to be...|                 NaN|          Jenny Kwan|2015-07-13|We could discuss ...|h

In [5]:
# register SQL table
articles.registerTempTable('articles')

In [6]:
query = """
SELECT * FROM articles;
"""
spark.sql(query).show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+
|               title|            subtitle|              author|      date|                body|                link|article_id|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+
|Lessons Learned F...| Be prepared to code|     John Wittenauer|2014-11-25|This content orig...|https://towardsda...|         1|
|   The Next Big Wave|                 NaN|       Salman Naseer|2017-03-03|IoT, Big Data, M2...|https://towardsda...|      1580|
|Thinking about Da...|What might DSaaP ...|       Chris Dowsett|2016-05-29|The “usability of...|https://towardsda...|      1657|
|The Science / Eng...|                 NaN|          Jenny Kwan|2015-07-13|As I wrote about ...|https://towardsda...|      1710|
|So You Want to be...|                 NaN|          Jenny Kwan|2015-07-13|We could discuss ...|h

In [8]:
query = """
SELECT COUNT(subtitle)
FROM articles
WHERE subtitle like CONCAT('%', author, '%');
"""

spark.sql(query).show()

+---------------+
|count(subtitle)|
+---------------+
|             85|
+---------------+



Some of the Authors are replicated in the subtitles. 

In [9]:
query = """
SELECT subtitle, author
FROM articles
WHERE subtitle like CONCAT('%', author, '%');
"""

spark.sql(query).show()

+--------------------+--------------------+
|            subtitle|              author|
+--------------------+--------------------+
|by Jose Marcial P...|Jose Marcial Port...|
|ashispapu (Ashis ...|         Ashis Samal|
|By Eli Bildner, E...|         Eli Bildner|
|Using Machine Lea...|          DeviceHive|
|RecSys Week 1: Th...|                   R|
|WHAT DO WE DO TO ...|                   T|
|Sukant Khurana (@...|      Sukant Khurana|
|Sukant Khurana (@...|      Sukant Khurana|
|                 NaN|                 NaN|
|                 NaN|                 NaN|
|Co-Authors: Konst...|  Konstantinos Bozas|
|                 NaN|                 NaN|
|Naveen Manwani - ...|      Naveen Manwani|
|By Zina Akrout, S...|     Samantha Bansil|
|Applying François...|                   A|
|You will soon be ...|                   Y|
|Laurent El Ghaoui...|   Laurent El Ghaoui|
|By Werlindo Mangr...|          Mia Iseman|
|Written by Vivian...| Viviane Lindenbergh|
|Rocket (Data) Sci...|         Y

In [5]:
# Register cleaning function as UDF 
from cleaning import clean_doc
from pyspark.sql.functions import udf

clean_udf = udf(lambda doc: clean_doc(doc), StringType())

In [6]:
# Clean "body" with udf
clean_df = articles.withColumn("clean_body", clean_udf("body"))

In [7]:
clean_df.show(5)

+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+--------------------+
|               title|            subtitle|         author|      date|                body|                link|article_id|          clean_body|
+--------------------+--------------------+---------------+----------+--------------------+--------------------+----------+--------------------+
|Lessons Learned F...| Be prepared to code|John Wittenauer|2014-11-25|This content orig...|https://towardsda...|         1|content originall...|
|   The Next Big Wave|                 NaN|  Salman Naseer|2017-03-03|IoT, Big Data, M2...|https://towardsda...|      1580|iot big data m m ...|
|Thinking about Da...|What might DSaaP ...|  Chris Dowsett|2016-05-29|The “usability of...|https://towardsda...|      1657|usability datum a...|
|The Science / Eng...|                 NaN|     Jenny Kwan|2015-07-13|As I wrote about ...|https://towardsda...|      1710|write p

In [10]:
clean_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- subtitle: string (nullable = true)
 |-- author: string (nullable = true)
 |-- date: string (nullable = true)
 |-- body: string (nullable = true)
 |-- link: string (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- clean_body: string (nullable = true)



## Preprocessing - Get Document Term Matrix (dtm) 

In [8]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA


tk = Tokenizer(inputCol='clean_body', outputCol='tokens')
vectorizer = CountVectorizer(inputCol=tk.getOutputCol(), outputCol='term_freq', minDF=0.1, )
lda_model = LDA()


In [10]:
pipe = Pipeline(stages=[tk, vectorizer, lda_model])

In [9]:
clean_df = tk.transform(clean_df)

In [10]:
clean_df.select('tokens').show(5)

+--------------------+
|              tokens|
+--------------------+
|[content, origina...|
|[iot, big, data, ...|
|[usability, datum...|
|[write, previousl...|
|[discuss, suspect...|
+--------------------+
only showing top 5 rows



In [None]:
# Do NOT Run this Cell until you talk to a TA! takes a LOT of memory
cv = vectorizer.fit(clean_df)
dtm = cv.transform(clean_df)

In [None]:
dtm.show(5)