In [2]:
from pyspark.sql.functions import *
import time
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [3]:
# Read the raw comments dump from HDFS as an RDD of text lines (one element per line).
text_file = sc.textFile("hdfs:///pp-data/Comments.xml")

In [4]:
# Peek at the first three lines to see the file layout before parsing.
text_file.take(3)

['<?xml version="1.0" encoding="utf-8"?>',
 '<comments>',
 '  <row Id="13" PostId="23" Score="0" Text="Using /opt helps me keep track of the applications I\'ve installed myself." CreationDate="2010-07-28T19:36:59.773" UserId="10" ContentLicense="CC BY-SA 2.5" />']

In [5]:
# Keep only the data rows; lines that do not start with the indented <row tag
# (e.g. the XML declaration and the <comments> root tag) are dropped.
filteredRDD = text_file.filter(lambda line: line.startswith("  <row "))

In [6]:
# Sanity check: the first surviving element should be a <row .../> line.
filteredRDD.take(1)

['  <row Id="13" PostId="23" Score="0" Text="Using /opt helps me keep track of the applications I\'ve installed myself." CreationDate="2010-07-28T19:36:59.773" UserId="10" ContentLicense="CC BY-SA 2.5" />']

In [7]:
# str.lstrip treats its argument as a set of characters, not a prefix, so a
# single space is enough to strip the whole run of leading spaces.
cleanedRDD = filteredRDD.map(lambda line: line.lstrip(" "))

In [8]:
# Confirm the leading indentation is gone.
cleanedRDD.take(1)

['<row Id="13" PostId="23" Score="0" Text="Using /opt helps me keep track of the applications I\'ve installed myself." CreationDate="2010-07-28T19:36:59.773" UserId="10" ContentLicense="CC BY-SA 2.5" />']

In [9]:
import xml.etree.ElementTree as ET

def parse_xml(rdd):
    """
    Parse one XML element string and extract its ``Text`` attribute.

    Despite its name, ``rdd`` is a single string holding one XML element
    (e.g. ``<row ... />``), not an RDD.

    Returns a one-element list: the value of the ``Text`` attribute, or the
    placeholder "N/A" when the element has no such attribute.
    """
    element = ET.fromstring(rdd)
    # dict.get covers both the present and the missing-attribute case.
    return [element.attrib.get('Text', "N/A")]

In [10]:
# Turn each <row .../> string into a one-element record holding the comment text.
records_rdd = cleanedRDD.map(parse_xml)

In [11]:
# Inspect the first three parsed records (each is a one-element list of text).
records_rdd.take(3)

[["Using /opt helps me keep track of the applications I've installed myself."],
 ["but popping in a live cd I already have isn't going to work huh?"],
 ['That will revert the splash screen as well as the login? I almost did that, but grew hesitant.']]

In [49]:
# Load the stop-word file from the local filesystem (file:// scheme) as an RDD of lines.
# NOTE(review): this is an absolute path under a personal home directory, so the
# notebook is not portable as written; presumably the file must also be readable at
# this same path on every worker node. Consider placing it next to the dataset — confirm.
stop_words_text = sc.textFile("file:///home/aarora7/P4-ayush-adarsh/03 stopwords.txt")

In [51]:
# Peek at the first three stop-word lines.
stop_words_text.take(3)

['a', 'about', 'above']

In [52]:
# Split every line on whitespace so each RDD element becomes a list of tokens,
# i.e. a row-shaped record that can be turned into a DataFrame.
stop_words_rdd = stop_words_text.map(lambda line: line.split())

In [53]:
# Each element should now be a list of tokens rather than a bare string.
stop_words_rdd.take(3)

[['a'], ['about'], ['above']]

In [54]:
# Build a DataFrame from the token lists. No schema is given, so the column gets
# the auto-generated name `_1`, which the following cells rely on.
stop_words_df = spark.createDataFrame(stop_words_rdd)

In [55]:
# Display the first 20 stop words.
stop_words_df.show()

+--------+
|      _1|
+--------+
|       a|
|   about|
|   above|
|  across|
|   after|
|   again|
| against|
|     all|
|  almost|
|   alone|
|   along|
| already|
|    also|
|although|
|  always|
|   among|
|      an|
|     and|
| another|
|     any|
+--------+
only showing top 20 rows



In [56]:
# Bring the single `_1` column back to the driver as a plain Python list so it
# can be referenced from inside RDD functions.
stop_words_list = stop_words_df.select('_1').toPandas()['_1'].tolist()
stop_words_list

['a',
 'about',
 'above',
 'across',
 'after',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'among',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyone',
 'anything',
 'anywhere',
 'are',
 'area',
 'areas',
 'around',
 'as',
 'ask',
 'asked',
 'asking',
 'asks',
 'at',
 'away',
 'b',
 'back',
 'backed',
 'backing',
 'backs',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'been',
 'before',
 'began',
 'behind',
 'being',
 'beings',
 'best',
 'better',
 'between',
 'big',
 'both',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'case',
 'cases',
 'certain',
 'certainly',
 'clear',
 'clearly',
 'come',
 'could',
 'd',
 'did',
 'differ',
 'different',
 'differently',
 'do',
 'does',
 'done',
 'down',
 'down',
 'downed',
 'downing',
 'downs',
 'during',
 'e',
 'each',
 'early',
 'either',
 'end',
 'ended',
 'ending',
 'ends',
 'enough',
 'even',
 'evenly',
 'ever',
 'every',
 'everybody',
 'everyone',
 'everything',

In [57]:
def remove_stop_word(x, stop_words=None):
    """
    Tokenise one parsed comment and emit ``(token, 1)`` pairs for word counting.

    x          -- a record whose first element is the comment text
                  (the shape produced by ``parse_xml``).
    stop_words -- optional iterable of words to discard; defaults to the
                  notebook-level ``stop_words_list``.

    The text is split on single spaces and every token is lower-cased.
    A token is dropped when it is empty, is a stop word, or starts or ends
    with one of ``.``, ``?`` or ``&``.

    Returns a list of ``(token, 1)`` tuples in input order.
    """
    if stop_words is None:
        stop_words = stop_words_list
    # Hash-based lookup: membership in a list is a linear scan for every token.
    excluded = frozenset(stop_words)
    edge_chars = ('.', '?', '&')

    pairs = []
    for token in x[0].split(" "):
        token = token.lower()
        # Consecutive spaces make split(" ") yield '' — never a real topic.
        if not token or token in excluded:
            continue
        # startswith/endswith accept a tuple of alternatives.
        if token.startswith(edge_chars) or token.endswith(edge_chars):
            continue
        pairs.append((token, 1))
    return pairs

In [58]:
# Flatten every record's (token, 1) pairs into one RDD of pairs.
cleaned_text = records_rdd.flatMap(remove_stop_word)

In [59]:
# Inspect the first three (token, 1) pairs.
cleaned_text.take(3)

[('/opt', 1), ('helps', 1), ('track', 1)]

In [60]:
# Sum the 1s per token to get each token's total number of occurrences.
topic_count = cleaned_text.reduceByKey(lambda running, increment: increment + running)

In [61]:
# Inspect three (token, total) pairs; the order is arbitrary at this stage.
topic_count.take(3)

[('manpage,', 44), ('python', 6727), ("doesn't", 45536)]

In [62]:
# Name the two tuple positions when converting the (token, total) RDD to a DataFrame.
topic_count_col = ["topic", "count"]
topic_count_data = topic_count.toDF(schema=topic_count_col)

In [63]:
# Display the first 20 rows of the (topic, count) table, still unsorted.
topic_count_data.show()

+------------+-----+
|       topic|count|
+------------+-----+
|    manpage,|   44|
|      python| 6727|
|     doesn't|45536|
|      @muru,|  315|
|     exposes|   52|
|        sane|  289|
|     package|26528|
|      expose|  162|
|    because,|  284|
|        hope| 7065|
|        mean|19178|
|     manpage|  380|
| pepperflash|   47|
|      player| 1593|
|adobe-flash,|    1|
|      plugin| 2332|
|         cat| 1564|
|         pay|  753|
|      follow| 6359|
|     instead|18999|
+------------+-----+
only showing top 20 rows



In [64]:
# Rank topics from most to least frequent. Going through the `F` alias makes it
# explicit where `col` comes from instead of leaning on the star import.
final = topic_count_data.orderBy(F.col('count').desc())

In [None]:
# Write the ranked counts to HDFS as a single CSV part file.
# coalesce(1) is used instead of repartition(1): repartition always performs a
# shuffle, which does not guarantee the row order established by orderBy, so the
# output file could come out unsorted. coalesce merges partitions without a shuffle.
final.coalesce(1).write.csv("hdfs:///topic_count", sep=',')