In [8]:
from pyspark.sql.functions import *
import time
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [9]:
text_file = sc.textFile("gs://stackoverflow-dataset-677/Posts.xml")

In [10]:
text_file.take(3)

['<?xml version="1.0" encoding="utf-8"?>',
 '<posts>',
 '  <row Id="1" PostTypeId="1" AcceptedAnswerId="2" CreationDate="2010-07-28T19:04:21.300" Score="61" ViewCount="3978" Body="&lt;p&gt;Every time I turn on my computer, I see a message saying something like:&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;Your battery may be old or broken.&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;&#xA;&lt;p&gt;I am already aware that my battery is bad. How do I suppress this message?&lt;/p&gt;&#xA;" OwnerUserId="5" LastEditorUserId="208574" LastEditDate="2014-12-16T01:47:45.980" LastActivityDate="2018-10-05T23:56:48.997" Title="How to get the &quot;Your battery is broken&quot; message to go away?" Tags="&lt;power-management&gt;&lt;notification&gt;" AnswerCount="4" CommentCount="2" FavoriteCount="7" ContentLicense="CC BY-SA 3.0" />']

In [11]:
filteredRDD = text_file.filter(lambda x: x.startswith("  <row "))

In [12]:
cleanedRDD = filteredRDD.map(lambda x: x.lstrip("  "))

In [33]:
cleanedRDD.take(1)

['<row Id="1" PostTypeId="1" AcceptedAnswerId="2" CreationDate="2010-07-28T19:04:21.300" Score="61" ViewCount="3978" Body="&lt;p&gt;Every time I turn on my computer, I see a message saying something like:&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;Your battery may be old or broken.&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;&#xA;&lt;p&gt;I am already aware that my battery is bad. How do I suppress this message?&lt;/p&gt;&#xA;" OwnerUserId="5" LastEditorUserId="208574" LastEditDate="2014-12-16T01:47:45.980" LastActivityDate="2018-10-05T23:56:48.997" Title="How to get the &quot;Your battery is broken&quot; message to go away?" Tags="&lt;power-management&gt;&lt;notification&gt;" AnswerCount="4" CommentCount="2" FavoriteCount="7" ContentLicense="CC BY-SA 3.0" />']

In [35]:
import xml.etree.ElementTree as ET

def parse_xml(rdd):
    """
    Read the xml string from rdd, parse and extract the elements,
    then return a list of list.
    """
    root = ET.fromstring(rdd)
    rec = []
    
    if "Id" in root.attrib:
        rec.append(int(root.attrib['Id']))
    else:
        rec.append(0)

    
    if "Score" in root.attrib:
        rec.append(int(root.attrib['Score']))
    else:
        rec.append(0)
    
    if "AnswerCount" in root.attrib:
        rec.append(int(root.attrib['AnswerCount']))
    else:
        rec.append(0)
    
    if "Title" in root.attrib:
        rec.append(root.attrib['Title'])
    else:
        rec.append("N/A")
    
    if "Body" in root.attrib:
        rec.append(root.attrib['Body'].replace("<p>","").replace("</p>","").replace("\n","").replace("<code>","").replace("</code>","").replace("<pre>","").replace("</pre>",""))
    else:
        rec.append("N/A")
    
    if "CreationDate" in root.attrib:
        rec.append(root.attrib['CreationDate'])
    else:
        rec.append("N/A")
        
    if "ViewCount" in root.attrib:
        rec.append(int(root.attrib['ViewCount']))
    else:
        rec.append(0)
        
        

    if "OwnerUserId" in root.attrib:
        rec.append(int(root.attrib['OwnerUserId']))
    else:
        rec.append(0)
    return rec
        

    if "LastEditorUserId" in root.attrib:
        rec.append(int(root.attrib['LastEditorUserId']))
    else:
        rec.append(0)
    return rec

In [36]:
records_rdd = cleanedRDD.map(lambda x : parse_xml(x))

In [37]:
records_rdd.take(1)

[[1,
  61,
  4,
  'How to get the "Your battery is broken" message to go away?',
  'Every time I turn on my computer, I see a message saying something like:Your battery may be old or broken.I am already aware that my battery is bad. How do I suppress this message?',
  '2010-07-28T19:04:21.300',
  3978,
  5]]

In [38]:
posts_data = ["id","post_score","answerCount","title","body","creationDate","ownerUserId","lastEditorUserId"]
posts_df = records_rdd.toDF(posts_data)

In [40]:
posts_df.show()

+---+----------+-----------+--------------------+--------------------+--------------------+-----------+----------------+
| id|post_score|answerCount|               title|                body|        creationDate|ownerUserId|lastEditorUserId|
+---+----------+-----------+--------------------+--------------------+--------------------+-----------+----------------+
|  1|        61|          4|How to get the "Y...|Every time I turn...|2010-07-28T19:04:...|       3978|               5|
|  2|        41|          0|                 N/A|Maybe <a href="ht...|2010-07-28T19:15:...|          0|               4|
|  3|        48|          5|How can I set the...|How can I set the...|2010-07-28T19:21:...|      13236|              35|
|  5|        22|          2|What are some alt...|What are some alt...|2010-07-28T19:23:...|        637|              10|
|  6|        41|          9|How to graphicall...|I have a ubuntu d...|2010-07-28T19:23:...|      23359|              27|
|  7|        26|          6|How 

In [None]:
df = df.dropna()

In [None]:
https://towardsdatascience.com/sentiment-analysis-with-pyspark-bc8e83f80c35