In [66]:
from pyspark.sql.functions import *
import time
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [67]:
text_file = sc.textFile("gs://stackoverflow-dataset-677/Users.xml")

In [68]:
text_file.take(3)

['<?xml version="1.0" encoding="utf-8"?>',
 '<users>',
 '  <row Id="-1" Reputation="1" CreationDate="2010-07-28T16:38:27.683" DisplayName="Community" LastAccessDate="2010-07-28T16:38:27.683" WebsiteUrl="http://meta.stackexchange.com/" Location="on the server farm" AboutMe="&lt;p&gt;Hi, I\'m not really a person.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I\'m a background process that helps keep this site clean!&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I do things like&lt;/p&gt;&#xA;&#xA;&lt;ul&gt;&#xA;&lt;li&gt;Randomly poke old unanswered questions every hour so they get some attention&lt;/li&gt;&#xA;&lt;li&gt;Own community questions and answers so nobody gets unnecessary reputation from them&lt;/li&gt;&#xA;&lt;li&gt;Own downvotes on spam/evil posts that get permanently deleted&lt;/li&gt;&#xA;&lt;li&gt;Own suggested edits from anonymous users&lt;/li&gt;&#xA;&lt;li&gt;&lt;a href=&quot;http://meta.stackexchange.com/a/92006&quot;&gt;Remove abandoned questions&lt;/a&gt;&lt;/li&gt;&#xA;&lt;/ul&gt;&#xA;" Views="0" UpV

In [69]:
filteredRDD = text_file.filter(lambda x: x.startswith("  <row "))

In [70]:
filteredRDD.take(1)

['  <row Id="-1" Reputation="1" CreationDate="2010-07-28T16:38:27.683" DisplayName="Community" LastAccessDate="2010-07-28T16:38:27.683" WebsiteUrl="http://meta.stackexchange.com/" Location="on the server farm" AboutMe="&lt;p&gt;Hi, I\'m not really a person.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I\'m a background process that helps keep this site clean!&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I do things like&lt;/p&gt;&#xA;&#xA;&lt;ul&gt;&#xA;&lt;li&gt;Randomly poke old unanswered questions every hour so they get some attention&lt;/li&gt;&#xA;&lt;li&gt;Own community questions and answers so nobody gets unnecessary reputation from them&lt;/li&gt;&#xA;&lt;li&gt;Own downvotes on spam/evil posts that get permanently deleted&lt;/li&gt;&#xA;&lt;li&gt;Own suggested edits from anonymous users&lt;/li&gt;&#xA;&lt;li&gt;&lt;a href=&quot;http://meta.stackexchange.com/a/92006&quot;&gt;Remove abandoned questions&lt;/a&gt;&lt;/li&gt;&#xA;&lt;/ul&gt;&#xA;" Views="0" UpVotes="19522" DownVotes="185479" AccountId="-1" />']

In [71]:
cleanedRDD = filteredRDD.map(lambda x: x.lstrip("  "))

In [72]:
cleanedRDD.take(1)

['<row Id="-1" Reputation="1" CreationDate="2010-07-28T16:38:27.683" DisplayName="Community" LastAccessDate="2010-07-28T16:38:27.683" WebsiteUrl="http://meta.stackexchange.com/" Location="on the server farm" AboutMe="&lt;p&gt;Hi, I\'m not really a person.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I\'m a background process that helps keep this site clean!&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I do things like&lt;/p&gt;&#xA;&#xA;&lt;ul&gt;&#xA;&lt;li&gt;Randomly poke old unanswered questions every hour so they get some attention&lt;/li&gt;&#xA;&lt;li&gt;Own community questions and answers so nobody gets unnecessary reputation from them&lt;/li&gt;&#xA;&lt;li&gt;Own downvotes on spam/evil posts that get permanently deleted&lt;/li&gt;&#xA;&lt;li&gt;Own suggested edits from anonymous users&lt;/li&gt;&#xA;&lt;li&gt;&lt;a href=&quot;http://meta.stackexchange.com/a/92006&quot;&gt;Remove abandoned questions&lt;/a&gt;&lt;/li&gt;&#xA;&lt;/ul&gt;&#xA;" Views="0" UpVotes="19522" DownVotes="185479" AccountId="-1" />']

In [73]:
import xml.etree.ElementTree as ET

def parse_xml(rdd):
    """
    Read the xml string from rdd, parse and extract the elements,
    then return a list of list.
    """
    root = ET.fromstring(rdd)
    rec = []
    id = root.attrib['Id']
    if id == "-1":
        id = "1"
    rec.append(id)
#     rec.append(root.attrib['Reputation'])
#     rec.append(root.attrib['CreationDate'])
    rec.append(root.attrib['DisplayName'])
    return rec

In [74]:
records_rdd = cleanedRDD.map(lambda x : parse_xml(x))

In [75]:
records_rdd.take(3)

[['1', 'Community'], ['2', 'Geoff Dalgas'], ['3', 'Jarrod Dixon']]

In [76]:
user_data = ["id","username"]
user_df = records_rdd.toDF(user_data)

In [77]:
user_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- username: string (nullable = true)



In [78]:
user_df.show(truncate=False)

+---+----------------+
|id |username        |
+---+----------------+
|1  |Community       |
|2  |Geoff Dalgas    |
|3  |Jarrod Dixon    |
|4  |txwikinger      |
|5  |Nathan Osman    |
|6  |Emmett          |
|7  |Helix           |
|8  |mechanical_meat |
|9  |Andrew          |
|10 |DLH             |
|11 |hannes.koller   |
|12 |Michael Terry   |
|13 |Keith Maurino   |
|14 |Jweede          |
|16 |Jeremy L        |
|17 |tutuca          |
|18 |excid3          |
|20 |ParanoiaPuppy   |
|21 |GeoD            |
|22 |Alan Featherston|
+---+----------------+
only showing top 20 rows



In [79]:
from pyspark.sql.types import StructType,StructField, StringType
userSchema = StructType([       
    StructField('id', StringType(), True),
    StructField('dept_id', StringType(), True)
])

user_df1 = spark.createDataFrame(data=records_rdd, schema = userSchema)

In [80]:
user_df1.printSchema()

root
 |-- id: string (nullable = true)
 |-- dept_id: string (nullable = true)



In [81]:
user_df1.show(truncate=False)

+---+----------------+
|id |dept_id         |
+---+----------------+
|1  |Community       |
|2  |Geoff Dalgas    |
|3  |Jarrod Dixon    |
|4  |txwikinger      |
|5  |Nathan Osman    |
|6  |Emmett          |
|7  |Helix           |
|8  |mechanical_meat |
|9  |Andrew          |
|10 |DLH             |
|11 |hannes.koller   |
|12 |Michael Terry   |
|13 |Keith Maurino   |
|14 |Jweede          |
|16 |Jeremy L        |
|17 |tutuca          |
|18 |excid3          |
|20 |ParanoiaPuppy   |
|21 |GeoD            |
|22 |Alan Featherston|
+---+----------------+
only showing top 20 rows



In [82]:
user_df1.count()

855054

In [84]:
user_df.repartition(1).write.csv("gs://stackoverflow-dataset-677/users_out", sep=',')