In [1]:
from pyspark.sql.functions import *
import time
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
# Read Comments.xml from the GCS bucket as an RDD with one element per text line.
# NOTE(review): `sc` is not defined in this notebook; presumably the kernel pre-creates the SparkContext — confirm.
text_file = sc.textFile("gs://stackoverflow-dataset-677/Comments.xml")

In [3]:
# Peek at the first three lines to see how the file is laid out.
text_file.take(3)

['<?xml version="1.0" encoding="utf-8"?>',
 '<comments>',
 '  <row Id="13" PostId="23" Score="0" Text="Using /opt helps me keep track of the applications I\'ve installed myself." CreationDate="2010-07-28T19:36:59.773" UserId="10" ContentLicense="CC BY-SA 2.5" />']

In [4]:
# Keep only lines that begin with the two-space-indented `<row ` tag; this drops
# the XML declaration and the `<comments>` root tag seen in the preview.
filteredRDD = text_file.filter(lambda x: x.startswith("  <row "))

In [5]:
# Sanity check: the first remaining element should be a <row .../> line.
filteredRDD.take(1)

['  <row Id="13" PostId="23" Score="0" Text="Using /opt helps me keep track of the applications I\'ve installed myself." CreationDate="2010-07-28T19:36:59.773" UserId="10" ContentLicense="CC BY-SA 2.5" />']

In [6]:
# Remove the leading indentation from every row. str.lstrip treats its argument
# as a set of characters, so a single space strips any number of leading spaces.
cleanedRDD = filteredRDD.map(lambda line: line.lstrip(" "))

In [7]:
# Sanity check: the row should now start directly with `<row`.
cleanedRDD.take(1)

['<row Id="13" PostId="23" Score="0" Text="Using /opt helps me keep track of the applications I\'ve installed myself." CreationDate="2010-07-28T19:36:59.773" UserId="10" ContentLicense="CC BY-SA 2.5" />']

In [94]:
import xml.etree.ElementTree as ET

def parse_xml(rdd):
    """
    Parse one ``<row ... />`` XML string into a flat record.

    Despite its name, ``rdd`` is a single XML string (one element of the
    RDD), not an RDD itself.

    Returns a flat list ``[PostId, Score, Text, CreationDate, UserId]``:
    PostId, Score and UserId are converted with ``int`` and default to 0
    when the attribute is absent; Text and CreationDate stay strings and
    default to "N/A".

    Raises ``xml.etree.ElementTree.ParseError`` if the string is not
    well-formed XML, and ``ValueError`` if a numeric attribute is present
    but not an integer.
    """
    # (attribute name, converter, value used when the attribute is absent)
    field_specs = (
        ("PostId", int, 0),
        ("Score", int, 0),
        ("Text", str, "N/A"),
        ("CreationDate", str, "N/A"),
        ("UserId", int, 0),
    )
    attrs = ET.fromstring(rdd).attrib
    return [
        convert(attrs[name]) if name in attrs else default
        for name, convert, default in field_specs
    ]

In [97]:
# Turn every XML row string into a [PostId, Score, Text, CreationDate, UserId] list.
records_rdd = cleanedRDD.map(parse_xml)

In [98]:
# Column names for the parsed records, listed in the same order parse_xml emits the values.
comments_data = ["postId","score","text","creationDate","userId"]
# Only names are supplied here, so the column types are inferred by Spark.
comments_df = records_rdd.toDF(comments_data)

In [99]:
# Inspect the inferred column types.
comments_df.printSchema()

root
 |-- postId: long (nullable = true)
 |-- score: long (nullable = true)
 |-- text: string (nullable = true)
 |-- creationDate: string (nullable = true)
 |-- userId: long (nullable = true)



In [100]:
# Preview the first rows; long cell values are shown truncated.
comments_df.show()

+------+-----+--------------------+--------------------+------+
|postId|score|                text|        creationDate|userId|
+------+-----+--------------------+--------------------+------+
|    23|    0|Using /opt helps ...|2010-07-28T19:36:...|    10|
|    18|    0|but popping in a ...|2010-07-28T19:38:...|    10|
|    27|    0|That will revert ...|2010-07-28T19:39:...|    50|
|    31|    0|I think you meant...|2010-07-28T19:41:...|    12|
|    18|    0|@DLH apparently n...|2010-07-28T19:41:...|    63|
|    12|    2|"ssh -X <server> ...|2010-07-28T19:46:...|    96|
|    12|    0|@Suppressingfire:...|2010-07-28T19:48:...|    10|
|    50|    0|Can you please re...|2010-07-28T19:48:...|    56|
|    27|    0|It probably shoul...|2010-07-28T19:49:...|     5|
|    58|    0|Do you mean the c...|2010-07-28T19:50:...|     5|
|    47|    0|Have you checked ...|2010-07-28T19:50:...|     4|
|    47|    1|Might be related ...|2010-07-28T19:51:...|   104|
|    58|    0|Do you use Gnome ...|2010-

In [101]:
# Project only the postId and score columns.
comments_df.select("postId","score").show()

+------+-----+
|postId|score|
+------+-----+
|    23|    0|
|    18|    0|
|    27|    0|
|    31|    0|
|    18|    0|
|    12|    2|
|    12|    0|
|    50|    0|
|    27|    0|
|    58|    0|
|    47|    0|
|    47|    1|
|    58|    0|
|    60|    0|
|    18|    0|
|    52|    0|
|    56|    2|
|    10|    0|
|    70|    1|
|    70|    0|
+------+-----+
only showing top 20 rows



In [102]:
# Register the DataFrame as the temporary view "comments" so it can be queried with spark.sql.
comments_df.createOrReplaceTempView("comments")

In [103]:
# Query the "comments" view through Spark SQL; SELECT * returns every column.
# NOTE(review): `spark` is not defined in this notebook; presumably the kernel pre-creates the SparkSession — confirm.
comments_sql_df = spark.sql("SELECT * FROM comments")

In [104]:
# Check the schema of the SQL result.
comments_sql_df.printSchema()

root
 |-- postId: long (nullable = true)
 |-- score: long (nullable = true)
 |-- text: string (nullable = true)
 |-- creationDate: string (nullable = true)
 |-- userId: long (nullable = true)



In [105]:
# Preview the rows returned by the SQL query.
comments_sql_df.show()

+------+-----+--------------------+--------------------+------+
|postId|score|                text|        creationDate|userId|
+------+-----+--------------------+--------------------+------+
|    23|    0|Using /opt helps ...|2010-07-28T19:36:...|    10|
|    18|    0|but popping in a ...|2010-07-28T19:38:...|    10|
|    27|    0|That will revert ...|2010-07-28T19:39:...|    50|
|    31|    0|I think you meant...|2010-07-28T19:41:...|    12|
|    18|    0|@DLH apparently n...|2010-07-28T19:41:...|    63|
|    12|    2|"ssh -X <server> ...|2010-07-28T19:46:...|    96|
|    12|    0|@Suppressingfire:...|2010-07-28T19:48:...|    10|
|    50|    0|Can you please re...|2010-07-28T19:48:...|    56|
|    27|    0|It probably shoul...|2010-07-28T19:49:...|     5|
|    58|    0|Do you mean the c...|2010-07-28T19:50:...|     5|
|    47|    0|Have you checked ...|2010-07-28T19:50:...|     4|
|    47|    1|Might be related ...|2010-07-28T19:51:...|   104|
|    58|    0|Do you use Gnome ...|2010-

In [106]:
# Show only the comments whose userId is 10, using a SQL WHERE clause.
spark.sql("SELECT * FROM comments where userId = 10").show()

+------+-----+--------------------+--------------------+------+
|postId|score|                text|        creationDate|userId|
+------+-----+--------------------+--------------------+------+
|    23|    0|Using /opt helps ...|2010-07-28T19:36:...|    10|
|    18|    0|but popping in a ...|2010-07-28T19:38:...|    10|
|    12|    0|@Suppressingfire:...|2010-07-28T19:48:...|    10|
|    77|    0|hmm that would be...|2010-07-28T20:05:...|    10|
|    78|    0|oh really? hmm I ...|2010-07-28T20:35:...|    10|
|   396|    1|Where can I get a...|2010-07-29T14:55:...|    10|
|   405|    2|Huh I hadn't thou...|2010-07-29T17:10:...|    10|
|   159|    0|I had some basic ...|2010-07-29T17:20:...|    10|
|   331|    0|+5 Aw dang! Can't...|2010-07-29T17:54:...|    10|
|   327|    5|     community wiki?|2010-07-29T17:56:...|    10|
|   430|    0|@both: Agreed. Th...|2010-07-29T19:52:...|    10|
|   797|    0|I don't think it'...|2010-08-04T18:12:...|    10|
|   865|    4|Yeah I've been ve...|2010-

In [107]:
# Read every *.csv file under users_out/ as one RDD of text lines.
users_data = sc.textFile("gs://stackoverflow-dataset-677/users_out/*.csv")

In [108]:
# Peek at the first three lines to see the CSV layout.
users_data.take(3)

['1,Community', '2,Geoff Dalgas', '3,Jarrod Dixon']

In [109]:
def create_user(rdd):
    """
    Convert one ``id,username`` CSV line into ``[id, username]``.

    Despite its name, ``rdd`` is a single line of text (one element of the
    RDD), not an RDD itself. The line is split on the first comma only, so
    a username that itself contains commas is kept whole instead of being
    cut off at its first comma.

    NOTE(review): quoted CSV fields are not unquoted here — confirm how the
    users_out files were written if usernames may be quoted.

    Returns a two-element list: the id as ``int`` and the username as ``str``.

    Raises ``ValueError`` if the id part is not an integer, and
    ``IndexError`` if the line contains no comma.
    """
    # maxsplit=1: everything after the first comma belongs to the username.
    id_and_name = rdd.split(",", 1)
    return [int(id_and_name[0]), id_and_name[1]]

In [110]:
# Turn every CSV line into an [id, username] pair.
users_rdd = users_data.map(create_user)

In [111]:
# Sanity check: each element should now be an [int, str] pair.
users_rdd.take(3)

[[1, 'Community'], [2, 'Geoff Dalgas'], [3, 'Jarrod Dixon']]

In [112]:
# Column names for the user records, in the order create_user returns the values.
user_data = ["id","username"]
# Only names are supplied here, so the column types are inferred by Spark.
user_df = users_rdd.toDF(user_data)

In [113]:
# Inspect the inferred column types.
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)



In [114]:
# Preview the first rows of the users table.
user_df.show()

+---+----------------+
| id|        username|
+---+----------------+
|  1|       Community|
|  2|    Geoff Dalgas|
|  3|    Jarrod Dixon|
|  4|      txwikinger|
|  5|    Nathan Osman|
|  6|          Emmett|
|  7|           Helix|
|  8| mechanical_meat|
|  9|          Andrew|
| 10|             DLH|
| 11|   hannes.koller|
| 12|   Michael Terry|
| 13|   Keith Maurino|
| 14|          Jweede|
| 16|        Jeremy L|
| 17|          tutuca|
| 18|          excid3|
| 20|   ParanoiaPuppy|
| 21|            GeoD|
| 22|Alan Featherston|
+---+----------------+
only showing top 20 rows



In [115]:
# Register the DataFrame as the temporary view "users" so it can be queried with spark.sql.
user_df.createOrReplaceTempView("users")

In [116]:
# NOTE(review): repeats the printSchema() call from In [113]; user_df has not
# been reassigned in between, so this cell could be removed.
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)



In [117]:
# Look up a single user by id through the "users" view.
spark.sql("SELECT * FROM users where id=26").show()

+---+--------+
| id|username|
+---+--------+
| 26| njpatel|
+---+--------+



In [118]:
# Join each comment to its author on users.id = comments.userId.
# SELECT * keeps both key columns (id and userId), which hold the same value in every joined row.
comments_users_sql_df = spark.sql("SELECT * FROM users u JOIN comments c ON u.id = c.UserId")
# Preview the joined rows.
comments_users_sql_df.show()

+----+-------------------+------+-----+--------------------+--------------------+------+
|  id|           username|postId|score|                text|        creationDate|userId|
+----+-------------------+------+-----+--------------------+--------------------+------+
| 964|Hendrik Brummermann|  4602|    0|I can confirm thi...|2010-10-13T21:37:...|   964|
| 964|Hendrik Brummermann|118087|    0|They took it in d...|2012-04-28T06:17:...|   964|
| 964|Hendrik Brummermann|638027|    0|I have the same i...|2015-08-03T13:26:...|   964|
|1677|         eslambasha| 84949|    0|@fossfreedom i do...|2011-12-03T21:56:...|  1677|
|1697|           Frxstrem| 16683|    0|@Marco, I know, I...|2010-12-08T22:36:...|  1697|
|1697|           Frxstrem| 16784|    0|This seems to be ...|2010-12-09T19:05:...|  1697|
|1697|           Frxstrem| 16886|    1|I only want to di...|2010-12-10T22:26:...|  1697|
|1697|           Frxstrem| 16892|    1|This is not an ac...|2010-12-10T22:28:...|  1697|
|1697|           Frxs

In [None]:
# NOTE(review): this unexecuted cell (In [None]) repeats In [112] verbatim and
# rebuilds the same user_df; it looks like a leftover and is a candidate for deletion.
user_data = ["id","username"]
user_df = users_rdd.toDF(user_data)

In [119]:
# Register the joined result as the temporary view "comments_users" for later SQL queries.
comments_users_sql_df.createOrReplaceTempView("comments_users")

In [121]:
# NOTE(review): another printSchema() on user_df with the same output as In [113];
# likely left over from interactive exploration.
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- username: string (nullable = true)

