In [90]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, dayofmonth


In [91]:
# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# running against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName('github notebook')
    .getOrCreate()
)

In [92]:
# Bare expression: lets the notebook display the session object's repr.
spark

In [105]:
# Read every file matching the glob as JSON into a single DataFrame;
# no schema is supplied, so Spark derives it from the data.
df = spark.read.json("../ghfile/*.json.gz")

In [94]:
# Print the schema tree of the loaded events (nested structs included).
df.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nul

In [106]:
# List the top-level column names of the loaded DataFrame.
df.columns

['actor', 'created_at', 'id', 'org', 'payload', 'public', 'repo', 'type']

In [107]:
# Preview the first rows; show() truncates long cell values by default.
df.show()

+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+-----------------+
|               actor|          created_at|        id|                 org|             payload|public|                repo|             type|
+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+-----------------+
|[https://avatars....|2018-07-24T16:00:00Z|8011174342|                null|[, 3d02b5ff979cea...|  true|[142025706, gameb...|        PushEvent|
|[https://avatars....|2018-07-24T16:00:00Z|8011174355|[https://avatars....|[created,, [, CON...|  true|[10941409, mesosp...|IssueCommentEvent|
|[https://avatars....|2018-07-24T16:00:00Z|8011174362|                null|[, d8a47b6700dd02...|  true|[142157684, guiCa...|        PushEvent|
|[https://avatars....|2018-07-24T16:00:00Z|8011174365|                null|[, 325a4f61c4d8e8...|  true|[135834980, Rtcas...|        PushEvent|

In [108]:
# Expand the nested `repo` struct into its own columns and preview them.
(
    df
    .select("repo.*")
    .show()
)

+---------+--------------------+--------------------+
|       id|                name|                 url|
+---------+--------------------+--------------------+
|142025706|gamebasedmarketin...|https://api.githu...|
| 10941409| mesosphere/marathon|https://api.githu...|
|142157684|     guiCarlier/akka|https://api.githu...|
|135834980|        Rtcash/Mercy|https://api.githu...|
|142036030|art123craft/learn...|https://api.githu...|
| 81668836|iotile/ng-iotile-...|https://api.githu...|
| 86505380|   davidpatrick/docs|https://api.githu...|
|142181923|      PadsCraft/Sova|https://api.githu...|
| 64603480|EngForDev/awesome...|https://api.githu...|
|142179091|serendipidoussoph...|https://api.githu...|
|138676186|      himobi/hotspot|https://api.githu...|
|142169764|       shaly/config1|https://api.githu...|
| 25623942|      apache/syncope|https://api.githu...|
|135091897|AgataDziubala/Git...|https://api.githu...|
|139280926|benkerns09/GolfPl...|https://api.githu...|
|  8859285|wordpress-mobile/

In [13]:
# Preview the frame with a `year` column derived from `created_at`
# (nothing is assigned; `df` itself is left unchanged).
(
    df
    .withColumn("year", year("created_at"))
    .show()
)

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+----+
|               actor|          created_at|         id|                 org|             payload|public|                repo|             type|year|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+----+
|[https://avatars....|2022-09-26T05:00:00Z|24214570454|                null|[, f52f75fea97fd5...|  true|[395936674, Hacke...|        PushEvent|2022|
|[https://avatars....|2022-09-26T05:00:00Z|24214570455|                null|[, a4d7fa423f387c...|  true|[539200849, ofaru...|        PushEvent|2022|
|[https://avatars....|2022-09-26T05:00:00Z|24214570470|                null|[opened,,,,,,,, [...|  true|[277422872, pddem...|      IssuesEvent|2022|
|[https://avatars....|2022-09-26T05:00:00Z|24214570473|                null|[opened,,,,,,,, [...|  true|[5

In [109]:
# Total number of rows loaded from the input files.
df.count()

644564

In [110]:
# Add year / month / day columns derived from `created_at`; they are used
# as partition keys when the frame is written out.
df1 = (
    df
    .withColumn("year", year("created_at"))
    .withColumn("month", month("created_at"))
    .withColumn("day", dayofmonth("created_at"))
)


In [111]:
# Persist the enriched events as parquet under "output", partitioned by
# year / month / day.
# mode("overwrite") makes this cell re-runnable: the default save mode raises
# when the target path already exists, so a second execution (or Restart &
# Run All) would fail here. NOTE: this replaces whatever is already stored
# under "output".
(
    df1.write
    .mode("overwrite")
    .partitionBy("year", "month", "day")
    .parquet("output")
)

In [None]:
# Scratch cell retired: it held only the bare name `partitionBy`, which is not
# defined in the visible cells and would raise NameError under Run All,
# stopping the notebook before the parquet read-back cells.
# For details on the method, see the DataFrameWriter.partitionBy documentation.

In [112]:
# Read the partitioned parquet dataset back from the "output" directory.
df3 = spark.read.format("parquet").load('output')

In [113]:
# Print the schema of the DataFrame read back from parquet.
df3.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nul

In [114]:
# Preview the round-tripped rows, including the year / month / day columns.
df3.show()

+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+-----------------+----+-----+---+
|               actor|          created_at|        id|                 org|             payload|public|                repo|             type|year|month|day|
+--------------------+--------------------+----------+--------------------+--------------------+------+--------------------+-----------------+----+-----+---+
|[https://avatars....|2018-07-24T16:00:00Z|8011174342|                null|[, 3d02b5ff979cea...|  true|[142025706, gameb...|        PushEvent|2018|    7| 24|
|[https://avatars....|2018-07-24T16:00:00Z|8011174355|[https://avatars....|[created,, [, CON...|  true|[10941409, mesosp...|IssueCommentEvent|2018|    7| 24|
|[https://avatars....|2018-07-24T16:00:00Z|8011174362|                null|[, d8a47b6700dd02...|  true|[142157684, guiCa...|        PushEvent|2018|    7| 24|
|[https://avatars....|2018-07-24T16:00:00Z|801117436

In [88]:
# Stop the Spark session once the notebook is finished with it.
spark.stop()