In [1]:
# Shell escape: list everything under /repositories to see which .siva files are available.
! find /repositories

/repositories
/repositories/siva
/repositories/siva/latest
/repositories/siva/latest/5d
/repositories/siva/latest/5d/5d7303c49ac984a9fec60523f2d5297682e16646.siva
/repositories/siva/latest/65
/repositories/siva/latest/65/65c397a8673c0f4b98e3867e5fd6efdaa7d9ccd2.siva
/repositories/siva/latest/6b
/repositories/siva/latest/6b/6bc52531e707eb4b9b875c418a84f2e100ff6e73.siva
/repositories/siva/latest/cc
/repositories/siva/latest/cc/cce947b98a050c6d356bc6ba95030254914027b1.siva


In [27]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Row

In [25]:
# Create (or reuse) the Spark session named "workshop" with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("workshop")
    .getOrCreate()
)

# Bind the engine to the directory that holds the .siva files.
# NOTE(review): the third argument is presumably the repository format -- confirm
# against the sourced.engine documentation.
engine = Engine(
    spark,
    "/repositories/siva/latest/",
    "siva",
)

In [6]:
# Count the rows of the engine's repositories DataFrame.
engine.repositories.count()


4

In [7]:
# Print up to 10 distinct repository ids, without truncating long values.
(
    engine.repositories
    .select("id")
    .distinct()
    .show(n=10, truncate=False)
)


+--------------------------+
|id                        |
+--------------------------+
|github.com/src-d/go-git   |
|github.com/src-d/hercules |
|github.com/src-d/enry     |
|github.com/src-d/go-kallax|
+--------------------------+



In [8]:
# Print the head_ref references of the repositories (default show(): long values are truncated).
engine.repositories.references.head_ref.show()

+--------------------+---------------+--------------------+---------+
|       repository_id|           name|                hash|is_remote|
+--------------------+---------------+--------------------+---------+
|github.com/src-d/...|refs/heads/HEAD|98916b85c6fe08f2b...|     true|
|github.com/src-d/...|refs/heads/HEAD|2a161296e79cc1c98...|     true|
|github.com/src-d/...|refs/heads/HEAD|0db3b4b5536e6dc4d...|     true|
|github.com/src-d/...|refs/heads/HEAD|014493bed229e27d8...|     true|
+--------------------+---------------+--------------------+---------+



In [9]:
# Same head_ref references, reduced to repository id and hash and printed in full.
(
    engine.repositories.references.head_ref
    .select("repository_id", "hash")
    .show(n=10, truncate=False)
)


+--------------------------+----------------------------------------+
|repository_id             |hash                                    |
+--------------------------+----------------------------------------+
|github.com/src-d/go-git   |98916b85c6fe08f2be5a235db43957d493ba37b9|
|github.com/src-d/go-kallax|2a161296e79cc1c98a5dc303deecc223abb482e5|
|github.com/src-d/enry     |0db3b4b5536e6dc4d9109d42897c00a5d92af0a7|
|github.com/src-d/hercules |014493bed229e27d8a18b8d104e9ac062ef799e1|
+--------------------------+----------------------------------------+



In [11]:
# Build the chain step by step so that each intermediate DataFrame stays available
# to later cells under its own name.
repos = engine.repositories
# Only the head_ref references of those repositories.
head_refs = repos.references.head_ref
# Tree entries of the commits obtained from those references.
tree_entries = head_refs.commits.tree_entries

In [22]:
# Keep only the tree entries whose path matches the SQL LIKE pattern "%.md".
md = tree_entries.filter(
    tree_entries["path"].like("%.md")
)

In [18]:
# Number of tree entries that matched the "%.md" filter.
md.count()

18

In [20]:
# Inspect the columns available on the filtered tree entries.
md.printSchema()

root
 |-- commit_hash: string (nullable = false)
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- path: string (nullable = false)
 |-- blob: string (nullable = false)



In [21]:
# Print up to 20 matching entries in full: repository, path and commit hash.
(
    md
    .select("repository_id", "path", "commit_hash")
    .show(n=20, truncate=False)
)

+--------------------------+-----------------------------------------+----------------------------------------+
|repository_id             |path                                     |commit_hash                             |
+--------------------------+-----------------------------------------+----------------------------------------+
|github.com/src-d/go-git   |CODE_OF_CONDUCT.md                       |98916b85c6fe08f2be5a235db43957d493ba37b9|
|github.com/src-d/go-git   |COMPATIBILITY.md                         |98916b85c6fe08f2be5a235db43957d493ba37b9|
|github.com/src-d/go-git   |CONTRIBUTING.md                          |98916b85c6fe08f2be5a235db43957d493ba37b9|
|github.com/src-d/go-git   |README.md                                |98916b85c6fe08f2be5a235db43957d493ba37b9|
|github.com/src-d/go-git   |_examples/README.md                      |98916b85c6fe08f2be5a235db43957d493ba37b9|
|github.com/src-d/go-git   |_examples/storage/README.md              |98916b85c6fe08f2be5a235db43957d493

In [26]:
# https://spark.apache.org/docs/latest/sql-programming-guide.html
# Spark SQL programming guide: reference for the DataFrame operations used in this notebook.

In [41]:
import os
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def _split_extension(path):
    """Split ``path`` into ``(root, extension)`` with ``os.path.splitext``.

    Returns ``(None, None)`` when ``path`` is None, so that a null column value
    yields a null result instead of raising ``TypeError`` inside the Spark
    worker. The blobs DataFrame these UDFs are applied to declares ``path``
    as nullable.
    """
    if path is None:
        return (None, None)
    return os.path.splitext(path)


# Path without its extension. Any directory part is kept
# (e.g. "_examples/README.md" -> "_examples/README").
filename_udf = udf(lambda path: _split_extension(path)[0], StringType())
# Extension including the leading dot; empty string when the path has none
# (e.g. "Makefile" or ".gitignore").
extension_udf = udf(lambda path: _split_extension(path)[1], StringType())


In [46]:
# For every tree entry keep the path and derive two columns from it:
# the path without its extension ("filename") and the extension itself.
ext = (
    tree_entries
    .select("path")
    .withColumn("filename", filename_udf(tree_entries["path"]))
    .withColumn("extension", extension_udf(tree_entries["path"]))
)

In [47]:
# Preview the derived columns (default show(): long values are truncated).
ext.show()

+--------------------+--------------------+---------+
|                path|            filename|extension|
+--------------------+--------------------+---------+
|          .gitignore|          .gitignore|         |
|         .travis.yml|             .travis|     .yml|
|  CODE_OF_CONDUCT.md|     CODE_OF_CONDUCT|      .md|
|    COMPATIBILITY.md|       COMPATIBILITY|      .md|
|     CONTRIBUTING.md|        CONTRIBUTING|      .md|
|                 DCO|                 DCO|         |
|             LICENSE|             LICENSE|         |
|         MAINTAINERS|         MAINTAINERS|         |
|            Makefile|            Makefile|         |
|           README.md|              README|      .md|
| _examples/README.md|    _examples/README|      .md|
|_examples/branch/...|_examples/branch/...|      .go|
|_examples/checkou...|_examples/checkou...|      .go|
|_examples/clone/m...|_examples/clone/main|      .go|
|_examples/commit/...|_examples/commit/...|      .go|
| _examples/common.go|    _e

In [50]:
# Number of files per extension, most frequent extension first.
filesGroupByExt = (
    ext
    .groupBy("extension")
    .count()
    .orderBy("count", ascending=False)
)

In [53]:
# Print up to 25 rows of the per-extension file counts.
filesGroupByExt.show(25)

+-----------+-----+
|  extension|count|
+-----------+-----+
|        .go|  468|
|           |   27|
|        .md|   18|
|       .png|   11|
|      .tmpl|   11|
|      .gold|   11|
|       .yml|    7|
|       .csv|    6|
|      .java|    6|
|       .tgo|    5|
|        .py|    5|
|        .pb|    4|
|        .sh|    3|
|       .dot|    3|
|       .txt|    2|
|      .bash|    2|
|     .proto|    2|
|        .rb|    2|
|       .sbt|    2|
|.properties|    1|
|       .svg|    1|
|        .gp|    1|
|  .template|    1|
|       .enc|    1|
+-----------+-----+



In [57]:
# Inspect the columns produced by fetching the blobs and classifying their language.
(
    tree_entries.blobs
    .classify_languages()
    .printSchema()
)

root
 |-- blob_id: string (nullable = true)
 |-- commit_hash: string (nullable = true)
 |-- repository_id: string (nullable = true)
 |-- reference_name: string (nullable = true)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = true)
 |-- lang: string (nullable = true)



In [67]:
# Blobs of the tree entries with the detected language in the "lang" column.
langs = tree_entries.blobs.classify_languages()

# Number of files per (extension, detected language) pair, most frequent first.
# Only the extension is derived from the path: a "filename" column would be
# dropped by the groupBy below, so computing it through a Python UDF is wasted work.
extByLang = (
    langs
    .select("path", "lang")
    .withColumn("extension", extension_udf(langs.path))
    .groupBy("extension", "lang")
    .count()
    .orderBy(desc("count"))
)

In [68]:
# Number of distinct (extension, language) pairs.
extByLang.count()

29

In [70]:
# Print up to 30 (extension, language) pairs, truncating long values.
extByLang.show(n=30, truncate=True)

+-----------+---------------+-----+
|  extension|           lang|count|
+-----------+---------------+-----+
|        .go|             Go|  468|
|        .md|       Markdown|   18|
|           |           null|   15|
|      .tmpl|           null|   11|
|       .png|           null|   11|
|      .gold|           null|   11|
|       .yml|           YAML|    7|
|       .csv|            CSV|    6|
|           |       Makefile|    6|
|      .java|           Java|    6|
|        .py|         Python|    5|
|       .tgo|           null|    5|
|           |           Text|    4|
|        .sh|          Shell|    3|
|       .dot| Graphviz (DOT)|    3|
|     .proto|Protocol Buffer|    2|
|       .sbt|          Scala|    2|
|      .bash|          Shell|    2|
|        .pb|           null|    2|
|        .pb|      PureBasic|    2|
|       .txt|           Text|    2|
|        .rb|           Ruby|    2|
|           |          Shell|    1|
|           |     Dockerfile|    1|
|.properties|            INI