In [118]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) the Spark session for this notebook, running on the
# "local[*]" master under the application name "Examples".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Examples")
    .getOrCreate()
)

# Create the engine over the repositories stored under /repositories.
engine = Engine(spark, "/repositories", "standard")

### Print the schema to show the metadata of the repositories

In [119]:
# Print the column names, types and nullability of the repositories DataFrame.
engine.repositories.printSchema()

root
 |-- id: string (nullable = false)
 |-- urls: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- is_fork: boolean (nullable = true)
 |-- repository_path: string (nullable = true)



### Show all repositories in /repositories

Even though there are 2 repositories, 4 are shown because of the remote and local references in git

In [120]:
# Display the repositories DataFrame as a text table.
engine.repositories.show()

+--------------------+--------------------+-------+--------------------+
|                  id|                urls|is_fork|     repository_path|
+--------------------+--------------------+-------+--------------------+
|github.com/keras-...|[https://github.c...|  false|file:/repositorie...|
|file:///repositor...|                  []|  false|file:/repositorie...|
|github.com/pandas...|[https://github.c...|  false|file:/repositorie...|
|file:///repositor...|                  []|  false|file:/repositorie...|
+--------------------+--------------------+-------+--------------------+



### Count the total number of non-fork repositories

In [121]:
# Keep the rows where is_fork is false and count their distinct ids.
(
    engine.repositories
    .filter("is_fork = false")
    .select("id")
    .distinct()
    .count()
)

4

### Print schema of blobs in all head commits

In [122]:
# Walk from the non-fork repositories through
# references -> head_ref -> commits -> first_reference_commit -> tree_entries -> blobs.
#
# printSchema() returns None, so chaining it onto the assignment would leave
# head_blobs bound to None. Bind the DataFrame first, then print its schema.
head_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .tree_entries
    .blobs
)
head_blobs.printSchema()

root
 |-- blob_id: string (nullable = false)
 |-- commit_hash: string (nullable = false)
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = false)



### Get the first 50 Python blobs from head commits

In [123]:
# Same navigation as before (non-fork repositories down to blobs), then:
#   1. classify each blob's language,
#   2. drop binary blobs and keep only those classified as Python,
#   3. extract the UASTs,
#   4. keep at most 50 rows and cache the result for the following cells.
head_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .tree_entries
    .blobs
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .extract_uasts()
    .limit(50)
    .cache()
)

### Get the schema

In [124]:
# Print the schema of the cached blobs; it now includes the `lang` and `uast` columns.
head_blobs.printSchema()

root
 |-- blob_id: string (nullable = false)
 |-- commit_hash: string (nullable = false)
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = false)
 |-- lang: string (nullable = true)
 |-- uast: array (nullable = true)
 |    |-- element: binary (containsNull = true)



### Print result

In [125]:
# Display up to 50 rows, matching the limit(50) applied when head_blobs was built.
head_blobs.show(50)

+--------------------+--------------------+--------------------+---------------+--------------------+---------+--------------------+------+-------------+
|             blob_id|         commit_hash|       repository_id| reference_name|             content|is_binary|                path|  lang|         uast|
+--------------------+--------------------+--------------------+---------------+--------------------+---------+--------------------+------+-------------+
|ad26818a5e4152dff...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/HEAD|[23 20 2D 2A 2D 2...|    false|     docs/autogen.py|Python|[[B@74b40f87]|
|727700bdfacb742da...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/HEAD|[23 20 2D 2A 2D 2...|    false|examples/addition...|Python|[[B@2735c04d]|
|ca92cdec58f03bb26...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/HEAD|[27 27 27 54 68 6...|    false|examples/antirect...|Python|[[B@11957f4c]|
|cbbccba2eba393bd9...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/H

### Or instead do a sampling of 5% of the blobs

In [126]:
# Instead of taking the first 50 rows, sample the Python blobs before
# extracting UASTs: sample(False, 0.05, 1024) is called with a 0.05 fraction
# and a fixed seed of 1024. The result is cached for the following cells.
head_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .tree_entries
    .blobs
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .sample(False, 0.05, 1024)
    .extract_uasts()
    .cache()
)

In [127]:
# Number of rows in the sampled DataFrame; `c` is reused by the next cell
# so that show() displays every sampled row.
c = head_blobs.count()
print(c)

70


In [128]:
# Display all `c` sampled rows.
head_blobs.show(c)

+--------------------+--------------------+--------------------+---------------+--------------------+---------+--------------------+------+-------------+
|             blob_id|         commit_hash|       repository_id| reference_name|             content|is_binary|                path|  lang|         uast|
+--------------------+--------------------+--------------------+---------------+--------------------+---------+--------------------+------+-------------+
|cda4620c016a1e49f...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/HEAD|[27 27 27 4D 4E 4...|    false|examples/mnist_da...|Python|[[B@1d0a2be6]|
|07f630a79f548b05e...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/HEAD|[27 27 27 54 68 6...|    false|examples/mnist_ir...|Python|[[B@508bd6ad]|
|c8c05bff393095edc...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/HEAD|[27 27 27 54 68 6...|    false|examples/mnist_ne...|Python| [[B@e350845]|
|4c4ad165a8b89b089...|beda1bc5132e6fef1...|github.com/keras-...|refs/heads/H

### Get identifier tokens from Python files

In [129]:
# Run an XPath query over the UASTs; the matches are added as a `result` column.
#
# printSchema() returns None, so chaining it onto the assignment would leave
# idents bound to None. Bind the DataFrame first, then print its schema.
idents = head_blobs.query_uast('//*[@roleIdentifier and not(@roleIncomplete)]')
idents.printSchema()

root
 |-- blob_id: string (nullable = false)
 |-- commit_hash: string (nullable = false)
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = false)
 |-- lang: string (nullable = true)
 |-- uast: array (nullable = true)
 |    |-- element: binary (containsNull = true)
 |-- result: array (nullable = true)
 |    |-- element: binary (containsNull = true)



In [130]:
# Query the UASTs, then keep only the distinct
# (repository_id, path, result) combinations.
idents = (
    head_blobs
    .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]')
    .select("repository_id", "path", "result")
    .distinct()
)

In [131]:
# Extract tokens from the query results and cache the DataFrame,
# since several of the following cells reuse it.
tokens = idents.extract_tokens().cache()

In [132]:
# Print the schema of the tokens DataFrame; it adds a `tokens` array-of-strings column.
tokens.printSchema()

root
 |-- repository_id: string (nullable = false)
 |-- path: string (nullable = false)
 |-- result: array (nullable = true)
 |    |-- element: binary (containsNull = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [133]:
# Display at most 10 rows of the tokens DataFrame.
tokens.limit(10).show()

+--------------------+--------------------+--------------------+--------------------+
|       repository_id|                path|              result|              tokens|
+--------------------+--------------------+--------------------+--------------------+
|file:///repositor...|  pandas/core/ops.py|[[B@4ee15ed5, [B@...|[__future__, divi...|
|file:///repositor...|pandas/tests/cate...|[[B@5f153d1d, [B@...|[pytest, pandas, ...|
|github.com/keras-...|examples/mnist_ne...|[[B@254bc42d, [B@...|[__future__, prin...|
|file:///repositor...|pandas/tests/io/t...|[[B@51ba5e58, [B@...|[numpy, np, numpy...|
|github.com/keras-...|tests/keras/regul...|[[B@174b13c4, [B@...|[pytest, keras.mo...|
|file:///repositor...|keras/legacy/__in...|                  []|                  []|
|file:///repositor...|pandas/tests/seri...|[[B@67476857, [B@...|[numpy, np, panda...|
|github.com/pandas...|pandas/io/formats...|                  []|                  []|
|file:///repositor...| pandas/io/pickle.py|[[B@637442f

### Filter blobs based on a specific token in the blob

In [134]:
# Show only the token lists that contain the token "print".
(
    tokens
    .select("tokens")
    .where(array_contains("tokens", "print"))
    .show()
)

+--------------------+
|              tokens|
+--------------------+
|[__future__, prin...|
|[__future__, prin...|
|[__future__, prin...|
|[__future__, prin...|
|[__future__, prin...|
|[__future__, prin...|
|[__future__, prin...|
|[sys, math, xml.e...|
|[numpy, np, os, t...|
+--------------------+



### Top languages by number of blobs

The result contains only Python because the blobs were filtered to Python above.

In [135]:
# Count distinct blobs per language and keep the ten most frequent languages.
#
# Two fixes relative to the previous form:
#   * show() returns None, so chaining it onto the assignment left
#     top_ten_langs bound to None; the DataFrame is now bound first and
#     displayed in a separate statement.
#   * the ascending orderBy("count") was immediately overridden by
#     sort(desc("count")), so only the descending sort is kept.
top_ten_langs = (
    head_blobs
    .distinct()
    .groupBy("lang")
    .agg(count("*").alias("count"))
    .sort(desc("count"))
    .limit(10)
)
top_ten_langs.show()

+------+-----+
|  lang|count|
+------+-----+
|Python|   70|
+------+-----+

