# Topics Covered:

 - [Initialize Apache Spark](#Initialize-Apache-Spark)
 - [Blobs of HEAD for Python and Java](#Blobs-of-HEAD-for-Pyhton-and-Java)
 - [Read content of the files](#Read-content-of-the-files) 
 - [Extract UASTs](#Extract-UASTs)
 - [Visualize UASTs](#Visualize-UASTs)
 - [Manipulate UASTs](#Manipulate-UASTs)

# Initialize Apache Spark

In [20]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

# Build (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("workshop 2")
    .getOrCreate()
)

# Engine over the repositories stored under the given path in "siva" format.
engine = Engine(spark, "/repositories/siva/latest/", "siva")

# countApprox returns an approximate count; 1000 is presumably its timeout
# argument in milliseconds -- confirm against the PySpark documentation.
print(
    "%d repositories successfully loaded"
    % engine.repositories.rdd.countApprox(1000)
)

4 repositories successfully loaded


<a id="Blobs-of-HEAD-for-Pyhton-and-Java"></a>
# Blobs of HEAD for Python and Java

In [4]:
repos = engine.repositories

# References of repositories that are not forks. Column comparisons must be
# written with ``==``; Python's ``not`` does not work on Spark columns.
refs = repos.filter(repos.is_fork == False).references

# Starting from remote references: HEAD -> commits -> tree entries -> blobs,
# then attach the detected language of every blob.
langs = (
    refs.filter(refs.is_remote == True)
    .head_ref
    .commits
    .tree_entries
    .blobs
    .classify_languages()
)

# Keep non-binary Python and Java blobs only; cached for reuse in later cells.
head_blobs = (
    langs.filter(langs.is_binary == False)
    .filter(langs.lang.isin(["Python", "Java"]))
    .cache()
)

In [5]:
# Report how many blobs were selected, then show the columns of the DataFrame.
print("Number of blobs: {}".format(head_blobs.count()))
head_blobs.printSchema()

Number of blobs: 11
root
 |-- blob_id: string (nullable = true)
 |-- commit_hash: string (nullable = true)
 |-- repository_id: string (nullable = true)
 |-- reference_name: string (nullable = true)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = true)
 |-- lang: string (nullable = true)



# Read content of the files

In [7]:
# Preview each file: take a 37-unit prefix of the binary content, decode it as
# UTF-8 and replace carriage returns / newlines with spaces so that every
# preview stays on a single table row.
head_blobs.select(
    "repository_id",
    "path",
    fn.translate(
        fn.decode(fn.substring(head_blobs.content, 0, 37), "UTF-8"),
        "\r\n",
        "  ",
    ).alias("content"),
).show(10, False)

+-------------------------+--------------------------------------------------+-------------------------------------+
|repository_id            |path                                              |content                              |
+-------------------------+--------------------------------------------------+-------------------------------------+
|github.com/src-d/enry    |java/src/main/java/tech/sourced/enry/Enry.java    |package tech.sourced.enry;  import te|
|github.com/src-d/enry    |java/src/main/java/tech/sourced/enry/GoUtils.java |package tech.sourced.enry;  import co|
|github.com/src-d/enry    |java/src/main/java/tech/sourced/enry/Guess.java   |package tech.sourced.enry;  /**  * Gu|
|github.com/src-d/enry    |java/src/test/java/tech/sourced/enry/EnryTest.java|package tech.sourced.enry;  import or|
|github.com/src-d/hercules|contrib/_plugin_example/plot_churn.py             |import argparse from datetime import |
|github.com/src-d/hercules|fix_yaml_unicode.py                  

# Extract UASTs

In [8]:
# Add a ``uast`` column to the selected blobs and cache the result, since the
# following cells query it several times.
uasts = head_blobs.extract_uasts().cache()

In [9]:
# Show the schema of the DataFrame returned by extract_uasts().
uasts.printSchema()

root
 |-- blob_id: string (nullable = true)
 |-- commit_hash: string (nullable = true)
 |-- repository_id: string (nullable = true)
 |-- reference_name: string (nullable = true)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- uast: array (nullable = false)
 |    |-- element: binary (containsNull = true)



In [10]:
# Display up to 10 rows without truncating cell contents (second argument False).
uasts.select("repository_id", "path", "uast").show(10, False)

+-------------------------+--------------------------------------------------+-------------+
|repository_id            |path                                              |uast         |
+-------------------------+--------------------------------------------------+-------------+
|github.com/src-d/enry    |java/src/main/java/tech/sourced/enry/Enry.java    |[[B@1b1f1a73]|
|github.com/src-d/enry    |java/src/main/java/tech/sourced/enry/GoUtils.java |[[B@43ab1c46]|
|github.com/src-d/enry    |java/src/main/java/tech/sourced/enry/Guess.java   |[[B@4240d5bf]|
|github.com/src-d/enry    |java/src/test/java/tech/sourced/enry/EnryTest.java|[[B@c12084]  |
|github.com/src-d/hercules|contrib/_plugin_example/plot_churn.py             |[[B@636e7e6d]|
|github.com/src-d/hercules|fix_yaml_unicode.py                               |[[B@78bc076a]|
|github.com/src-d/hercules|labours.py                                        |[[B@3c95c8ef]|
|github.com/src-d/hercules|pb/__init__.py                             

# Visualize UASTs

In [15]:
from bblfsh import Node, filter as filter_uast, role_name

In [24]:
# Bring the ``uast`` column of a single row to the driver.
actual_uasts = uasts.select("uast").limit(1).collect()

# Decode the first serialized element of each collected array and print it.
for row in actual_uasts:
    print(Node.FromString(row.uast[0]))

internal_type: "CompilationUnit"
children {
  internal_type: "PackageDeclaration"
  properties {
    key: "internalRole"
    value: "package"
  }
  children {
    internal_type: "QualifiedName"
    properties {
      key: "internalRole"
      value: "name"
    }
    children {
      internal_type: "QualifiedName"
      properties {
        key: "internalRole"
        value: "qualifier"
      }
      children {
        internal_type: "SimpleName"
        properties {
          key: "internalRole"
          value: "qualifier"
        }
        token: "tech"
        start_position {
          offset: 8
          line: 1
          col: 9
        }
        end_position {
          offset: 12
          line: 1
          col: 13
        }
        roles: EXPRESSION
        roles: IDENTIFIER
      }
      children {
        internal_type: "SimpleName"
        properties {
          key: "internalRole"
          value: "name"
        }
        token: "sourced"
        start_position {
          

In [13]:
from typing import Any, Iterator, Tuple


def uast_roles(uast) -> Iterator[Tuple[int, Any]]:
    """Walk a UAST depth-first (pre-order) and yield ``(level, node)`` pairs.

    Args:
        uast: Root node of the tree. Every node must expose a sliceable
            ``children`` sequence.

    Yields:
        Tuples ``(level, node)`` where ``level`` is the depth of ``node``
        below the root (the root itself has level 0). A node is always
        yielded before its children, and siblings keep their original order.
    """
    stack = [(0, uast)]
    while stack:
        level, node = stack.pop()
        yield level, node
        # Children are pushed in reverse so that the first child ends up on
        # top of the stack and is therefore visited first.
        stack.extend((level + 1, child) for child in node.children[::-1])
        
def node2raw(node, internal_type=True):
    """Turn a UAST node into one table row (a list of strings).

    The row holds, in order: the start line (empty string when the line is
    0), the token wrapped in ``|`` (only its first 20 characters, with
    newlines shown escaped), the node's internal type (only when
    ``internal_type`` is true) and the comma-separated role names.
    """
    shown_token = node.token[:20].replace("\n", "\\n")
    token_cell = "|" + shown_token + "|"
    roles_cell = ", ".join(role_name(role) for role in node.roles)
    type_cell = node.internal_type
    start_line = node.start_position.line
    line_cell = "" if start_line == 0 else str(start_line)
    if internal_type:
        return [line_cell, token_cell, type_cell, roles_cell]
    return [line_cell, token_cell, roles_cell]

def rreplace(s, old, new, times):
    """Replace the last ``times`` occurrences of ``old`` in ``s`` with ``new``.

    The string is split from the right at most ``times`` times and the
    pieces are glued back together with ``new``. With ``times == 0`` no split
    happens, so ``s`` is returned unchanged.
    """
    return new.join(s.rsplit(old, maxsplit=times))

def print_table(table):
    """Print ``table`` (a list of rows of strings) as left-aligned columns.

    Every column is padded to the width of its longest cell, and columns are
    separated by two spaces.
    """
    widths = []
    for column in zip(*table):
        widths.append(max(map(len, column)))
    for row in table:
        padded = [cell.ljust(width) for cell, width in zip(row, widths)]
        print("  ".join(padded))

def print_uast_structure(uast, internal_type=True):
    """Print a UAST as a table whose last column draws the tree of roles.

    Args:
        uast: Root node of the tree to print.
        internal_type: When true, an extra "Internal Role" column is included.
    """
    if internal_type:
        header = ["#", "Token", "Internal Role", "Roles Tree"]
    else:
        header = ["#", "Token", "Roles Tree"]
    # Header row followed by an empty separator row.
    rows = [header, [""] * len(header)]

    previous_level = 0
    for level, node in uast_roles(uast):
        row = node2raw(node, internal_type)
        if level:
            # One vertical bar per ancestor below the root, then a branch mark.
            row[-1] = "┃ " * (level - 1) + "┣ " + row[-1]
        if level < previous_level:
            # The tree moved back up, so the previous row was the last one of
            # its subtree: turn its branch mark (and the bars of the levels
            # that were closed) into corner marks.
            closed = rows[-1][-1].replace("┣", "┗")
            rows[-1][-1] = rreplace(closed, "┃", "┗", previous_level - level - 1)
        rows.append(row)
        previous_level = level

    # The very last row closes every level that is still open.
    rows[-1][-1] = rows[-1][-1].replace("┣", "┗").replace("┃", "┗")
    print_table(rows)
    
#https://gist.github.com/zurk/f13aa816a7c3220cf700c98c72891ec3#file-vis_uast-py

In [14]:
# Decode the first UAST of one row and print it as a roles tree.
for row in uasts.select("uast").limit(1).collect():
    print_uast_structure(Node.FromString(row.uast[0]))

#    Token                   Internal Role                 Roles Tree                                                                 
                                                                                                                                      
1    ||                      CompilationUnit               FILE                                                                       
1    |package|               PackageDeclaration            ┣ DECLARATION, PACKAGE                                                     
1    ||                      QualifiedName                 ┃ ┣ EXPRESSION, IDENTIFIER, QUALIFIED                                      
1    ||                      QualifiedName                 ┃ ┃ ┣ EXPRESSION, IDENTIFIER, QUALIFIED                                    
1    |tech|                  SimpleName                    ┃ ┃ ┃ ┣ EXPRESSION, IDENTIFIER                                             
1    |sourced|               SimpleName                

# Manipulate UASTs

In [45]:
# Matches any node that carries both the Function and the Declaration role.
FUNC_XPATH = "//*[@roleFunction and @roleDeclaration]"
# Matches Function + Identifier + Name nodes at the first or second level of
# the queried node.
FUNC_NAME_XPATH = (
    "/*[@roleFunction and @roleIdentifier and @roleName] "
    "| /*/*[@roleFunction and @roleIdentifier and @roleName]"
)

def extract_functions_from_uast(uast):
    """Yield ``(node, name)`` for every outermost named function in ``uast``.

    Nodes matching ``FUNC_XPATH`` are collected first. Any match found while
    querying another match (other than that match itself) is treated as
    nested and skipped. The name is the ``+``-joined tokens of the nodes
    matching ``FUNC_NAME_XPATH``; functions with an empty name are skipped.

    Adapted from:
    https://github.com/src-d/ml/blob/24c3d750882fceb819d82aed55115e2d9fe3c348/sourced/ml/transformers/moder.py#L80
    """
    candidates = list(filter_uast(uast, FUNC_XPATH))

    # ids of the function nodes that live inside another function node.
    nested_ids = set()
    for candidate in candidates:
        if id(candidate) in nested_ids:
            continue
        nested_ids.update(
            id(inner)
            for inner in filter_uast(candidate, FUNC_XPATH)
            if inner != candidate
        )

    for candidate in candidates:
        if id(candidate) in nested_ids:
            continue
        name = "+".join(part.token for part in filter_uast(candidate, FUNC_NAME_XPATH))
        if name:
            yield candidate, name



In [47]:
# Print the name of every function found in the first collected UAST.
for _func, func_name in extract_functions_from_uast(Node.FromString(actual_uasts[0].uast[0])):
    print(func_name)

isAuxiliaryLanguage
getLanguage
getLanguageByContent
getLanguageByEmacsModeline
getLanguageByExtension
getLanguageByShebang
getLanguageByFilename
getLanguageByModeline
getLanguageByVimModeline
getLanguageExtensions
getLanguages
getMimeType
isBinary
isConfiguration
isDocumentation
isDotFile
isImage
isVendor
