In [7]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Get (or reuse) a Spark session running locally under the app name "Examples".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Examples")
    .getOrCreate()
)

# Engine over the repositories stored under /repositories/test.
# NOTE(review): "siva" is presumably the on-disk repository format -- confirm
# against the Engine API.
engine = Engine(spark, "/repositories/test", "siva")

# Sanity check: how many repositories the engine can see.
repo_count = engine.repositories.count()
print("%d repositories successfully loaded" % repo_count)

7 repositories successfully loaded


In [2]:
# Seed handed to randomSplit when the train and test sets are created.
seed = 1234

## Repositories schema

In [52]:
# Print the column names, types and nullability of the repositories DataFrame.
engine.repositories.printSchema()

root
 |-- id: string (nullable = false)
 |-- urls: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- is_fork: boolean (nullable = true)
 |-- repository_path: string (nullable = true)



## References schema

In [53]:
# Print the schema of the references DataFrame reached through the repositories.
engine.repositories.references.printSchema()

root
 |-- repository_id: string (nullable = false)
 |-- name: string (nullable = false)
 |-- hash: string (nullable = false)
 |-- is_remote: boolean (nullable = false)



## Commits schema

In [54]:
# Print the schema of the commits DataFrame reached through the references.
engine.repositories.references.commits.printSchema()

root
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- index: integer (nullable = false)
 |-- hash: string (nullable = false)
 |-- message: string (nullable = false)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- parents_count: integer (nullable = false)
 |-- author_email: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- author_date: timestamp (nullable = true)
 |-- committer_email: string (nullable = true)
 |-- committer_name: string (nullable = true)
 |-- committer_date: timestamp (nullable = true)



## Blobs schema

In [55]:
# Print the schema of the blobs DataFrame reached through the commits.
engine.repositories.references.commits.blobs.printSchema()

root
 |-- blob_id: string (nullable = false)
 |-- commit_hash: string (nullable = false)
 |-- repository_id: string (nullable = false)
 |-- reference_name: string (nullable = false)
 |-- content: binary (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = false)



 ## Get branch with latest commit for each repository

In [31]:
# Commits of every remote branch (refs/heads/*, excluding refs/heads/HEAD).
# Cached because both the aggregation and the join below read it.
df = (
    engine.repositories.references
    .filter("is_remote = true")
    .filter("NOT name LIKE 'refs/heads/HEAD' AND name LIKE 'refs/heads/%'")
    .commits
    .cache()
)

# Newest committer_date per repository. `max` (and `min` below) are the Spark
# aggregate functions brought in by `from pyspark.sql.functions import *`,
# not the Python builtins. The key is renamed to `id` so the join result has
# no ambiguous column names.
latest_commit = (
    df.groupBy("repository_id")
    .agg(max("committer_date").alias("date"))
    .withColumnRenamed("repository_id", "id")
)

# Branch holding that newest commit, one row per repository.
# Several branches can share the newest commit date (e.g. two branches pointing
# at the same commit); without a tie-break the join would return every one of
# them and later joins on (repository, reference) would duplicate rows. The
# alphabetically first reference name is kept so the choice is deterministic.
latest_repo_ref = (
    df.select("repository_id", "reference_name", "committer_date")
    .join(
        latest_commit,
        (df.repository_id == latest_commit.id)
        & (df.committer_date == latest_commit.date),
    )
    .groupBy("repository_id")
    .agg(min("reference_name").alias("reference_name"))
    .cache()
)

print("First 10 repo branches:")
latest_repo_ref.show(10)

print("Repository count after join %d " % (latest_repo_ref.count()))

First 10 repo branches:
+--------------------+-----------------+
|       repository_id|   reference_name|
+--------------------+-----------------+
|github.com/AlexPa...|refs/heads/master|
|github.com/jmailh...|refs/heads/master|
|github.com/geekco...|refs/heads/master|
|github.com/dotfen...|refs/heads/master|
|github.com/bluepi...|refs/heads/master|
|github.com/antoni...|refs/heads/master|
|github.com/mingra...|refs/heads/master|
+--------------------+-----------------+

Repository count after join 7 


## Splitting train and test sets

We can use randomSplit over the repositories DataFrame to get the train and test sets. The same can be done later to get the train and validation sets depending on the specific cross-validation approach used. 

In [30]:
# Rename the key columns so they stay distinguishable from repository_id /
# reference_name when the split is joined with other DataFrames later on.
# A parenthesised chain is used instead of backslashes: the original ended with
# a dangling `\` that was only valid because the following line was blank.
data = (
    latest_repo_ref
    .withColumnRenamed("repository_id", "repo_id")
    .withColumnRenamed("reference_name", "ref_name")
)

# Random split with weights 0.8 / 0.2, seeded with `seed`.
train, test = data.randomSplit([0.8, 0.2], seed)

print("Total count %d || Train count %d || Test count %d" % (data.count(), train.count(), test.count()))

Total count 7 || Train count 4 || Test count 3


## Prepare train dataset with Python UASTs

First, select the Python blobs from which the UASTs will be extracted:

In [32]:
# Collect the repository ids and reference names selected above so the blob
# scan is narrowed before language classification (and, later, UAST
# extraction) runs on it.
select = latest_repo_ref.select("repository_id", "reference_name").collect()
repo_ids = {row.repository_id for row in select}
ref_names = {row.reference_name for row in select}

# Non-binary Python blobs for those repositories/references, one row per blob_id.
# Columns are referenced by name with col() instead of through extra `df.blobs`
# look-ups: each `df.blobs` access builds another DataFrame, and the filter
# should not depend on columns that belong to a sibling DataFrame instance.
# NOTE(review): the two isin() filters are independent, so a repository passes
# with ANY reference name in `ref_names`, not only its own latest branch, and
# dropDuplicates(['blob_id']) may then keep that row instead of the one for the
# latest branch. Consider filtering on the (repository_id, reference_name) pair.
python_blobs = (
    df.blobs
    .repartition(32)
    .filter(col("repository_id").isin(repo_ids))
    .filter(col("reference_name").isin(ref_names))
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .dropDuplicates(["blob_id"])
    .cache()
)

In [33]:
# Number of rows in python_blobs (the DataFrame was marked with cache() above).
python_blobs.count()

122

In [43]:
# Extract UASTs for the selected Python blobs. The raw `content` column is
# dropped once extraction has run, and the result is cached for the steps below.
python_uasts = (
    python_blobs
    .repartition(32)
    .extract_uasts()
    .drop("content")
    .cache()
)

In [44]:
# Number of rows in python_uasts.
python_uasts.count()

342

Join the `python_uasts` DataFrame with the `train` DataFrame to get the UASTs we want to use for training:

In [45]:
# Keep only the UAST rows whose (repository, reference) pair is in the train split.
same_repository = train.repo_id == python_uasts.repository_id
same_reference = train.ref_name == python_uasts.reference_name

train_python = train.join(python_uasts, same_repository & same_reference).cache()

In [46]:
# Number of rows in train_python, the join of train with python_uasts.
train_python.count()

314

In [47]:
# Print the schema of train_python; it carries the columns of both join inputs.
train_python.printSchema()

root
 |-- repo_id: string (nullable = false)
 |-- ref_name: string (nullable = false)
 |-- index: integer (nullable = false)
 |-- hash: string (nullable = false)
 |-- message: string (nullable = false)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- parents_count: integer (nullable = false)
 |-- author_email: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- author_date: timestamp (nullable = true)
 |-- committer_email: string (nullable = true)
 |-- committer_name: string (nullable = true)
 |-- committer_date: timestamp (nullable = true)
 |-- blob_id: string (nullable = true)
 |-- commit_hash: string (nullable = true)
 |-- repository_id: string (nullable = true)
 |-- reference_name: string (nullable = true)
 |-- is_binary: boolean (nullable = false)
 |-- path: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- uast: array (nullable = false)
 |    |-- element: binary (containsNull = true)

