In [1]:
import findspark
findspark.init()  # Auto-detects SPARK_HOME

from pprint import pprint
import json
from typing import List
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark import SparkContext

In [2]:
# Create (or reuse) a local Spark session that may use every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("PySpark RDD Assignment")
    .getOrCreate()
)
sc = spark.sparkContext

# The input is multiline JSON, so it is loaded through the DataFrame reader
# first and converted to an RDD in a later cell.
commits_df = (
    spark.read
    .option("multiline", "true")
    .json("../data/data_raw.json")
)

# Display the schema Spark inferred for the commit records.
commits_df.printSchema()

25/02/06 23:51:23 WARN Utils: Your hostname, cbakos-laptop resolves to a loopback address: 127.0.1.1; using 192.168.178.143 instead (on interface wlp0s20f3)
25/02/06 23:51:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 23:51:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- author: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- site_admin: boolean (nullable = true)
 |    |-- type: string (nullable = true)
 |-- commit: struct (nullable = true)
 |    |-- author: struct (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- comment_count: long (nullable = true)
 |    |-- committer: struct (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- message: string (nullable = true)
 |    |-- tree: struct (nullable = true)
 |    |    |-- sha: string (nullable = true)
 |    |-- verification: struct (nullable = true)
 |    |    |-- payload: string (nu

In [3]:
# Serialise each DataFrame row to a JSON string, giving an RDD of strings.
json_rdd = commits_df.toJSON()

# Decode every JSON string into a plain Python dict; all assignments below
# operate on this RDD of dicts.
commit_rdd = json_rdd.map(json.loads)

# Show one record to get a feel for the structure.
pprint(commit_rdd.first())

[Stage 1:>                                                          (0 + 1) / 1]

{'_id': {'$oid': '5ce691346480fd1565a33621'},
 'author': {'avatar_url': 'https://avatars2.githubusercontent.com/u/691258?v=4',
            'id': 691258,
            'login': 'emericg',
            'site_admin': False,
            'type': 'User'},
 'commit': {'author': {'date': '2019-05-23T12:19:58.000Z',
                       'email': 'emeric.grange@gmail.com',
                       'name': 'Emeric'},
            'comment_count': 0,
            'committer': {'date': '2019-05-23T12:19:58.000Z',
                          'email': 'emeric.grange@gmail.com',
                          'name': 'Emeric'},
            'message': 'Minor UI improvements',
            'tree': {'sha': '79cb4c492cc6205cd6c90c76a2e777366547f875'},
            'verification': {'reason': 'unsigned', 'verified': False}},
 'committer': {'avatar_url': 'https://avatars2.githubusercontent.com/u/691258?v=4',
               'id': 691258,
               'login': 'emericg',
               'site_admin': False,
               

                                                                                

### Assignment 1
Reductions are often used in data processing in order to gather more useful data out of raw data. 

In this case we want to know how many commits a given RDD contains.

@param commits RDD containing commit data.

@return Long indicating the number of commits in the given RDD.


In [4]:
def assignment_1(commit_rdd):
    """Return how many commits the given RDD contains.

    :param commit_rdd: RDD with one element per commit.
    :return: the number of elements, as reported by ``commit_rdd.count()``.
    """
    total_commits = commit_rdd.count()
    return total_commits

In [5]:
# test: the commit RDD built above is expected to hold exactly 10000 commits
result = assignment_1(commit_rdd)
assert result == 10000

                                                                                

### Assignment 2
We want to know how often programming languages are used in committed files. We require a RDD containing Tuples of the used file extension, combined with the number of occurrences. If no filename or file extension is used we assume the language to be 'unknown'.

@param commits RDD containing commit data.

@return RDD containing tuples indicating the programming language (extension) and number of occurrences.

In [6]:
def assignment_2(commit_rdd):
    """Count how often each file extension occurs in the committed files.

    A file contributes to ``'unknown'`` when it has no filename, or when the
    last path component of its filename has no (non-empty) extension.

    :param commit_rdd: RDD of commit dicts; each commit may carry a ``'files'``
        list whose entries may carry a ``'filename'``.
    :return: RDD of ``(extension, occurrences)`` tuples.
    """
    def _extension(file_entry):
        # A file entry may have no filename; the assignment maps that case
        # to 'unknown' instead of failing.
        filename = file_entry.get('filename') or ""
        # Only the last path component can carry the extension, so a dot in
        # a directory name (e.g. "v1.2/README") must not be mistaken for one.
        basename = filename.rsplit("/", 1)[-1]
        _, dot, extension = basename.rpartition(".")
        # No dot at all, or nothing after the last dot -> no extension.
        return extension if dot and extension else "unknown"

    result_rdd = (commit_rdd
                  .flatMap(lambda commit: commit.get('files') or [])       # one element per committed file
                  .map(lambda file_entry: (_extension(file_entry), 1))     # (extension, 1) pairs
                  .reduceByKey(lambda v1, v2: v1 + v2)                     # count per extension
                  )
    return result_rdd

In [7]:
# test: spot-check two extension counts against the computed result
result_rdd = assignment_2(commit_rdd)
result_set = set(result_rdd.collect())
expected = {("cmake", 13), ("png", 4467)}
assert expected <= result_set, f"Expected {expected} not found in results"

                                                                                

### Assignment 3

Competitive users on GitHub might be interested in their ranking by number of commits. We require as return value an RDD containing tuples of the rank (zero-indexed) of a commit author, the commit author's name and the number of commits made by that author. As is usual with performance rankings, a higher performance means a better ranking (0 = best). In case of a tie, the lexicographical ordering of the usernames should be used to break the tie.

@param commits RDD containing commit data.

@return RDD containing commit author names and total count of commits done by the author, in ordered fashion.

In [8]:
def assignment_3(commit_rdd):
    """Rank commit authors by their number of commits.

    Rank 0 goes to the author with the most commits; authors with equal
    counts are ordered lexicographically by name.

    :param commit_rdd: RDD of commit dicts providing
        ``commit['commit']['author']['name']``.
    :return: RDD of ``(rank, author_name, commit_count)`` tuples in rank order.
    """
    commits_per_author = (commit_rdd
                          .map(lambda commit_data:
                               (commit_data['commit']['author']['name'], 1))  # one (name, 1) pair per commit
                          .reduceByKey(lambda v1, v2: v1 + v2)                # commits per author
                          )
    result_rdd = (commits_per_author
                  # Sort once on a composite key: descending count, then
                  # ascending name. Chaining two sortBy calls would rely on
                  # the second sort preserving the first one's order across
                  # a shuffle, which is not guaranteed.
                  .sortBy(lambda pair: (-pair[1], pair[0]))
                  .zipWithIndex()                                  # attach zero-based rank
                  .map(lambda x: (x[1], x[0][0], x[0][1]))         # (rank, name, count)
                  )
    return result_rdd

In [9]:
# test: a few known (rank, author, count) rows must appear among the top 96
result_rdd = assignment_3(commit_rdd)
top_results = result_rdd.take(96)
result_set = set(top_results)
expected = {(8, "Otto Taute", 34), (9, "SpeedTracker", 33), (85, "John", 8)}
assert expected <= result_set, "Expected subset not found in top results"

                                                                                

### Assignment 4
Some users are interested in seeing an overall contribution of all their work. For this exercise we require an RDD that contains the committer name and the total of their commits. As stats are optional, missing Stat cases should be handled as `Stat(0, 0, 0)`. If a user is given that is not in the dataset, then the username should not occur in the returned RDD.

@param commits RDD containing commit data.

@return RDD containing committer names and an aggregation of the committers Stats.


In [10]:
def assignment_4(commit_rdd, users_list: List[str]):
    """Aggregate commit stats per author for a selection of users.

    Commits without a ``'stats'`` entry (or with missing stat fields) count
    as zero. Users from ``users_list`` that never occur as commit author do
    not appear in the result.

    :param commit_rdd: RDD of commit dicts providing
        ``commit['commit']['author']['name']`` and optionally ``commit['stats']``.
    :param users_list: names of the users to report on.
    :return: RDD with one dict per matching user, shaped
        ``{name: {"total": ..., "additions": ..., "deletions": ...}}``.
    """
    # Hash-based membership test instead of scanning the list per commit.
    wanted_users = frozenset(users_list)

    def _name_and_stats(c_data):
        stats = c_data.get('stats') or {}   # stats are optional
        return (c_data['commit']['author']['name'],
                (stats.get('total', 0),
                 stats.get('additions', 0),
                 stats.get('deletions', 0)))

    result_rdd = (commit_rdd
                  # Drop uninteresting commits before building stat tuples.
                  .filter(lambda c_data: c_data['commit']['author']['name'] in wanted_users)
                  .map(_name_and_stats)                                                    # (name, (total, additions, deletions))
                  .reduceByKey(lambda v1, v2: (v1[0] + v2[0], v1[1] + v2[1], v1[2] + v2[2]))  # sum per user
                  .map(lambda x: {x[0]: {"total": x[1][0],
                                         "additions": x[1][1],
                                         "deletions": x[1][2]}})                           # formatting
                  )
    return result_rdd

In [11]:
# test: aggregated stats for one known author must match the expected totals
result_rdd = assignment_4(commit_rdd, ["Otto Taute"])
results = result_rdd.collect()
expected_stats = {'total': 724, 'additions': 703, 'deletions': 21}
expected_res = {"Otto Taute": expected_stats}
# only one user was requested, so the first (and only) row is compared
assert expected_res == results[0], "Stats for Otto Taute do not match"

                                                                                

### Assignment 5
There are different types of people, those who own repositories, and those who make commits. Although Git blame is excellent in finding these types of people, we want to do it in Spark. We require as output an RDD containing the names of commit authors and repository owners that have either exclusively committed to repositories, or exclusively own repositories in the given RDD. Note that the repository owner is contained within Github urls.

@param commits RDD containing commit data.

@return RDD of Strings representing the username that have either only committed to repositories or only own repositories 

Note: consider comparisons only between the commit/author/name and the repo owner username found in the url.

In [12]:
def assignment_5(commit_rdd):
    """Find names that occur only as commit author or only as repository owner.

    The commit author is ``commit['commit']['author']['name']``; the repository
    owner is the path segment at index 4 of ``commit['url']`` split on ``"/"``.
    Each name is returned once.

    :param commit_rdd: RDD of commit dicts providing the two fields above.
    :return: RDD of strings: authors that own no repository in the data,
        followed by owners that authored no commit in the data.
    """
    # Deduplicate first so that every name appears once in the result rather
    # than once per commit.
    authors = commit_rdd.map(lambda c_data: c_data['commit']['author']['name']).distinct()
    owners = commit_rdd.map(lambda c_data: c_data['url'].split("/")[4]).distinct()

    # Symmetric difference, expressed with RDD operations only: nothing is
    # collected to the driver and no global SparkContext is needed.
    only_authors = authors.subtract(owners)
    only_owners = owners.subtract(authors)

    return only_authors.union(only_owners)

In [13]:
# test: a few known names must be part of the result
result_rdd = assignment_5(commit_rdd)
results = set(result_rdd.collect())

expected_subset = {"EdenStrive", "Paul Blackmore", "nextgis"}

assert expected_subset.issubset(results), "expected subset not in result"


                                                                                

### Assignment 6

Sometimes developers make mistakes, sometimes they make many. One way of observing mistakes in commits is by looking at so-called revert commits. We define a 'revert streak' as the number of times `Revert` occurs in a commit message. Note that for a commit to be eligible for a 'revert streak', its message must start with `Revert`.

As an example: `Revert "Revert ...` would be a revert streak of 2, whilst `Oops, Revert Revert little mistake` would not be a 'revert streak' at all.

We require as return value an RDD containing tuples of the username of a commit author and a tuple containing the length of the longest revert streak of that user and how often said streak has occurred. Note that we are only interested in the longest revert streak of each author (and its frequency).

 @param commits RDD containing commit data.

 @return RDD of Tuple type containing a commit author username, and a tuple containing the length of the longest revert streak as well as its frequency

In [14]:
def assignment_6(commit_rdd):
    """Find each author's longest revert streak and how often it occurred.

    Only commits whose message starts with ``"Revert "`` are considered; the
    streak length of such a commit is the number of occurrences of
    ``"Revert "`` in its message.

    :param commit_rdd: RDD of commit dicts providing
        ``commit['commit']['author']['name']`` and ``commit['commit']['message']``.
    :return: RDD of ``(author_name, (longest_streak, frequency))`` tuples, one
        per author with at least one eligible commit.
    """
    result_rdd = (commit_rdd.map(lambda c_data:
                                 (c_data['commit']['author']['name'],           # keep relevant data: author name, commit message
                                  c_data['commit']['message']))
                            .filter(lambda x: x[1].startswith("Revert "))       # only messages that start with "Revert "
                            .map(lambda x: ((x[0], x[1].count("Revert ")), 1))  # ((name, streak length), 1)
                            .reduceByKey(lambda v1, v2: v1 + v2)                # ((name, streak length), frequency)
                            # Re-key by author only. Keying by (name, frequency)
                            # would leave several rows for an author whose
                            # streak lengths occur with different frequencies.
                            .map(lambda x: (x[0][0], (x[0][1], x[1])))          # (name, (streak length, frequency))
                            # Tuples compare on streak length first; lengths are
                            # unique per author here, so max() keeps the longest
                            # streak together with its own frequency.
                            .reduceByKey(max)
                 )
    return result_rdd



In [15]:
# test: the full result must equal the known set of (author, (streak, frequency))
result = assignment_6(commit_rdd)
result_set = set(result.collect())
expected_set = {
    ("Oscar Näzell", (2, 1)),
    ("DMEdesignmyeye", (2, 1)),
    ("zebbykhairah", (1, 1)),
    ("sayan7848", (1, 4)),
    ("Max Stewart", (1, 1)),
    ("Yehuda Alkalay", (1, 1)),
    ("Aleksander Andresen", (1, 1)),
    ("joshuous", (1, 2)),
    ("jalalirs", (1, 2)),
    ("steverendell", (1, 1)),
    ('Aurel Bílý', (1, 1)),
    ('David Bürgin', (1, 1)),
    ('James Santucci', (1, 1)),
    ('AzzyC', (1, 1)),
    ('hajeklu', (1, 1)),
    ('miraclestars', (1, 1)),
    ('François Kooman', (1, 1)),
}

assert expected_set == result_set, "expected set does not match result set"

                                                                                

### Assignment 7
We want to know the number of commits that are made to each repository contained in the given RDD. Besides the number of commits, we also want to know the unique committers that contributed to the repository. 

Note that from this exercise on, expensive functions like groupBy are no longer allowed to be used. In real life these wide dependency functions are performance killers, but luckily there are better performing alternatives! The automatic graders will check the computation history of the returned RDD's.

@param commits RDD containing commit data.

@return RDD containing a tuple indicating the repository name, the number of commits made to the repository as well as the unique committer usernames that committed to the repository.

In [16]:
def assignment_7(commit_rdd):
    """Report, per repository name, the commit count and the distinct authors.

    The repository name is the path segment at index 5 of ``commit['url']``
    split on ``"/"``; the author is ``commit['commit']['author']['name']``.

    :param commit_rdd: RDD of commit dicts providing the two fields above.
    :return: RDD of ``(repo_name, commit_count, tuple_of_author_names)``.
    """
    def as_repo_pair(c_data):
        # Seed every commit with a one-element author set and a count of 1.
        repo_name = c_data['url'].split("/")[5]
        author_name = c_data['commit']['author']['name']
        return (repo_name, ({author_name}, 1))

    def combine(left, right):
        # Union the author sets and add up the commit counts.
        merged_authors = left[0].union(right[0])
        merged_count = left[1] + right[1]
        return (merged_authors, merged_count)

    def as_output_row(entry):
        repo_name, aggregate = entry
        return (repo_name, aggregate[1], tuple(aggregate[0]))

    per_repo = commit_rdd.map(as_repo_pair).reduceByKey(combine)
    return per_repo.map(as_output_row)

In [17]:
# test: a repository with a single commit and a single author
result = assignment_7(commit_rdd)

expected_set = {("SharedUmbrella", 1, ("AlbertGandolf", ))}

shared_umbrella_rows = result.filter(lambda x: x[0] == "SharedUmbrella")
result_test_set = set(shared_umbrella_rows.collect())

assert expected_set == result_test_set

                                                                                

### Assignment 8
Return RDD of tuples containing the repository name and all the files that are contained in that repository.
Note that the file names must be unique, so if files occur multiple times (for example due to removal, or new additions), the newest File object must be returned. As the files' filenames are an `Option[String]` discard the files that do not have a filename.

@param commits RDD containing commit data.

@return RDD containing the files in each repository as described above.

In [18]:
def assignment_8(commit_rdd):
    """Collect, per repository, the newest file object for every filename.

    The repository name is the path segment at index 5 of ``commit['url']``
    split on ``"/"``. File objects without a filename are discarded. "Newest"
    is decided by ``commit['commit']['author']['date']`` of the commit the
    file object belongs to.

    :param commit_rdd: RDD of commit dicts providing ``'url'``, ``'files'`` and
        ``commit['commit']['author']['date']``.
    :return: RDD of ``(repo_name, [file_object, ...])`` with one file object
        per distinct filename.
    """
    def _keyed_files(c_data):
        repo_name = c_data['url'].split("/")[5]
        commit_date = c_data['commit']['author']['date']
        return [((repo_name, file_object['filename']), (file_object, commit_date))
                for file_object in c_data['files']
                if file_object.get('filename') is not None]   # discard files without a filename

    result_rdd = (
        commit_rdd.flatMap(_keyed_files)                                   # ((repo, filename), (file object, date))
                  # Each value is (file_object, date), so the date is v[1].
                  # Indexing v[1][1] would compare a single character of the
                  # date string instead of the date itself.
                  # Dates are compared as strings; this assumes they all share
                  # one ISO-8601 layout such as '2019-05-23T12:19:58.000Z'.
                  .reduceByKey(lambda v1, v2: v1 if v1[1] > v2[1] else v2)
                  .map(lambda x: (x[0][0], [x[1][0]]))                     # (repo, [file object])
                  .reduceByKey(lambda v1, v2: v1 + v2)                     # one list of file objects per repo
    )

    return result_rdd

In [19]:
# test: for one known repository and file, the retained file object must
# carry the expected sha
result_rdd = assignment_8(commit_rdd)

result_subset1 = (result_rdd.filter(lambda x: x[0] == "hackaton-pelicula-009")
                            .flatMap(lambda x: [(x[0], file["filename"], file["sha"]) for file in x[1]])
                            .filter(lambda x: x[1] == "src/style.css")
                            .collect()
                           )

# rows are (repo, filename, sha); compare the sha of the first match
assert result_subset1[0][2] == "eea08e79ef5ba079af92e5eebad9239bd128213e", "sample commit sha incorrect"

                                                                                

### Assignment 9

For this assignment you are asked to find all the files of a single repository. In order to create an overview of each file, create a tuple containing the file name, all corresponding commit SHAs, and a Stat object representing the changes made to the file.

@param commits RDD containing commit data.

@return RDD containing Tuples representing a file name, its corresponding commit SHA's and a Stats object representing the total aggregation of changes for a file. Format: RDD[(filename: String, SHAs: Seq[String], Stats: Tuple[int, int, int])]

In [20]:
def assignment_9(commit_rdd, repo_name):
    """Summarise every file of one repository: its SHAs and aggregated stats.

    The repository name is the path segment at index 5 of ``commit['url']``
    split on ``"/"``. Per-file stat fields that are missing count as 0.

    :param commit_rdd: RDD of commit dicts providing ``'url'`` and ``'files'``.
    :param repo_name: name of the repository to report on.
    :return: RDD of ``(filename, [sha, ...], (changes, additions, deletions))``.
    """
    def _file_rows(c_data):
        # NOTE(review): this collects the 'sha' stored on the file entry; the
        # assignment text asks for commit SHAs - confirm which one is meant.
        return [(file.get('filename'),
                 ([file.get('sha')],
                  # Default to 0: a missing field would otherwise become None
                  # and break the additions in the reducer below.
                  (file.get('changes') or 0,
                   file.get('additions') or 0,
                   file.get('deletions') or 0)))
                for file in c_data["files"]]

    def _merge(v1, v2):
        shas = v1[0] + v2[0]
        stats = (v1[1][0] + v2[1][0],   # changes
                 v1[1][1] + v2[1][1],   # additions
                 v1[1][2] + v2[1][2])   # deletions
        return (shas, stats)

    result_rdd = (commit_rdd.filter(lambda c_data: c_data["url"].split("/")[5] == repo_name)  # keep the requested repo only
                            .flatMap(_file_rows)                                              # (filename, ([sha], stats)) per file
                            .reduceByKey(_merge)                                              # merge shas, add up stats
                            .map(lambda x: (x[0], x[1][0], x[1][1]))                          # formatting
                 )

    return result_rdd
    

In [21]:
# test: check the aggregated stats of one known file and the number of
# distinct files reported for the repository
expected_num_files = 40
expected_file1_key = "rustfmt.toml"
expected_stats_file1 = (12, 12, 0)

result_rdd = assignment_9(commit_rdd, "gitlab-rs")
# rows are (filename, shas, stats); take the single row for the known file
result_file1 = result_rdd.filter(lambda x: x[0] == expected_file1_key).collect()[0]

assert result_file1[2] == expected_stats_file1
assert len(result_rdd.collect()) == expected_num_files

                                                                                

### Assignment 10
We want to generate an overview of the work done by a user per repository. For this we request an RDD containing a tuple of the committer username, the repository name and a `Stats` object. The Stats object contains the total number of additions, deletions and total contribution.

@param commits RDD containing commit data.

@return RDD containing tuples of committer names, repository names and an Option[Stat] representing additions and deletions.

In [57]:
def assignment_10(commit_rdd):
    """Aggregate commit stats per (committer, repository) combination.

    The committer is ``commit['commit']['committer']['name']``; the repository
    name is the path segment at index 5 of ``commit['url']`` split on ``"/"``.
    Commits without ``'stats'`` (or with missing stat fields) count as zero.

    :param commit_rdd: RDD of commit dicts providing the fields above.
    :return: RDD of ``(committer_name, repo_name, (total, additions, deletions))``.
    """
    def to_keyed_stats(c_data):
        committer_name = c_data["commit"]["committer"]["name"]
        repo = c_data["url"].split("/")[5]
        stats = c_data.get("stats", {})
        totals = (stats.get("total", 0),
                  stats.get("additions", 0),
                  stats.get("deletions", 0))
        return ((committer_name, repo), totals)

    def add_stats(left, right):
        # Element-wise sum of two (total, additions, deletions) tuples.
        return (left[0] + right[0], left[1] + right[1], left[2] + right[2])

    def flatten(entry):
        (committer_name, repo), totals = entry
        return (committer_name, repo, totals)

    summed = commit_rdd.map(to_keyed_stats).reduceByKey(add_stats)
    return summed.map(flatten)

In [58]:
# test: two known (committer, repo, stats) rows must be part of the result
result_rdd = assignment_10(commit_rdd)
result_list = result_rdd.collect()

expected_set = {
    ("GitHub", "danfe", (0, 0, 0)),
    ("ivankovskiy-pc", "AIM-Chat", (75, 45, 30)),
}

result_set = set(result_list)
assert expected_set <= result_set, "expected subset not in result"

                                                                                

In [59]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark`, `sc` or `commit_rdd` cannot be re-run after this point without
# re-running the setup cells.
spark.stop()