In [80]:
from bs4 import BeautifulSoup
from collections import Counter
import json
import os
import pandas as pd
from pandas.io.json import json_normalize    
import re


In [71]:
def is_event(record, e):
    """Tell whether an event record is of the given type.

    Args:
        record: Mapping for one event; its 'type' entry is compared.
        e: Event type value to compare against.

    Returns:
        True when ``record['type']`` equals ``e``, otherwise False.

    Raises:
        KeyError: If ``record`` has no 'type' key.
    """
    event_type = record['type']
    return event_type == e

In [72]:
def strip_html(text):
    """Return the text content of ``text`` as extracted by BeautifulSoup.

    The input is parsed with the built-in "html.parser" backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Remove every ``[...]`` span, brackets included, from ``text``.

    A span starts at a ``[`` and ends at the first following ``]``. An
    opening bracket with no closing bracket after it is left untouched.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all bracketed spans deleted. Whitespace around a
        removed span is kept as-is.
    """
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

def preprocess_message(m):
    """Normalise a commit message for text analysis.

    Steps, applied in this order:
      1. lower-case the message,
      2. run it through ``strip_html``,
      3. run it through ``remove_between_square_brackets``.

    Args:
        m: The message string.

    Returns:
        The cleaned message string.
    """
    lowered = m.lower()
    without_markup = strip_html(lowered)
    return remove_between_square_brackets(without_markup)
    

### Events

In [93]:
# Build one summary row per line of the dump; each line is parsed as its
# own JSON document and `idx` records the line's position in the file.
events = []
with open("../data/practice_gh.json", 'r') as f:
    for i, record in enumerate(f):
        data = json.loads(record)
        repo_id = data['repo']['id']
        event_type = data['type']
        events.append(dict(idx=i, repo_id=repo_id, event_type=event_type))

In [94]:
# One row per event, with the columns idx, repo_id and event_type
# collected in `events`.
df_events = pd.DataFrame(events)

In [104]:
# Rank repositories by their number of event rows: after count() the
# 'idx' column holds the per-repo row count, so sorting on it descending
# puts the repositories with the most events first.
(
    df_events
    .groupby("repo_id")
    .count()
    .reset_index()
    .sort_values('idx', ascending=False)
    .head()
)

Unnamed: 0,repo_id,event_type,idx
2488,21481110,80,80
135,702550,64,64
5066,28648149,42,42
1713,15233168,36,36
5886,28689169,30,30


In [105]:
# First few event rows belonging to repository 21481110.
df_events.loc[df_events['repo_id'] == 21481110].head()

Unnamed: 0,event_type,idx,repo_id
88,PushEvent,88,21481110
208,PushEvent,208,21481110
317,PushEvent,317,21481110
458,PushEvent,458,21481110
602,PushEvent,602,21481110


### Push Events

In [None]:
class EventParser:
    """Base class for parsers that turn one raw event record into flat rows.

    Subclasses are expected to override ``parse`` and ``to_dict``.

    Attributes:
        raw_data: The event record exactly as passed to the constructor.
        idx: Caller-supplied index identifying the record.
    """

    def __init__(self, data, idx):
        self.raw_data = data
        self.idx = idx

    def parse(self, data=None):
        """Extract attributes from data.

        Args:
            data: Optional record to parse. Implementations fall back to
                ``self.raw_data`` when it is omitted.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        # The exception has to be raised: merely constructing it is a no-op
        # that lets the call silently return None.
        raise NotImplementedError()

    def to_dict(self):
        """Attributes to dict.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError()


class PushEventParser(EventParser):
    """Parser for ``PushEvent`` records that yields one row per commit.

    Call ``parse()`` first, then ``to_dict()`` to obtain the rows.
    """

    def __init__(self, data, idx):
        super(PushEventParser, self).__init__(data, idx)
        self.type = "PushEvent"

    def parse(self, data=None):
        """Extract attributes from the event record.

        Args:
            data: Optional record to parse instead of the one given at
                construction time. Defaults to ``self.raw_data``.

        Raises:
            KeyError: If the record lacks one of the accessed keys
                ('repo', 'payload', or their nested fields).
        """
        # Read from the instance rather than from whatever module-level
        # `data` happens to exist in the notebook.
        record = self.raw_data if data is None else data
        repo = record['repo']
        payload = record['payload']

        self.repo_id = repo['id']
        self.repo_name = repo['name']
        self.before = payload['before']
        self.commits = payload['commits']
        self.processed_commits = self._extract_commits()
        # Count of distinct commits in the push, as reported by the payload.
        self.distinct = payload['distinct_size']

    def to_dict(self):
        """Attributes to list of dicts.

        Note that we return a list of dicts to handle payloads with lists:
        one dict per commit, each also carrying the repo-level fields.

        Expect to accumulate data in lists using .extend()

        Returns:
            A list of new dicts; ``self.processed_commits`` is left
            unmodified, so repeated calls give equal results.
        """
        d_global = {
            "repo_id": self.repo_id,
            "repo_name": self.repo_name,
            "distinct": self.distinct
        }
        # Copy each commit row instead of update()-ing it in place.
        return [dict(commit, **d_global) for commit in self.processed_commits]

    def _extract_commits(self):
        """Flatten ``self.commits`` into one small dict per commit."""
        return [
            {
                'commit_idx': i,
                'commit_sha': commit['sha'],
                'is_unique': commit['distinct'],
                'message': commit['message']
            }
            for i, commit in enumerate(self.commits)
        ]

In [None]:
# Collect one row per commit across all PushEvent records in the dump;
# every other event type is skipped.
d = []
with open("../data/practice_gh.json", 'r') as f:
    for i, record in enumerate(f):
        data = json.loads(record)
        if not is_event(data, 'PushEvent'):
            continue
        parser = PushEventParser(data, i)
        parser.parse()
        d.extend(parser.to_dict())

In [75]:
# One row per commit, built from the dicts accumulated in `d`.
df = pd.DataFrame(d)

In [76]:
# Sanity-check the cleaning pipeline on the message in the row labelled 0.
preprocess_message(df.loc[0, 'message'])

'fix main header height on mobile'

In [106]:
# First few commit rows of repository 15841152 (presumably picked by
# sorting df on 'distinct' in descending order — TODO confirm).
df.loc[df['repo_id'] == 15841152].head()

Unnamed: 0,commit_idx,commit_sha,distinct,is_unique,message,repo_id,repo_name
2547,0,568cfcc7ba58e46ab8e282ba87a4fc5a01b37365,2287,True,lwt 2.4.6,15841152,pippijn/opam-repository
2548,1,af9f482a6cac522d5df6754a2186c50fba8eef2c,2287,True,tighter constraint on rss for stog 0.4 -> 0.7,15841152,pippijn/opam-repository
2549,2,a6444b805dc3489f1c08ea0aff2a02b277920633,2287,True,Merge branch 'master' of https://github.com/OC...,15841152,pippijn/opam-repository
2550,3,8a005b284fc80e5af176a8c14c77364c5b5103ce,2287,True,OCaml version specified,15841152,pippijn/opam-repository
2551,4,656fd3e9f93d8e1e4223fc225d53c478568973f3,2287,True,-j jobs option added to Coq IDE,15841152,pippijn/opam-repository


In [107]:
# Rank repositories by number of commit rows. After count() every column
# holds a per-repo non-null count, so sorting on 'repo_name' orders by
# that count rather than by the repository's name.
(
    df
    .groupby(by=['repo_id'])
    .count()
    .reset_index()
    .sort_values('repo_name', ascending=False)
    .head()
)

Unnamed: 0,repo_id,commit_idx,commit_sha,distinct,is_unique,message,repo_name
236,6058234,235,235,235,235,235,235
235,6058151,80,80,80,80,80,80
1012,21481110,80,80,80,80,80,80
231,6031942,80,80,80,80,80,80
1076,22131621,59,59,59,59,59,59
