In [67]:
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
import json
import nltk
import numpy as np
import os
import pandas as pd
from pandas.io.json import json_normalize    
import re

Parsing functionality for event types:
```
{'CommitCommentEvent',
 'CreateEvent',
 'DeleteEvent',
 'ForkEvent',
 'GollumEvent',
 'IssueCommentEvent',
 'IssuesEvent',
 'MemberEvent',
 'PublicEvent',
 'PullRequestEvent',
 'PullRequestReviewCommentEvent',
 'PushEvent',
 'ReleaseEvent',
 'WatchEvent'}
 ```

In [23]:
def is_event(record, e):
    return record['type'] == e

In [24]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def preprocess_message(m):
    m = m.lower()
    m = strip_html(m)
    m = remove_between_square_brackets(m)
    return m
    

### Events

In [71]:
event_sequence = defaultdict(list)
events = []
with open("../data/practice_gh.json", 'r') as f:
    for i, record in enumerate(f):
        data = json.loads(record)
        _data = {
            'idx': i,
            'repo_id': data['repo']['id'],
            'created_at': data['created_at'],
            'event_type': data['type']}
        events.append(_data)
        event_sequence[_data['repo_id']].append(_data['event_type'])

In [72]:
sequences = [np.array(['start'] + s + ['end']) for s in event_sequence.values()]

In [77]:
sequences_flattend = np.concatenate(sequences).flatten()

In [90]:
# Bigrams
d_bigrams = defaultdict(list)
for (first, second) in list(nltk.bigrams(sequences_flattend)):
    d_bigrams[first].append(second)
d2_bigrams = defaultdict(Counter)
# Transition counts
for k, arr in d_bigrams.items():
    d2_bigrams[k] = Counter(arr)

In [91]:
# TODO (BP): Need to fill 0 probs for events that do not co-occur.
for k, arr in d2_bigrams.items():
    keys = arr.keys()
    cnts = np.fromiter(arr.values(), dtype=float)
    total = np.sum(cnts)
    probs = cnts / total
    d2_bigrams[k] = {k:pr for k, pr in zip(keys, probs)}

In [92]:
d2_bigrams

defaultdict(collections.Counter,
            {'CommitCommentEvent': {'CommitCommentEvent': 0.273972602739726,
              'ForkEvent': 0.0136986301369863,
              'IssueCommentEvent': 0.0136986301369863,
              'IssuesEvent': 0.0273972602739726,
              'PullRequestEvent': 0.0273972602739726,
              'PushEvent': 0.1095890410958904,
              'end': 0.5342465753424658},
             'CreateEvent': {'CreateEvent': 0.25696804894629505,
              'DeleteEvent': 0.023113528212100613,
              'ForkEvent': 0.0006798096532970768,
              'IssueCommentEvent': 0.0013596193065941536,
              'IssuesEvent': 0.004078857919782461,
              'MemberEvent': 0.002719238613188307,
              'PullRequestEvent': 0.02379333786539769,
              'PullRequestReviewCommentEvent': 0.0006798096532970768,
              'PushEvent': 0.17675050985723997,
              'ReleaseEvent': 0.004758667573079538,
              'WatchEvent': 0.002039428959891

In [39]:
df_events = pd.DataFrame(events)

In [40]:
df_events.groupby("repo_id").count().reset_index().sort_values('idx', ascending=False).head(n=5)

Unnamed: 0,repo_id,created_at,event_type,idx
2488,21481110,80,80,80
135,702550,64,64,64
5066,28648149,42,42,42
1713,15233168,36,36,36
5886,28689169,30,30,30


In [50]:
df_events.to_dict().keys()

dict_keys(['event_type', 'repo_id', 'idx', 'created_at'])

In [34]:
df_events[df_events['repo_id']==724712]

Unnamed: 0,created_at,event_type,idx,repo_id
1554,2015-01-01T15:08:29Z,IssueCommentEvent,1554,724712
1835,2015-01-01T15:10:00Z,PullRequestEvent,1835,724712
4492,2015-01-01T15:24:14Z,IssuesEvent,4492,724712
5602,2015-01-01T15:30:14Z,CommitCommentEvent,5602,724712
5944,2015-01-01T15:31:59Z,IssuesEvent,5944,724712
7635,2015-01-01T15:41:05Z,CommitCommentEvent,7635,724712
9615,2015-01-01T15:51:05Z,CommitCommentEvent,9615,724712
9624,2015-01-01T15:51:06Z,IssueCommentEvent,9624,724712
9626,2015-01-01T15:51:07Z,PushEvent,9626,724712
9627,2015-01-01T15:51:07Z,CommitCommentEvent,9627,724712


### Push Events

In [35]:
    


class EventParser:
    def __init__(self, data, idx):
        self.raw_data = data
        self.idx = idx
        
    def parse(self, data):
        """Extract attributes from data."""
        NotImplementedError()
        
    def to_dict(self):
        """Attributes to dict."""
        NotImplementedError()
        
class PushEventParser(EventParser):
    def __init__(self, data, idx):
        super(PushEventParser, self).__init__(data, idx)
        self.type = "PushEvent"
        
    def parse(self):
        """Extract attributes from data."""
        self.repo_id = data['repo']['id']
        self.repo_name = data['repo']['name']
        self.before = data['payload']['before']
        self.commits = data['payload']['commits']
        self.processed_commits = self._extract_commits()
        self.distinct = data['payload']['distinct_size']  # Number of commits in payload
        self.created_at = data['created_at']
        
    def to_dict(self):
        """Attributes to dict list of dicts.
        
        Note that we return list of dicts to handle payloads with lists.
        
        Expect to accumulate data in lists using .extend()
        
        """
        data = []
        d_global = {
            "repo_id": self.repo_id,
            "repo_name": self.repo_name,
            "distinct": self.distinct,
            "created_at": self.created_at
        }
        for commit in self.processed_commits:
            commit.update(d_global)
            data.append(commit)
        return data
        
    def _extract_commits(self):
        processed_commits = []
        for i, commit in enumerate(self.commits):
            curr_commmit = {
                'commit_idx': i,
                'commit_sha': commit['sha'],
                'is_unique': commit['distinct'],
                'message': commit['message']
            }
            processed_commits.append(curr_commmit)
        return processed_commits

In [37]:
d = []
with open("../data/practice_gh.json", 'r') as f:
    for i, record in enumerate(f):
        data = json.loads(record)
        if is_event(data, 'PushEvent'):
            import pdb; pdb.set_trace();
            
            parser = PushEventParser(data, i)
            parser.parse()
            curr_data = parser.to_dict()
            d.extend(curr_data)

> <ipython-input-37-b24b9e20a8f0>(8)<module>()
-> parser = PushEventParser(data, i)
(Pdb) c
> <ipython-input-37-b24b9e20a8f0>(6)<module>()
-> import pdb; pdb.set_trace();
(Pdb) c
> <ipython-input-37-b24b9e20a8f0>(8)<module>()
-> parser = PushEventParser(data, i)
(Pdb) quit()


BdbQuit: 

In [75]:
df = pd.DataFrame(d)

In [76]:
preprocess_message(df['message'][0])

'fix main header height on mobile'

In [106]:
# df.sort_values('distinct', ascending=False)
df[df['repo_id'] == 15841152].head()

Unnamed: 0,commit_idx,commit_sha,distinct,is_unique,message,repo_id,repo_name
2547,0,568cfcc7ba58e46ab8e282ba87a4fc5a01b37365,2287,True,lwt 2.4.6,15841152,pippijn/opam-repository
2548,1,af9f482a6cac522d5df6754a2186c50fba8eef2c,2287,True,tighter constraint on rss for stog 0.4 -> 0.7,15841152,pippijn/opam-repository
2549,2,a6444b805dc3489f1c08ea0aff2a02b277920633,2287,True,Merge branch 'master' of https://github.com/OC...,15841152,pippijn/opam-repository
2550,3,8a005b284fc80e5af176a8c14c77364c5b5103ce,2287,True,OCaml version specified,15841152,pippijn/opam-repository
2551,4,656fd3e9f93d8e1e4223fc225d53c478568973f3,2287,True,-j jobs option added to Coq IDE,15841152,pippijn/opam-repository


In [107]:
df.groupby(by=['repo_id']) \
    .count() \
    .reset_index(drop=False) \
    .sort_values('repo_name', ascending=False) \
    .head()

Unnamed: 0,repo_id,commit_idx,commit_sha,distinct,is_unique,message,repo_name
236,6058234,235,235,235,235,235,235
235,6058151,80,80,80,80,80,80
1012,21481110,80,80,80,80,80,80
231,6031942,80,80,80,80,80,80
1076,22131621,59,59,59,59,59,59
