In [31]:
import pandas as pd
import json
import datetime
import os

data_dir = '/Users/coreysery/Developer/School/CSC3130/reposcrape/data/'

alldata = {}
"""
Alldata structure

[date]
    [lang]
        [repo]
            author
            desc
            forks
            lang
            repo
            repoLink
            stars
            starsToday
"""

# Iterate through dates
d = datetime.datetime(2017, 2, 1)
end = datetime.datetime.now()
delta = datetime.timedelta(days=1)
while d <= end:
    date = d.strftime("%Y-%m-%d")
    try:
        fn = "{}.github.json".format(date)
        with open(data_dir + fn) as json_data:
            data = json.load(json_data)
            alldata[date] = data
    except Exception as e:
        pass
    
    d += delta

# for filename in os.listdir(data_dir):
#     date = filename.split('.')[0]
#     with open(data_dir + filename) as json_data:
#         alldata[date] = json.load(json_data)
        
print('done')




done


In [None]:
"""
Data structure

[
    date: [
        lang: [
            repos: [
                author
                desc
                forks
                lang
                repo
                repoLink
                stars
                starsToday
            ]
        ]
    ]
]

The raw data is not formattable to panda data strucutures
"""

# alldata["2017-02-09"]


In [55]:
# Top Trending repos
import operator

def reposBy(field, date, n=50):
    """
    Sort repos for given day
    
    Args:
        field: Field to sort by (stars, stars today, or forks)
        date: Date to pull from the alldata dictionary
        n: Return the top n repos
        
    Returns:
        Sorted list of sets
        Each set is (repo, count)
        where count is the sum of the given field for the repo
    """
    
    # Check that day exists
    if alldata[date] is None:
        return []
    
    
    results = {}

    for lang, repos in alldata[date].items():
        for r in repos:
            name = r['repo']
            results[name] = r[field]
            
    sorted_res = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    results = dict((x, y) for x, y in sorted_res[:n-1])

    return results



In [64]:
# Top Trending Languages

import operator

def languagesBy(field, date, n=50):
    """
    Sort languages for given day
    
    Args:
        field: Field to sort by (stars, stars today, or forks)
        date: Date to pull from the alldata dictionary
        n: Return the top n languages
        
    Returns:
        Sorted list of sets
        Each set is (language, count)
        where count is the sum of the given field for the language
    """
    
    # Check that day exists
    if alldata[date] is None:
        return {}
    
    results = {}
    for lang, repos in alldata[date].items():
        results[lang] = 0
        
        for r in repos:
            results[lang] += r[field]


    sorted_res = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    results = dict((x, y) for x, y in sorted_res[:n-1])
    
    return results



In [4]:
def getAll(func, args):
    by = {}

    for date in alldata.keys():
#         byStars[date] = reposBy('forks', date, 10)
        by[date] = func(args.split())

    results = {}
    for date in by:
        for pair in by[date]:
            repo = pair[0]
            if not repo in results:
                results[repo] = []

            results[repo].append(pair[1])

    return results

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Needed to run plotly offline
init_notebook_mode(connected=True)

byStars = {}

for date in alldata.keys():
    byStars[date] = reposBy('stars', date, 10)
    
results = {}
for date in byStars:
    for repo, count in byStars[date].items():
        if not repo in results:
#             print(repo)
            results[repo] = []

        results[repo].append(count)

# print(results)

traces = []
for repo, history in results.items():
    traces.append(go.Scatter(
#         x = history,
        y = history,
        name = repo
    ))

iplot(traces)

    

In [None]:
def velocity(top):
    return 0
    

In [None]:
velocity(reposBy('forks', date, 10))

In [67]:
import numpy as np

"""
stars on a day for a language
"""

dates = list(alldata.keys())
dates.sort()
res = {'Date': dates}

for i, date in enumerate(dates):
    data = languagesBy('stars', date, 10)
    for lang, amount in data.items():
        if not lang in res:
            print(lang)
            res[lang] = np.zeros(len(dates))
        res[lang][i] = amount
        
df = pd.DataFrame(res)
df = df.set_index('Date')

df
    


Go
C
TypeScript
JavaScript
Python
C++
Ruby
HTML
all
API Blueprint
Isabelle ROOT
GCC Machine Description
Gentoo Eclass
Gettext Catalog
Literate Agda
Linux Kernel Module
Inform 7
Web Ontology Language
Standard ML
Unity3D Asset
Jupyter Notebook
Wavefront Object
Maven POM
Shell
Ant Build System
AGS Script
SubRip Text
Alpine Abuild
DNS Zone
Cap'n Proto
Java
Filebench WML
Ecere Projects
Grammatical Framework
Visual Basic
Groovy Server Pages
Spline Font Database
OpenType Feature File
Public Key
OpenEdge ABL
TI Program
Parrot Assembly
Pure Data
Protocol Buffer
DIGITAL Command Language
OpenRC runscript
POV-Ray SDL
Darcs Patch
Game Maker Language
Inno Setup
1C Enterprise
Graph Modeling Language
Raw token data
Gentoo Ebuild
Literate CoffeeScript
Literate Haskell
Module Management System
Component Pascal
Emacs Lisp
Unified Parallel C
Unix Assembly
SRecode Template
Python console
Csound Document
C2hs Haskell
ColdFusion CFC
Propeller Spin
Linker Script
Sublime Text Config
World of Warcraft Addon Dat

Unnamed: 0_level_0,1C Enterprise,AGS Script,API Blueprint,APL,Alpine Abuild,Ant Build System,Apollo Guidance Computer,Arduino,C,C#,...,TypeScript,Unified Parallel C,Unity3D Asset,Unix Assembly,Vim script,Visual Basic,Wavefront Object,Web Ontology Language,World of Warcraft Addon Data,all
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,201330.0,0.0,...,171769.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,445915.0
2017-02-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,217839.0,0.0,...,175043.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,367667.0
2017-02-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,213180.0,0.0,...,169218.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,362092.0
2017-02-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,221257.0,0.0,...,183678.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,318397.0
2017-02-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187261.0,0.0,...,184076.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,274831.0
2017-02-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,197646.0,0.0,...,185820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,276658.0
2017-02-15,0.0,0.0,361452.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,179858.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,407429.0,0.0,407420.0
2017-02-17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,193845.0,0.0,...,157882.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,426164.0
2017-02-19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,193414.0,0.0,...,181544.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,487300.0


In [52]:
import numpy as np

"""
forks on a day for a language
"""

dates = list(alldata.keys())
dates.sort()
res = {'Date': dates}

for i, date in enumerate(dates):
    data = languagesBy('forks', date)
    for lang, amount in data.items():
#         print(lang)
        if not lang in res:
            a = np.zeros(len(dates))
            a.fill(0)
            res[lang] = a
#         print(res[lang])
        res[lang][i] = amount
        
df = pd.DataFrame(res)
df = df.set_index('Date')

df
    


{0: 'Ecere Projects', 1409: 'ColdFusion', 7939: 'Matlab', 4: 'Omgrofl', 1989: 'Protocol Buffer', 32: 'Harbour', 54: 'XProc', 10: 'IGOR Pro', 909: 'Nim', 14: 'UrWeb', 6287: 'Kotlin', 22: 'APL', 721: 'Mathematica', 1475: 'Limbo', 2901: 'Arduino', 6678: 'Vue', 25: 'Ioke', 4378: 'Eagle', 79: 'TLA', 28: 'Fantom', 182: 'Jasmin', 1440: 'ANTLR', 3: 'Filebench WML', 806: 'OpenEdge ABL', 18854: 'C#', 4305: 'Perl', 20269: 'CoffeeScript', 3054: 'POV-Ray SDL', 48: 'Monkey', 8: 'Zimpl', 7603: 'Erlang', 62: 'Mirah', 13238: 'Emacs Lisp', 82873: 'C++', 190: 'HLSL', 7789: 'Assembly'}


ValueError: could not convert string to float: 'Ecere Projects'

In [44]:
import numpy as np

"""
stars on a day for a repo
"""

dates = list(alldata.keys())
dates.sort()
res = {'Date': dates}

for i, date in enumerate(dates):
    data = reposBy('stars', date)
    for lang, amount in data.items():
#         print(lang)
        if not lang in res:
            a = np.zeros(len(dates))
            a.fill(0)
            res[lang] = a
#         print(res[lang])
        res[lang][i] = amount
        
df = pd.DataFrame(res)
df = df.set_index('Date')

df
    


Unnamed: 0_level_0,.dotfiles,.emacs.d,.hammerspoon,.nixpkgs,.vim,0.30000000000000004,007,07Client,0ctf2017_kernel_pwn,0x0D,...,zsdx,zsh-autosuggestions,zsh-completions,zsh-syntax-highlighting,zshconf,zstd,ztree,zxcvbn,zxing,zynq-sandbox
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-09,2.0,1082.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,106.0,6473.0,0.0,24.0
2017-02-10,2.0,1082.0,12.0,0.0,158.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6476.0,0.0,24.0
2017-02-11,2.0,0.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,4688.0,0.0,6478.0,0.0,25.0
2017-02-12,2.0,0.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6480.0,0.0,25.0
2017-02-13,2.0,0.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6484.0,0.0,25.0
2017-02-14,2.0,1083.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6485.0,0.0,25.0
2017-02-15,2.0,0.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6490.0,0.0,25.0
2017-02-16,2.0,1084.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6493.0,0.0,25.0
2017-02-17,2.0,0.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6494.0,0.0,25.0
2017-02-19,2.0,1084.0,12.0,0.0,0.0,0.0,100.0,2.0,0.0,23.0,...,0.0,2165.0,0.0,3232.0,6.0,0.0,0.0,6621.0,0.0,25.0


In [None]:
"""
April 9th

Difficulties
    My data in its raw format wasn't able to fit to dataframe. It's difficult to show before and after representations.

"""