# Setup stuff

In [104]:
from fabric import Connection
import subprocess
from dateutil.parser import parse
import datetime
from collections import defaultdict
import csv
import pandas as pd


class RemoteFile(object):
    def __init__(self, machine, path):
        self.machine = machine
        self.path = path
        
    def age(self): 
        r = self.machine.run("stat -c '%y' " + self.path, hide=True )
        return(parse(r.stdout.strip()))
    def checksum(self):
        r = self.machine.run("sum " + self.path, hide=True )
        return(r.stdout.split(" ")[0].strip())
    
    

class LocalFile(object):
    def __init__(self, path):
        self.path = path
        
    def age(self): 
        r = subprocess.check_output(['stat', '-f', '%m', self.path])
        return(datetime.datetime.utcfromtimestamp(int(r)))
    def checksum(self):
        r = subprocess.run(['sum', self.path], capture_output=True)
        return(r.stdout.decode().split(" ")[0].strip())
    def countdistinct(self, column):
        try:
            values = defaultdict(int)
            for row in csv.DictReader(open(self.path)):
                values[row[column]] += 1
            return values
        except KeyError as ve:
            print(column, "not found in", self.path,":",";".join(row.keys()))
    def dictreader(self):
        return csv.DictReader(open(self.path))
    def df(self, indexes):
        df = pd.read_csv(self.path,skipinitialspace=True)
        df.set_index(indexes,inplace=True)#, drop=True, inplace=True)
        return df
   
erebor = Connection("cbogart@erebor.lti.cs.cmu.edu")
oscar = Connection("cbogart@da4.eecs.utk.edu:443")


# Canonical list of packages to study

In [110]:
original_repo_list = RemoteFile(oscar,"/home/cbogart/dev-migration/data/common/gh-repos.txt")
local_orig_repo_list = LocalFile("/Users/bogart-MBP-isri/Dropbox/research/dev-migration/data/common/gh-repos.txt")
original_repo_map = RemoteFile(erebor,"/usr2/scratch/cbogart/dev-migration/data/mapping.csv")
local_orig_repo_map = LocalFile("/Users/bogart-MBP-isri/Dropbox/research/dev-migration/data/common/pkg-repo-map.csv")
print("Original repo list: ", original_repo_list.age())
print("Local repo list: ", local_orig_repo_list.age(), "Checksum: ", original_repo_list.checksum() == local_orig_repo_list.checksum())
print("Original repo map: ", original_repo_map.age())
print("Local repo map: ", local_orig_repo_map.age(), "Checksum: ", original_repo_map.checksum() == local_orig_repo_map.checksum())

#host,repo,ecosystem,packageID,packageName
repos_per_ecosystem  = defaultdict(dict)
multi_package_repos = defaultdict(set)
for row in local_orig_repo_map.dictreader():
    if row["repo"] in repos_per_ecosystem[row["ecosystem"]]:
        multi_package_repos[row["ecosystem"]].add(row["repo"])
        del repos_per_ecosystem[row["ecosystem"]][row["repo"]]
    else:
        repos_per_ecosystem[row["ecosystem"]][row["repo"]] = row["packageID"]
    

Original repo list:  2018-03-27 13:59:14.675785-04:00
Local repo list:  2018-07-11 21:37:30 Checksum:  True
Original repo map:  2018-07-22 16:58:24.926131-04:00
Local repo map:  2018-07-12 16:27:52 Checksum:  False


In [115]:
ecosystems = ("Atom Bioconductor Cargo CocoaPods CPAN CRAN Eclipse Go " + \
           "Hackage Hex Lua Maven NPM NuGet Packagist Pypi Rubygems Stackage").split(" ")
# found oscar packages
oscar_repos = defaultdict(set) 
merged_repo_map = dict()   # repo to pacakge
for e in ecosystems:
    print("Ecosystem",e,":")
    oscar_found_repos = RemoteFile(oscar,"/data/play/cbogart/foundpackages." + e + ".csv")
    local_oscar_found_repos = LocalFile("/Users/bogart-MBP-isri/Dropbox/research/dev-migration/data/common/foundpackages." + e + ".csv")
    print("    Oscar found repos, local match=", oscar_found_repos.checksum() == local_oscar_found_repos.checksum())
    oscar_repos[e] = { r["package"] for r in local_oscar_found_repos.dictreader() }
    print("    Libraries.io has ", len(repos_per_ecosystem[e]), 
          "unique repos, and", len(multi_package_repos[e]), "repos with multiple packages")
    print("    OSCAR found ", len(oscar_repos[e]), "repos")
    overlap = oscar_repos[e].intersection(repos_per_ecosystem[e].keys())
    print("    The overlap is", len(overlap))
    merged_repo_map[e] = {p:repos_per_ecosystem[e][p] for p in overlap }
                                                                                                                                             

Ecosystem Atom :
    Oscar found repos, local match= True
    Libraries.io has  10163 unique repos, and 224 repos with multiple packages
    OSCAR found  8965 repos
    The overlap is 8788
Ecosystem Bioconductor :
    Oscar found repos, local match= True
    Libraries.io has  0 unique repos, and 0 repos with multiple packages
    OSCAR found  1475 repos
    The overlap is 0
Ecosystem Cargo :
    Oscar found repos, local match= False
    Libraries.io has  9274 unique repos, and 841 repos with multiple packages
    OSCAR found  8722 repos
    The overlap is 8118
Ecosystem CocoaPods :
    Oscar found repos, local match= True
    Libraries.io has  37737 unique repos, and 739 repos with multiple packages
    OSCAR found  32312 repos
    The overlap is 31756
Ecosystem CPAN :


UnexpectedExit: Encountered a bad command exit code!

Command: 'sum /data/play/cbogart/foundpackages.CPAN.csv'

Exit code: 1

Stdout:



Stderr:

sum: /data/play/cbogart/foundpackages.CPAN.csv: No such file or directory



In [116]:
orig_repo_ecosystems = local_orig_repo_map.countdistinct("ecosystem")
print(orig_repo_ecosystems)
for e in merged_repo_map: print(e,len(merged_repo_map[e]))


defaultdict(<class 'int'>, {'Packagist': 191926, 'NPM': 510393, 'Cargo': 12490, 'NuGet': 53324, 'Rubygems': 105434, 'Pypi': 80966, 'Go': 748775, 'CocoaPods': 39453, 'Hex': 5737, 'CPAN': 10313, 'Maven': 61477, 'Hackage': 7941, 'Atom': 10615, 'CRAN': 3529})
Atom 8788
Bioconductor 0
Cargo 8118
CocoaPods 31756


In [93]:
# Author-level information

In [None]:
#author,eco,package,epoch,tz,core,files
#larrybrid <jcameronfulton@gmail.com>,Pypi,LarryBrid_python-mailerlite,1441576273,-0400,1,
packages = dict()
for e in ecosystems:
    authact = RemoteFile
    packages[e] = 


# Ecosystem-wide practices

* Counts of major practices by year by ecosystem as distilled directly from libraries.io
* Gathered by 10a_lib_project_practices-db.py
* file is releases.csv

In [98]:

lio_raw_versions = RemoteFile(erebor,"/usr2/scratch/libraries.io/raw/versions-1.0.0-2017-06-15.csv")
eco_yrly_prax_lio = RemoteFile(erebor,"/usr2/scratch/cbogart/dev-migration/data/libraries/releases.csv")
get_eco_yrly_prax_lio = RemoteFile(erebor,"/usr2/scratch/cbogart/dev-migration/scripts/10a_lib_project-practices-db.py")
local_eco_yrly_prax_lio = LocalFile("../data/laptop/releases.csv")
print("Counts of release practices by ecosystem from libraries.io")
print("   latest script updated ", get_eco_yrly_prax_lio.age())
print("   latest file output updated ", eco_yrly_prax_lio.age())
print("   local file output updated ", local_eco_yrly_prax_lio.age())
print("   local file checksum comparison ", eco_yrly_prax_lio.checksum() == local_eco_yrly_prax_lio.checksum())


Counts of release practices by ecosystem from libraries.io
   latest script updated  2018-07-23 16:41:51.916120-04:00
   latest file output updated  2018-07-23 16:44:44.300282-04:00
   local file output updated  2018-07-23 20:46:39
   local file checksum comparison  True


# Package-level practice data from libraries.io


In [None]:
#../data/laptop/pkg_releases.csv
#../data/laptop/pkg_releases.csv
#year,eco,kind,package,count
#1970,Rubygems,release,nakilon,1


# Influence data  package x ecosystem

In [None]:
#"../data/laptop/infl/infl_pkg_" 

# Author-level and package-level summaries from Oscar

In [None]:
for e in ecosystem:
    author_prax = RemoteFile(oscar,"/data/play/authBehavior." + e + ".csv.gz")
    local_author_prax = LocalFile("../data/laptop/authBehavior." + e + ".csv.gz")
    local_repo_prax = LocalFile("../data/laptop/projectBehavior." + e + ".csv.gz")
    
