In [1]:
import base64
import datetime
import glob
import os
import pickle
from pprint import pprint

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from github import Github
from IPython.display import display, HTML
from networkx import Graph, DiGraph, simple_cycles
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [None]:
# Repositories under study.
# Key:   short label; the commit-group cell uses it to build the per-repository
#        file names 'data/commits/<key>.csv' and
#        'data/commit_groups/commitGroups_<key>.pkl'.
# Value: the corresponding '<owner>/<repo>' string.
repo_dic = {
    'Android-Universal-Image-Loader': 'nostra13/Android-Universal-Image-Loader',
    'antlr': 'antlr/antlr4',
    'BroadleafCommerce': 'BroadleafCommerce/BroadleafCommerce',
    'hazelcast': 'hazelcast/hazelcast',
    'junit': 'junit-team/junit',
    'mapdb': 'jankotek/mapdb',
    'mcMMO': 'mcMMO-Dev/mcMMO',
    'nasa_mct': 'nasa/mct',
    'neo4j': 'neo4j/neo4j',
    'netty': 'netty/netty',
    'orientdb': 'orientechnologies/orientdb',
    'titan': 'thinkaurelius/titan',
    'dragula': 'bevacqua/dragula',
    'sequelpro': 'sequelpro/sequelpro',
    'wifiphisher': 'wifiphisher/wifiphisher',
    'redshift': 'jonls/redshift',
    'winget-cli': 'microsoft/winget-cli',
    'PaddleDetection': 'PaddlePaddle/PaddleDetection',
    'EyeWitness': 'FortyNorthSecurity/EyeWitness',
    'photoprism': 'photoprism/photoprism',
    'chartist_js': 'gionkunz/chartist-js',
    'cleave_js': 'nosir/cleave.js',
    'ossrs_srs': 'ossrs/srs',
    'go_cloud': 'google/go-cloud',
    'Windows_driver_samples': 'microsoft/Windows-driver-samples',
    'sweetalert': 'sweetalert2/sweetalert2',
    'files-community_Files': 'files-community/Files',
    'gspread': 'burnash/gspread',
    'prisma': 'prisma/prisma',
    'd3_d3': 'd3/d3',
    'snipe-it': 'snipe/snipe-it',
    'klipper': 'KevinOConnor/klipper',
    'TheAlgorithms_Java': 'TheAlgorithms/Java',
    'lighthouse': 'GoogleChrome/lighthouse',
    'aws_cli': 'aws/aws-cli',
    'jquery': 'jquery/jquery',
    'socketio': 'socketio/socket.io',
    'expressjs': 'expressjs/express',
    'swift_evolution': 'apple/swift-evolution',
    'CNTK': 'Microsoft/CNTK',
    'sqlmap': 'sqlmapproject/sqlmap',
    'Arduino': 'arduino/Arduino',
    'iina': 'iina/iina',
    'corda': 'corda/corda',
    'LightGBM': 'microsoft/LightGBM',
    'MMdnn': 'microsoft/MMdnn',
    'gson': 'google/gson',
    'rstudio': 'rstudio/rstudio',
    'turicreate': 'apple/turicreate',
    'hive': 'apache/hive',
    'intellij_plugins': 'JetBrains/intellij-plugins',
    'eureka': 'Netflix/eureka',
    'ignite': 'apache/ignite',
    'hammerjs': 'hammerjs/hammer.js',
    'postgres': 'postgres/postgres',
    'gitbucket': 'gitbucket/gitbucket',
    'mediaelement': 'mediaelement/mediaelement',
    'graal': 'oracle/graal',
    'fabric': 'hyperledger/fabric',
    'jquery-mobile': 'jquery/jquery-mobile',
    'Activiti': 'Activiti/Activiti',
    'nopCommerce': 'nopSolutions/nopCommerce',
    'zipline': 'quantopian/zipline',
    'CoreNLP': 'stanfordnlp/CoreNLP',
    'metabase': 'metabase/metabase',
    'efcore': 'dotnet/efcore',
    'harbor': 'goharbor/harbor',
    'swoole-src': 'swoole/swoole-src',
    'Signal-Android': 'signalapp/Signal-Android',
    'tdesktop': 'telegramdesktop/tdesktop',
    'ExoPlayer': 'google/ExoPlayer',
    'kong': 'Kong/kong',
    'ReactiveCocoa': 'ReactiveCocoa/ReactiveCocoa',
    'checkstyle': 'checkstyle/checkstyle',
    'rabbitmq-server': 'rabbitmq/rabbitmq-server',
    'cesium': 'CesiumGS/cesium',
    'ILSpy': 'icsharpcode/ILSpy',
}


In [None]:
# Build the "commit groups" of every repository and pickle them.
#
# A commit group is the array of distinct committers that touched one file
# within one 13-week window. Every group later becomes a clique in the
# developer social network (DSN).

# Data-collection cutoff and window length are identical for every repository,
# so they are computed once instead of on every loop iteration.
start_time = datetime.datetime.strptime('5/15/21', '%m/%d/%y')
time_change = datetime.timedelta(weeks=13)  # roughly three months


def _window_boundaries(newest, oldest, step):
    """Return descending window boundaries starting at `newest`.

    Boundaries are `newest`, `newest - step`, `newest - 2*step`, ... and stop
    with the first boundary that lies strictly before `oldest`, so the oldest
    commit is always covered by the last window. If `oldest` is missing
    (NaT/None, e.g. an empty frame) or later than `newest`, only `[newest]`
    is returned and therefore no window exists.
    """
    boundaries = [newest]
    # A plain while-loop: the number of windows must not depend on how many
    # rows the commit table happens to have.
    while pd.notna(oldest) and boundaries[-1] >= oldest:
        boundaries.append(boundaries[-1] - step)
    return boundaries


def _commit_groups(commits, boundaries):
    """Collect, per window and per file, the distinct committers.

    `commits` needs the columns 'date', 'filename' and 'committer';
    `boundaries` is the descending list from `_window_boundaries`.
    Windows are half-open, [lower, upper), so a commit stamped exactly on a
    boundary is counted exactly once instead of being dropped.
    Returns a flat list with one array of committers per (window, file).
    """
    groups = []
    for upper, lower in zip(boundaries, boundaries[1:]):
        in_window = commits[(commits['date'] < upper) & (commits['date'] >= lower)]
        groups.extend(in_window.groupby('filename')['committer'].unique().tolist())
    return groups


for key in repo_dic:

    df = pd.read_csv('data/commits/' + key + '.csv', index_col=0)

    # drop rows without a committer
    df = df.dropna(subset=['committer'])

    # drop commits that changed 200 or more files
    df = df[df['files changed'] < 200].reset_index(drop=True)

    # parse timestamps and order commits newest first
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
    df = df.sort_values(by='date', ascending=False).reset_index(drop=True)

    # 13-week windows going back from the cutoff to the oldest commit
    time_lst = _window_boundaries(start_time, df['date'].min(), time_change)

    # create commit groups
    commitgroups = _commit_groups(df, time_lst)

    # construct the developer network: every commit group forms a clique.
    # NOTE(review): this graph is not persisted; the metrics cell rebuilds it
    # from the pickled groups. Kept so the last network stays inspectable.
    G_symmetric1 = nx.Graph()
    for group in commitgroups:
        G_symmetric1.add_edges_from(
            (a, b) for idx, a in enumerate(group) for b in group[idx + 1:]
        )

    # save the commit groups that characterise the DSN
    with open("data/commit_groups/commitGroups_" + key + ".pkl", "wb") as a_file:
        pickle.dump(commitgroups, a_file)


### Network metrics collection

In [None]:
# Compute network metrics for every pickled developer social network and
# store them, one row per repository, in a single DataFrame.

# Third-party dependency (pip install powerlaw). The import used to be
# commented out, which made this cell fail with NameError on a fresh kernel.
import powerlaw

_GROUP_FILE_PREFIX = "commitGroups_"

# column order of the resulting dataframe
columns = ['repo', 'no_of_nodes', 'no_of_edges', 'average_degree', 'average_clustering_coefficient',
           'transitivity', 'assortativity_coefficient', 'average_path_length', 'diameter', 'radius',
           'clustering_coefficients', 'degree_of_nodes', 'eccentricities',
           'betweenness_centralities', 'closeness_centralities', 'eigenvector_centralities',
           'degree_centralities', 'page_rank', 'hubs', 'authorities']


def _repo_name(path):
    """Return the repository label encoded in a commit-group file path.

    'data/commit_groups/commitGroups_antlr.pkl' -> 'antlr'. Works on the base
    name only, so the directory part and the path separator do not matter.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.startswith(_GROUP_FILE_PREFIX):
        return stem[len(_GROUP_FILE_PREFIX):]
    return stem


def _developer_network(commit_groups):
    """Build an undirected graph in which every commit group forms a clique."""
    graph = nx.Graph()
    for group in commit_groups:
        graph.add_edges_from(
            (a, b) for idx, a in enumerate(group) for b in group[idx + 1:]
        )
    return graph


def _rounded(values_by_node):
    """Turn a {node: value} mapping into a list of values rounded to 2 decimals."""
    return [round(value, 2) for value in values_by_node.values()]


def _network_metrics(repo, G):
    """Return one dataframe row (a dict keyed by `columns`) for graph `G`.

    `G` must be connected and non-empty: path length, diameter, radius and
    eccentricity are only defined on a connected graph.
    """
    degrees = dict(G.degree())
    # Eccentricities are computed once and handed to diameter/radius, which
    # would otherwise each recompute them from scratch.
    eccentricities = nx.eccentricity(G)

    try:
        hubs, authorities = nx.hits(G)
    except Exception:
        # Best effort, as before: a failed HITS computation yields empty
        # columns. Unlike the former bare `except:`, KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        hubs, authorities = {}, {}

    return {
        'repo': repo,
        'no_of_nodes': len(G),
        'no_of_edges': G.number_of_edges(),
        'average_degree': sum(degrees.values()) / float(len(G)),
        'average_clustering_coefficient': nx.average_clustering(G),
        'transitivity': nx.transitivity(G),
        'assortativity_coefficient': nx.degree_pearson_correlation_coefficient(G),
        'average_path_length': nx.average_shortest_path_length(G),
        'diameter': nx.diameter(G, e=eccentricities),
        'radius': nx.radius(G, e=eccentricities),
        # per-node metrics are stored as plain lists of rounded values
        'clustering_coefficients': _rounded(nx.clustering(G)),
        'degree_of_nodes': _rounded(degrees),
        'eccentricities': _rounded(eccentricities),
        'betweenness_centralities': _rounded(nx.betweenness_centrality(G)),
        'closeness_centralities': _rounded(nx.closeness_centrality(G)),
        'eigenvector_centralities': _rounded(nx.eigenvector_centrality(G)),
        'degree_centralities': _rounded(nx.degree_centrality(G)),
        'page_rank': _rounded(nx.pagerank(G)),
        'hubs': _rounded(hubs),
        'authorities': _rounded(authorities),
    }


# sorted so that the row order does not depend on the file system
commitgroups_files = sorted(glob.glob("data/commit_groups/*.pkl"))

# get network metrics for all constructed developer social networks
df_row = []
for file in commitgroups_files:

    # NOTE: pickle.load can execute arbitrary code; only load group files
    # that were produced by the commit-group cell of this notebook.
    with open(file, "rb") as a_file:
        commit_groups = pickle.load(a_file)

    G_symmetric = _developer_network(commit_groups)
    if len(G_symmetric) == 0:
        # No group had two or more committers, so there is nothing to measure.
        print("skipping " + file + ": empty developer network")
        continue

    # restrict the analysis to the largest connected component
    largest = max(nx.connected_components(G_symmetric), key=len)
    G = G_symmetric.subgraph(largest).copy()

    df_row.append(_network_metrics(_repo_name(file), G))

# create dataframe from the collected metrics
df = pd.DataFrame(df_row, columns=columns)

# add power law exponents of the degree distributions
df['powerlaw_exponent'] = df['degree_of_nodes'].apply(lambda x: powerlaw.Fit(x).power_law.alpha)

# save dataframe
with open("data/Network_Metrics_real_dataset.pkl", "wb") as a_file:
    pickle.dump(df, a_file)
