In [1]:
import pandas as pd
import numpy as np 
import requests
import os
import zipfile

# Functions

In [160]:
def month_count(dframe, column, return_column=None):
    '''This function will return a grupby object with the counts for all columns per month'''
    
    month_gb = dframe.groupby(dframe[column].dt.to_period("M")).count()
    
    if (return_column==None):
        return [month_gb[column]]
    else:
        return [month_gb[return_column]]

In [185]:
def association(df):
    '''This function will return a Pandas DataFrame within a list, of the association distribution for each month'''
    
    pr_count_month_mem = df[df['authorAssociation']=='MEMBER'].groupby(df.createdAt.dt.to_period('M'))
    pr_count_month_con = df[df['authorAssociation']=='CONTRIBUTOR'].groupby(df.createdAt.dt.to_period("M"))
    pr_count_month_non = df[df['authorAssociation']=='NONE'].groupby(df.createdAt.dt.to_period("M"))
    pr_count_month_own = df[df['authorAssociation']=='OWNER'].groupby(df.createdAt.dt.to_period("M"))
    pr_count_month_col = df[df['authorAssociation']=='COLLABORATOR'].groupby(df.createdAt.dt.to_period("M"))
    
    total = month_count(df, 'createdAt')[0]
    
    unique = df.authorAssociation.unique()
    templist = []
    
    for i in unique:
        templist.append(switch(i, pr_count_month_mem, pr_count_month_con, pr_count_month_non, pr_count_month_own, pr_count_month_col, total, unique)) 
    
    assoc_dist = pd.DataFrame(templist).T
    assoc_dist['v_count'] = pd.Series()
    assoc_dist = assoc_dist.fillna(0)
    print(assoc_dist.columns)
    
    for i in assoc_dist.columns[:-1]:
        assoc_dist['v_count'] += assoc_dist[i]
    
    
    index = assoc_dist.index.to_list()
    v_count = assoc_dist.v_count
    index_for_df = assoc_dist.index.levels[0][:]
    
    temp = pd.DataFrame({'CONTRIBUTOR':None, 'MEMBER':None, 'NONE':None, 'OWNER':None, 'COLLABORATOR':None},index=index_for_df)
    
    for i in index:
        timestamp = i[0].strftime('%Y-%m')
        assoc = i[1]
        count = v_count[timestamp][assoc]
        if (assoc=='CONTRIBUTOR'):
            temp.loc[timestamp, assoc] = count
        if (assoc=='MEMBER'):
            temp.loc[timestamp, assoc] = count
        if (assoc=='NONE'):
            temp.loc[timestamp, assoc] = count
        if (assoc=='OWNER'):
            temp.loc[timestamp, assoc] = count
        if (assoc=='COLLABORATOR'):
            temp.loc[timestamp, assoc] = count
    temp = temp.fillna(0)
        
    return temp

In [188]:
def switch(i, mem, con, non, own, col, tot, uni): 
    pr_count_month_mem = mem
    pr_count_month_non = non
    pr_count_month_con = con
    pr_count_month_own = own
    pr_count_month_col = col
    
    total = tot
    unique = uni
    
    switcher={
            'NONE': [pr_count_month_non.authorAssociation.value_counts()/total if ('NONE' in unique) else np.nan][0], 
            'MEMBER': [pr_count_month_mem.authorAssociation.value_counts()/total if ('MEMBER' in unique) else np.nan][0],
            'CONTRIBUTOR': [pr_count_month_con.authorAssociation.value_counts()/total if ('CONTRIBUTOR' in unique) else np.nan][0],
            'OWNER': [pr_count_month_own.authorAssociation.value_counts()/total if ('OWNER' in unique) else np.nan][0],
            'COLLABORATOR': [pr_count_month_col.authorAssociation.value_counts()/total if ('COLLABORATOR' in unique) else np.nan][0],
         }
    return switcher.get(i,"Invalid")


In [163]:
def duration(df, duration_column):
    '''This function will return a list of a series with the avg duration per month'''
    df = df[df['state'] != 'OPEN']
    if len(df) != 0:
        df['duration'] = df['closedAt'] - df['createdAt']
        duration = df[duration_column].apply(lambda x: x.total_seconds()).groupby(df.createdAt.dt.to_period('M'))
        avg_duration = duration.mean().apply(lambda x: pd.to_timedelta(x, unit='s'))
        return [avg_duration]
    else:
        return None

In [164]:
def repo_count_filler(file_list, column_name, repo_df, start_date, return_column=None):
    '''File list must be a list of pickles, column name will create a new column where the data will be loaded to
       return column will return a different column within the groupby count df'''
    
    repo_df[column_name] = None
    for f in file_list:
        name = f.split('_', 1)[0]
        index = repo_df[repo_df['owner'].apply(lambda x: x.find(name))==0].index
        if (len(index)==0):
            continue
        else:
            index = index[0]
            repo_df.at[index, column_name] = month_count(pd.read_pickle('./repo_files/{}'.format(f)),start_date, return_column)
#             print(name)

In [165]:
def repo_star_filler(file_list, column_name, repo_df, start_date, return_column=None):
    '''File list must be a list of pickles, column name will create a new column where the data will be loaded to
       return column will return a different column within the groupby count df'''
    
    repo_df[column_name] = None
    for f in file_list:
        name = f.split('_', 1)[0]
        index = repo_df[repo_df['owner'].apply(lambda x: x.find(name))==0].index
        if (len(index)==0):
            continue
        else:
            df = pd.read_pickle('./repo_files/{}'.format(f))
            df = df[df['starredAt'] >= '2012-12-31']
            index = index[0]
            repo_df.at[index, column_name] = month_count(df, start_date, return_column)

In [166]:
def repo_duration_filler(file_list, column_name, repo_df):
    '''File list must be a list of pickles, column name will create a new column where the data will be loaded to
       return column will return a different column within the groupby count df'''
    
    repo_df[column_name] = None
    for f in file_list:
        name = f.split('_', 1)[0]
        index = repo_df[repo_df['owner'].apply(lambda x: x.find(name))==0].index
        if (len(index)==0):
            continue
        else:
            index = index[0]
            repo_df.at[index, column_name] = duration(pd.read_pickle('./repo_files/{}'.format(f)), 'duration')

In [167]:
def repo_assoc_filler(file_list, column_name, repo_df):
    '''File list must be a list of pickles, column name will create a new column where the data will be loaded to'''
    
    repo_df[column_name] = None
    for f in file_list:
        name = f.split('_', 1)[0]
        index = repo_df[repo_df['owner'].apply(lambda x: x.find(name))==0].index
        if (len(index)==0):
            continue
        else:
            df = pd.read_pickle('./repo_files/{}'.format(f))
            df = df.reset_index().drop(columns='index')
            index = index[0]
            repo_df.at[index, column_name] = association(df)

---

# Load all 200 repos

In [72]:
# Load 200 repos
repo_200 = pd.read_pickle('df_repo_5000.pk1')
repo_200 = repo_200[:200]

In [46]:
# Find the path of the files
path = ['../../../Files/more_repos/cleaned_0-99/', '../../../Files/more_repos/cleaned_100-199/']
path_2 = [i for i in os.listdir(path[0])]

# Save all repo's names
repo_names = []

# Loop through paths to append the repo's names to `repo_names` list
for i in range(len(path)):
    for j in range(len(path_2)):
        for k in os.listdir(path[i]+path_2[j]):
            repo_names.append(k)

In [84]:
# List comprehension of all 4 
cm_names = [i for i in repo_names if '_cm.pk1' in i]
is_names = [i for i in repo_names if '_is.pk1' in i]
pr_names = [i for i in repo_names if '_pr.pk1' in i]
st_names = [i for i in repo_names if '_st.pk1' in i]

### Update repo with: count, duration, and association

In [156]:
# Updates the repo by adding a new column and its value is a seris of new column's by month
repo_count_filler(cm_names, 'commit_count', repo_200, 'committedDate')
repo_count_filler(is_names, 'issue_count', repo_200, 'createdAt')
repo_count_filler(pr_names, 'pull_request_count', repo_200, 'createdAt')
repo_star_filler(st_names, 'stars_count', repo_200, 'starredAt')



In [149]:
# Update the repo by adding a new duration column of `pr` and `is` by month
repo_duration_filler(pr_names, 'pr_duration', repo_200)
repo_duration_filler(is_names, 'is_duration', repo_200)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-cop

In [189]:
# Update the repo by add assoc of `pr` and `is` by month
repo_assoc_filler(pr_names, 'pr_assoc', repo_200)
repo_assoc_filler(is_names, 'is_assoc', repo_200)



Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')




Index([0, 'v_count'], dtype='object')
Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 3, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




Index([0, 1, 'v_count'], dtype='object')
Index([0, 1, 2, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')
Index([0, 'v_count'], dtype='object')




### Save df as a pickle file

In [191]:
repo_200.to_pickle('repo_200.pk1')

---

In [258]:
df = pd.read_pickle('repo_200.pk1')
df

Unnamed: 0,createdAt,description,totalForks,hasIssuesEnabled,hasWikiEnabled,id,totalIssues,licenseInfo,name,nameWithOwner,...,totalCommits,totalStargazers,updatedAt,commit_count,issue_count,pull_request_count,stars_count,pr_duration,pr_assoc,is_assoc
0,2016-08-21 05:31:51+00:00,An android process bar library associated with...,37,True,True,MDEwOlJlcG9zaXRvcnk2NjE4Mjg1MA==,0,,android_ProcessBar,hzw1199/android_ProcessBar,...,10.0,199,2019-08-28 06:14:45+00:00,"[[9, 1]]",,,"[[143, 16, 3, 6, 2, 2, 1, 2, 3, 3, 1, 2, 1, 1,...",,,
1,2015-10-20 18:22:34+00:00,A ScrollView component that handles keyboard a...,377,True,True,MDEwOlJlcG9zaXRvcnk0NDYyNjI1MA==,272,MIT,react-native-keyboard-aware-scroll-view,APSL/react-native-keyboard-aware-scroll-view,...,166.0,3118,2019-09-09 21:17:49+00:00,"[[2, 6, 2, 1, 4, 7, 9, 1, 5, 2, 6, 4, 3, 7, 4,...","[[2, 3, 2, 10, 14, 6, 6, 5, 3, 5, 5, 5, 7, 3, ...","[[1, 1, 3, 6, 2, 5, 3, 4, 2, 4, 4, 2, 2, 5, 5,...","[[1, 17, 2, 7, 5, 6, 16, 13, 27, 30, 39, 38, 4...","[[11 days 02:17:31, 521 days 17:08:22, 2 days ...",CONTRIBUTOR MEMBER NONE OW...,CONTRIBUTOR MEMBER NONE OW...
2,2015-11-24 12:43:44+00:00,Sample app to demonstrate multidex,51,True,True,MDEwOlJlcG9zaXRvcnk0Njc5MTAyMg==,1,,multidex-sample,mmadev/multidex-sample,...,2.0,118,2019-03-22 01:44:33+00:00,[[2]],[[1]],,"[[27, 20, 8, 1, 5, 3, 4, 5, 2, 4, 4, 1, 1, 3, ...",,,CONTRIBUTOR MEMBER NONE OWNER C...
3,2015-10-05 09:34:22+00:00,multiNetX is a python package for the manipula...,31,True,True,MDEwOlJlcG9zaXRvcnk0MzY3NTc1OQ==,6,,multinetx,nkoub/multinetx,...,70.0,104,2019-08-22 15:04:41+00:00,"[[41, 12, 3, 3, 2, 1, 3, 5]]","[[1, 1, 1, 1, 1, 1]]","[[1, 2]]","[[1, 40, 4, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1...","[[4 days 04:09:04, 0 days 03:50:42]]",CONTRIBUTOR MEMBER NONE OWNER C...,CONTRIBUTOR MEMBER NONE OWNER C...
4,2013-07-18 14:39:58+00:00,Painlessly create beautiful matplotlib plots.,140,True,True,MDEwOlJlcG9zaXRvcnkxMTUwNTIxOA==,64,MIT,prettyplotlib,olgabot/prettyplotlib,...,252.0,1482,2019-09-09 13:16:14+00:00,"[[3, 70, 8, 18, 3, 27, 35, 12, 21, 12, 15, 17,...","[[1, 5, 1, 3, 10, 4, 5, 4, 2, 9, 4, 4, 1, 1, 1...","[[1, 4, 3, 8, 2, 1, 3, 5, 4, 3, 1, 1, 1]]",,"[[1 days 12:00:22, 54 days 08:35:33.750000, 1 ...",CONTRIBUTOR MEMBER NONE OWNE...,CONTRIBUTOR MEMBER NONE O...
5,2010-08-15 22:59:33+00:00,"Github Repository Finder, now powered by GitHu...",8,True,True,MDEwOlJlcG9zaXRvcnk4Mzk5NTQ=,15,,GithubFinder,sr3d/GithubFinder,...,128.0,126,2019-07-14 10:02:17+00:00,"[[106, 6, 2, 1, 1, 1, 7, 4]]","[[7, 2, 2, 3, 1]]",[[2]],"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",[[0 days 04:11:59]],CONTRIBUTOR MEMBER NONE OWNER C...,CONTRIBUTOR MEMBER NONE O...
6,2016-04-01 13:53:58+00:00,🍡 LeetCode Online Judge刷题题解(Java/C++/Python/Ru...,113,True,True,MDEwOlJlcG9zaXRvcnk1NTIzNjA1Ng==,6,,LeetCode,liuchuo/LeetCode,...,354.0,327,2019-09-09 03:50:24+00:00,"[[72, 21, 1, 4, 47, 65, 25, 1, 8, 15, 16, 21, ...","[[1, 1, 1, 1, 2]]","[[1, 1]]","[[2, 1, 1, 2, 1, 5, 4, 5, 1, 5, 3, 7, 2, 5, 4,...",,CONTRIBUTOR MEMBER NONE OWNER C...,CONTRIBUTOR MEMBER NONE OWNER C...
7,2012-03-20 16:17:18+00:00,【鼠鬚管】Rime for macOS,263,True,True,MDEwOlJlcG9zaXRvcnkzNzc3MjEw,316,GPL-3.0,squirrel,rime/squirrel,...,389.0,2269,2019-09-09 18:56:21+00:00,"[[12, 8, 15, 10, 11, 2, 2, 12, 7, 3, 10, 42, 1...","[[1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 3, 2,...","[[2, 1, 5, 1, 2, 2, 1, 1, 3, 1, 1, 1, 1, 2, 1,...","[[2, 7, 6, 11, 5, 7, 5, 3, 4, 5, 9, 8, 5, 7, 5...","[[1 days 01:34:17, 0 days 23:41:26, 0 days 06:...",CONTRIBUTOR MEMBER NONE OWNE...,CONTRIBUTOR MEMBER NONE OW...
8,2018-12-01 21:10:44+00:00,Reversible Reproducible Documents,23,True,True,MDEwOlJlcG9zaXRvcnkxNTk5ODc1NjA=,43,NOASSERTION,redoc,noamross/redoc,...,103.0,391,2019-09-09 09:52:39+00:00,"[[51, 42, 6, 2, 2]]","[[23, 13, 2, 3, 2]]","[[10, 1]]","[[192, 5, 5, 4, 148, 20, 10, 4, 3]]","[[4 days 05:31:00.400000, 5 days 09:00:57]]",CONTRIBUTOR MEMBER NONE OWNER C...,CONTRIBUTOR MEMBER NONE O...
9,2019-02-07 03:27:12+00:00,Provable adversarial robustness at ImageNet scale,12,True,True,MDEwOlJlcG9zaXRvcnkxNjk1MTM4MzA=,0,,smoothing,locuslab/smoothing,...,5.0,83,2019-09-08 08:08:57+00:00,"[[3, 1, 1]]",,"[[2, 1, 1]]","[[30, 2, 2, 14, 15, 12, 5, 3]]","[[0 days 00:00:21.500000, 0 days 00:00:07, 1 d...",CONTRIBUTOR MEMBER NONE OWNER C...,


# Calculate Dyanmic Time Warping(DTW)

In [245]:
# Index by repo names

owner_names = df['nameWithOwner'].str.split('/').str.join('-').tolist()

In [246]:
owner_names

['hzw1199-android_ProcessBar',
 'APSL-react-native-keyboard-aware-scroll-view',
 'mmadev-multidex-sample',
 'nkoub-multinetx',
 'olgabot-prettyplotlib',
 'sr3d-GithubFinder',
 'liuchuo-LeetCode',
 'rime-squirrel',
 'noamross-redoc',
 'locuslab-smoothing',
 'facebookarchive-grace',
 'SharpRepository-SharpRepository',
 'soulmachine-leetcode',
 'sijuv-protobuf-codec',
 'rajneshrat-ratos',
 'developit-preact-compat',
 'Azure-Samples-raspberry-pi-web-simulator',
 'kentnguyen-KNPathTableViewController',
 'P01son6415-MatchModels',
 'apg-django-favorites',
 'GcsSloop-MacDeveloper',
 'segment-boneyard-analytics.js-integrations',
 'ttscoff-Slogger',
 'aminography-PrimeDatePicker',
 'shery-awesome-atom-packages',
 'jimweirich-sicp-study',
 'chrismattmann-tika-similarity',
 'standard-standard',
 'venmo-VENTouchLock',
 'aurelia-ui-toolkits-aurelia-materialize-bridge',
 'reggie1996-FaceDetect',
 'rails-jquery-rails',
 'sergev-LiteBSD',
 'TailorDev-monod',
 'retest-recheck-web',
 'edlich-nosql-databa

In [222]:
dtw.rename(index=dtw)

hzw1199-android_ProcessBar                                                  hzw1199-android_ProcessBar
APSL-react-native-keyboard-aware-scroll-view              APSL-react-native-keyboard-aware-scroll-view
mmadev-multidex-sample                                                          mmadev-multidex-sample
nkoub-multinetx                                                                        nkoub-multinetx
olgabot-prettyplotlib                                                            olgabot-prettyplotlib
sr3d-GithubFinder                                                                    sr3d-GithubFinder
liuchuo-LeetCode                                                                      liuchuo-LeetCode
rime-squirrel                                                                            rime-squirrel
noamross-redoc                                                                          noamross-redoc
locuslab-smoothing                                                       

In [269]:
df.head()

Unnamed: 0,createdAt,description,totalForks,hasIssuesEnabled,hasWikiEnabled,id,totalIssues,licenseInfo,name,nameWithOwner,...,totalCommits,totalStargazers,updatedAt,commit_count,issue_count,pull_request_count,stars_count,pr_duration,pr_assoc,is_assoc
0,2016-08-21 05:31:51+00:00,An android process bar library associated with...,37,True,True,MDEwOlJlcG9zaXRvcnk2NjE4Mjg1MA==,0,,android_ProcessBar,hzw1199/android_ProcessBar,...,10.0,199,2019-08-28 06:14:45+00:00,"[[9, 1]]",,,"[[143, 16, 3, 6, 2, 2, 1, 2, 3, 3, 1, 2, 1, 1,...",,,
1,2015-10-20 18:22:34+00:00,A ScrollView component that handles keyboard a...,377,True,True,MDEwOlJlcG9zaXRvcnk0NDYyNjI1MA==,272,MIT,react-native-keyboard-aware-scroll-view,APSL/react-native-keyboard-aware-scroll-view,...,166.0,3118,2019-09-09 21:17:49+00:00,"[[2, 6, 2, 1, 4, 7, 9, 1, 5, 2, 6, 4, 3, 7, 4,...","[[2, 3, 2, 10, 14, 6, 6, 5, 3, 5, 5, 5, 7, 3, ...","[[1, 1, 3, 6, 2, 5, 3, 4, 2, 4, 4, 2, 2, 5, 5,...","[[1, 17, 2, 7, 5, 6, 16, 13, 27, 30, 39, 38, 4...","[[11 days 02:17:31, 521 days 17:08:22, 2 days ...",CONTRIBUTOR MEMBER NONE OW...,CONTRIBUTOR MEMBER NONE OW...
2,2015-11-24 12:43:44+00:00,Sample app to demonstrate multidex,51,True,True,MDEwOlJlcG9zaXRvcnk0Njc5MTAyMg==,1,,multidex-sample,mmadev/multidex-sample,...,2.0,118,2019-03-22 01:44:33+00:00,[[2]],[[1]],,"[[27, 20, 8, 1, 5, 3, 4, 5, 2, 4, 4, 1, 1, 3, ...",,,CONTRIBUTOR MEMBER NONE OWNER C...
3,2015-10-05 09:34:22+00:00,multiNetX is a python package for the manipula...,31,True,True,MDEwOlJlcG9zaXRvcnk0MzY3NTc1OQ==,6,,multinetx,nkoub/multinetx,...,70.0,104,2019-08-22 15:04:41+00:00,"[[41, 12, 3, 3, 2, 1, 3, 5]]","[[1, 1, 1, 1, 1, 1]]","[[1, 2]]","[[1, 40, 4, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1...","[[4 days 04:09:04, 0 days 03:50:42]]",CONTRIBUTOR MEMBER NONE OWNER C...,CONTRIBUTOR MEMBER NONE OWNER C...
4,2013-07-18 14:39:58+00:00,Painlessly create beautiful matplotlib plots.,140,True,True,MDEwOlJlcG9zaXRvcnkxMTUwNTIxOA==,64,MIT,prettyplotlib,olgabot/prettyplotlib,...,252.0,1482,2019-09-09 13:16:14+00:00,"[[3, 70, 8, 18, 3, 27, 35, 12, 21, 12, 15, 17,...","[[1, 5, 1, 3, 10, 4, 5, 4, 2, 9, 4, 4, 1, 1, 1...","[[1, 4, 3, 8, 2, 1, 3, 5, 4, 3, 1, 1, 1]]",,"[[1 days 12:00:22, 54 days 08:35:33.750000, 1 ...",CONTRIBUTOR MEMBER NONE OWNE...,CONTRIBUTOR MEMBER NONE O...


In [234]:
from tslearn.metrics import dtw_path
from sklearn.preprocessing import normalize

In [283]:
def calc_dtw(num):
    dtw = []
    path = []
    for el in range(len(df)):
        if df['commit_count'][el] != None:
            print(owner_names[el], el)
            dtw_pathing = dtw_path(normalize(df['commit_count'][num])[0], normalize(df['commit_count'][el])[0])
            dtw.append({owner_names[el]:{owner_names[el]:dtw_pathing[1]}})
            path.append({owner_names[el]:{owner_names[el]:dtw_pathing[0]}})
        else:
            dtw.append({owner_names[el]:{owner_names[el]:np.nan}})
            path.append({owner_names[el]:{owner_names[el]:np.nan}})
    return dtw[0], path[0]

In [284]:
dtw_list = []
path_list = []
for i in range(len(df)):
    dtw, path = calc_dtw(i)
    dtw_list.append(dtw)
    path_list.append(path)

hzw1199-android_ProcessBar 0
APSL-react-native-keyboard-aware-scroll-view 1
mmadev-multidex-sample 2
nkoub-multinetx 3
olgabot-prettyplotlib 4
sr3d-GithubFinder 5
liuchuo-LeetCode 6
rime-squirrel 7
noamross-redoc 8
locuslab-smoothing 9
facebookarchive-grace 10
SharpRepository-SharpRepository 11
soulmachine-leetcode 12
sijuv-protobuf-codec 13
rajneshrat-ratos 14
developit-preact-compat 15
Azure-Samples-raspberry-pi-web-simulator 16
kentnguyen-KNPathTableViewController 17
P01son6415-MatchModels 18
apg-django-favorites 19
GcsSloop-MacDeveloper 20
segment-boneyard-analytics.js-integrations 21
ttscoff-Slogger 22
aminography-PrimeDatePicker 23
shery-awesome-atom-packages 24
jimweirich-sicp-study 25
chrismattmann-tika-similarity 26
standard-standard 27
venmo-VENTouchLock 28
aurelia-ui-toolkits-aurelia-materialize-bridge 29
reggie1996-FaceDetect 30
rails-jquery-rails 31
sergev-LiteBSD 32
TailorDev-monod 33
retest-recheck-web 34
edlich-nosql-database.org 35
dotnet-architecture-eShopOnContainers

ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
calc_dtw(1)

In [278]:
dtw, path = calc_dtw(0)
path

{'hzw1199-android_ProcessBar': {'hzw1199-android_ProcessBar': [(0, 0),
   (1, 1)]}}