In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before each
# cell is executed, so edits to the project's `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..', '..', '..'))

import itertools
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.data.loader import load_all_datasets
from src.utils.plots import get_simple_axis, rotate_xticklabels, make_plot
from src.utils.df_utils import series_of_list_to_one_hot

In [None]:
# Configure seaborn/matplotlib once for the whole notebook.
# `sns.set(...)` is an alias of `sns.set_theme(...)`, which always re-applies a
# style and a context (defaulting to "darkgrid" / "notebook"). Calling it after
# `set_style` / `set_context` therefore silently overrides whatever those calls
# configured. A single explicit call keeps the resulting settings identical
# while making style, context and figure size visible in one place.
sns.set_theme(style="darkgrid", context="notebook", rc={'figure.figsize': (12, 5)})

In [None]:
# Load every available dataset into one frame and report its raw size.
dataset = load_all_datasets()
print(f"Original dataset size: {len(dataset)}")


def _without_missing(libs):
    """Return one repository's requirement names with ``None`` entries removed."""
    return [lib_name for lib_name in libs if lib_name is not None]


# Strip missing (None) library names from every requirement list.
dataset['repo_requirements'] = dataset['repo_requirements'].apply(_without_missing)
# A non-empty list is truthy, so `bool` marks the repositories that still list
# at least one library after the cleanup above.
valid_data_idx = dataset['repo_requirements'].apply(bool)
# Keep only those repositories, then keep a single row per distinct 'full_name'.
dataset = dataset[valid_data_idx].drop_duplicates('full_name')
print(f"Filtered dataset size: {len(dataset)}")

In [None]:
# How many of the most frequently used libraries are kept in `top_libs` and
# shown in the library popularity plot.
N_TOP_LIBS = 100

# Basic information

In [None]:
# Report how many rows `dataset` holds after filtering (one row per distinct
# 'full_name').
print(f"Number of repositories: {len(dataset)}")

In [None]:
# Expand the per-repository requirement lists into a one-hot table whose columns
# are library names.
# NOTE(review): presumably one row per repository and a sparse-backed DataFrame
# (the `.sparse` accessor is used below) - confirm against series_of_list_to_one_hot.
libs_one_hot = series_of_list_to_one_hot(dataset, 'repo_requirements')

# `.sparse.density` is the share of explicitly stored entries in the table.
# The " 0.5f" format spec prints five decimals and reserves a leading space for
# the sign, so a positive value appears after two spaces.
print(f"Matrix density: {libs_one_hot.sparse.density: 0.5f}")

In [None]:
# List the columns available in `dataset`, one name per line.
# `str.join` requires every column label to be a string.
print('\n'.join(dataset.columns))

In [None]:
def get_lib_flatten():
    """Lazily yield every library name found in the dataset.

    Walks the ``repo_requirements`` entries of the module-level ``dataset`` in
    order and yields their items one by one, so a library required by several
    repositories is produced once per occurrence.
    """
    yield from itertools.chain.from_iterable(dataset['repo_requirements'])

# Distinct library names seen anywhere in the dataset.
all_libs = set(get_lib_flatten())

# Occurrences per library, turned into a frame with one row per library and
# ordered from the most to the least frequent one.
lib_occurrences = Counter(get_lib_flatten())
lib_counts = pd.DataFrame.from_dict(lib_occurrences, orient='index', columns=['counts'])
lib_counts = lib_counts.reset_index().rename(columns={'index': 'lib_name'})
lib_counts = lib_counts.sort_values('counts', ascending=False)
# Occurrence count relative to the number of repositories in `dataset`.
lib_counts['repo_fraction'] = lib_counts['counts'] / len(dataset)

# Names of the N_TOP_LIBS most frequent libraries, most frequent first.
top_libs = lib_counts['lib_name'].head(N_TOP_LIBS).tolist()
print(f"Total unique libs: {len(all_libs)}")

# Data exploration

## Basic statistics

In [None]:
# Count repositories per creation period ('Y' = yearly bins) and plot the counts.
grouper_freq = 'Y'
creation_period = pd.Grouper(key='created_at', freq=grouper_freq)
time_grouped_ds = dataset.groupby(creation_period, as_index=False).size()
# Reduce the period timestamps to plain dates so the tick labels carry no time part.
period_dates = time_grouped_ds['created_at'].dt.date
time_grouped_ds['created_at'] = period_dates
# ci=None: plain bars without error bars.
ax = sns.barplot(x='created_at', y='size', data=time_grouped_ds, ci=None)
# The frequency code is wrapped in $...$, so matplotlib renders it as math text.
ax.set_title(f"Number of repos per ${grouper_freq}$")
plt.show()

In [None]:
# Store the number of required libraries per repository as a new 'lib_num' column.
libs_per_repo = dataset['repo_requirements'].map(len)
dataset['lib_num'] = libs_per_repo
# Histogram of those counts, drawn on a log-scaled data axis.
ax = sns.histplot(x='lib_num', data=dataset, log_scale=True)
ax.set_title("Number of libraries per repo")
plt.show()

In [None]:
# Repository features whose distributions are plotted, one panel per column.
# NOTE(review): an earlier variant of this list also contained 'n_subscribers';
# why it was dropped is not visible from this cell.
dist_plot_cols = ['n_stars', 'n_forks', 'n_all_issues', 'n_branches', 'n_milestones_all']
fig, axes = make_plot(n_plots=len(dist_plot_cols), n_cols=3, sharex=False, sharey=False, col_width=8, ret_fig=True)
flat_axes = axes.flatten()
for col, ax in zip(dist_plot_cols, flat_axes):
    sns.histplot(dataset[col], ax=ax)
    # Log-scale the count axis; the column name goes into the title instead of
    # the x label.
    ax.set_yscale('log')
    ax.set_xlabel('')
    ax.set_title(col)

# The grid may hold more panels than there are features (five columns laid out
# three per row); hide any panel that received no plot so the figure does not
# show an empty axis. This loop is a no-op when no extra panels exist.
for unused_ax in flat_axes[len(dist_plot_cols):]:
    unused_ax.set_visible(False)

fig.suptitle("Features distributions")
fig.tight_layout()
# Render explicitly, consistent with the other plotting cells of this notebook.
plt.show()

## Libraries analysis

In [None]:
# Bar chart of the N_TOP_LIBS most frequent libraries (`lib_counts` is sorted by
# descending count), drawn on a wide axis to fit one bar per library.
most_used_libs = lib_counts.head(N_TOP_LIBS)
wide_axis = get_simple_axis(width=25)
ax = sns.barplot(x='lib_name', y='repo_fraction', data=most_used_libs, ci=None, ax=wide_axis)
rotate_xticklabels(ax)
ax.set_title("Library popularity (fraction of repos using library)")
plt.show()

In [None]:
# Number of top libraries included in the coexistence heatmap.
N_TOP_COEXISTENCE = 30
coexistence_libs = top_libs[:N_TOP_COEXISTENCE]
# One-hot columns of the selected libraries, with the row index reset to 0..n-1.
top_libs_one_hot = libs_one_hot[coexistence_libs].reset_index(drop=True)
# Transposed table times the table itself gives a library-by-library matrix.
# NOTE(review): assuming rows are repositories holding 0/1 indicators, entry
# (i, j) is the number of repositories using both libraries, and the diagonal
# holds each library's own usage count - confirm.
libs_coexistence = top_libs_one_hot.T.dot(top_libs_one_hot)
ax = sns.heatmap(libs_coexistence, ax=get_simple_axis(15, 10))
ax.set_title("Top libraries coexistence")
plt.show()