# 0 Importing Packages

In [19]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 
# NB. uncomment the line above first time you run this cell
%autoreload 2

from pathlib import Path
import pandas as pd
import networkx as nx

from resources.network_functions import GraphConstructor, calculate_weighted_density

# Set max row view for pandas to 100
pd.set_option('display.max_rows', 100)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0.1 File Paths

In [20]:
fp_main = Path('/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_market_device')
fp_main_output = Path(fp_main / 'output')   

# 1 Load in edgelist data

In [21]:
# Read in the data
all_edges_user_level = pd.read_parquet(fp_main_output / 'all_edges_user_level.gzip.parquet')
attention_edges_user_level = pd.read_parquet(fp_main_output / 'attention_edges_user_level.gzip.parquet')
collaboration_edges_user_level = pd.read_parquet(fp_main_output / 'collaboration_edges_user_level.gzip.parquet')

In [24]:
## 1.1 Construct the graphs
gc_attention = GraphConstructor(attention_edges_user_level, graph_type='attention')
attention_graph = gc_attention.get_graph()

gc_collaboration = GraphConstructor(collaboration_edges_user_level, graph_type='collaboration')
collaboration_graph = gc_collaboration.get_graph()

# 2 Collaboration Network Stats

## 2.1 General network statistics

In [None]:
# User level
no_users = len(pd.unique(collaboration_edges_user_level[['src', 'target']].values.ravel()))
no_unique_inter_user_to_user = collaboration_edges_user_level[collaboration_edges_user_level['d_inter_level'] == 1][['src', 'target']].drop_duplicates().shape[0]
no_unique_intra_user_to_user = collaboration_edges_user_level[collaboration_edges_user_level['d_intra_level'] == 1][['src', 'target']].drop_duplicates().shape[0]

# Company level
no_companies = len(set(collaboration_edges_user_level["src_company"]).union(collaboration_edges_user_level["target_company"]))
no_inter_company_edges_directed = len([(u,v) for u, v, d in collaboration_graph.edges(data=True) if d.get("d_inter_level") == 1])

# Total weight of inter-company edges (user-level, directed)
no_inter_gh = collaboration_edges_user_level[collaboration_edges_user_level['d_inter_level'] == 1].shape[0]

# Total weight of self-loop edges (src_company == tgt_company)z
no_intra_gh = collaboration_edges_user_level[collaboration_edges_user_level['d_intra_level'] == 1].shape[0]

# Calculate the weighted density of the collaboration graph
weighted_density_collaboration = calculate_weighted_density(collaboration_graph)

print(f"No. of users: {no_users}")
print(f"No. of companies: {no_companies}")
print(f"Inter-company GH actions: {no_inter_gh}")
print(f"Intra-company GH actions: {no_intra_gh}")
print(f"Unique inter-company edges (directed): {no_inter_company_edges_directed}")
print(f"Unique directed user-to-user edges (inter): {no_unique_inter_user_to_user}")
print(f"Unique directed user-to-user edges (intra): {no_unique_intra_user_to_user}")
print(f"Weighted density: {weighted_density_collaboration}")

No. of users: 70
No. of companies: 20
Unique inter-company, user-to-user edges (directed): 6
Unique intra-company, user-to-user edges (directed): 59
Inter-company GH actions: 8
Intra-company GH actions: 98
Unique inter-company edges (directed): 6
Weighted density: 0.015789473684210527


## 2.2 Describing GitHub user's actions on company-level (incl. intra-company level)
- E.g. if *X*-user (working for company *Z*) has starred three repo's owned by *Y*-user (working for company *W*) this will give add three to the count of out-going edges for company *Z*, and three in-going edges for *W*. 

In [7]:
# Count outgoing and incoming edges and exclude self-loops
outgoing_counts = collaboration_edges_user_level.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = collaboration_edges_user_level.groupby('target_company').size().reset_index(name='incoming_edges')

# Merge counts to get a single table
company_stats = pd.merge(outgoing_counts, incoming_counts, left_on='src_company', right_on='target_company', how='outer')

# Fill NaN values (in case some companies only have incoming or outgoing edges)
company_stats = company_stats.fillna(0)

# Rename columns for clarity
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']]
company_stats.columns = ['Company', 'Outgoing Edges', 'Incoming Edges']

# Sort by total edges
company_stats['Total Edges'] = company_stats['Outgoing Edges'] + company_stats['Incoming Edges']
company_stats = company_stats.sort_values('Total Edges', ascending=False)

# Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts", label="tab:company_edges", column_format="lcc")

# Print the table
# print(latex_table)

# Display the company statistics DataFrame
company_stats


Unnamed: 0,Company,Outgoing Edges,Incoming Edges,Total Edges
16,trifork,39.0,39,78.0
11,shape,10.0,10,20.0
8,netcompany,10.0,8,18.0
1,delegateas,6.0,7,13.0
0,abtion,6.0,6,12.0
12,signifly,5.0,5,10.0
2,eg a s,4.0,4,8.0
9,nuuday,4.0,4,8.0
6,miracle,4.0,2,6.0
7,must,3.0,3,6.0


## 2.3 Describing GitHub user's actions on company-level (only inter-company level)

In [8]:
# Exclude self-loops
company_edges = collaboration_edges_user_level[collaboration_edges_user_level['src_company'] != collaboration_edges_user_level['target_company']]

# Count outgoing and incoming edges and exclude self-loops
outgoing_counts = company_edges.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = company_edges.groupby('target_company').size().reset_index(name='incoming_edges')

# Merge counts to get a single table
company_stats = pd.merge(outgoing_counts, incoming_counts, left_on='src_company', right_on='target_company', how='outer')

# Fill NaN values (in case some companies only have incoming or outgoing edges)
company_stats = company_stats.fillna(0)

# Rename columns for clarity
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']]
company_stats.columns = ['Company', 'Outgoing Edges', 'Incoming Edges']

# Sort by total edges
company_stats['Total Edges'] = company_stats['Outgoing Edges'] + company_stats['Incoming Edges']
company_stats = company_stats.sort_values('Total Edges', ascending=False)

# Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts", label="tab:company_edges", column_format="lcc")

# Print the table
# print(latex_table)

# Display the company statistics DataFrame
company_stats


Unnamed: 0,Company,Outgoing Edges,Incoming Edges,Total Edges
4,trifork,2.0,2.0,4.0
0,delegateas,1.0,2.0,3.0
1,miracle,2.0,0.0,2.0
2,netcompany,2.0,0.0,2.0
7,0,0.0,2.0,2.0
3,siteimprove,1.0,0.0,1.0
5,0,0.0,1.0,1.0
6,0,0.0,1.0,1.0


## 2.4 Adjadency matrix collaboration (only inter-level conenctions)

In [9]:
# Filter the attention edges to exclude self-loops
mask = collaboration_edges_user_level['src_company'] != collaboration_edges_user_level['target_company']
collaboration_edges_user_level_inter = collaboration_edges_user_level[mask].copy()

# Get unique labels from source and target company labels
labels = pd.Index(
    collaboration_edges_user_level_inter['src_company_label'].tolist() +
    collaboration_edges_user_level_inter['target_company_label'].tolist()
).unique()

# Group by source and target labels, counting occurrences
grouped = collaboration_edges_user_level_inter.groupby(
    ['src_company_label', 'target_company_label']
).size().reset_index(name='count')

# Pivot into adjacency matrix
matrix = grouped.pivot(
    index='src_company_label',
    columns='target_company_label',
    values='count'
).fillna(0)

# Reindex matrix rows and columns to labels and sort
matrix = matrix.reindex(labels, axis=0).reindex(labels, axis=1).sort_index(axis=0).sort_index(axis=1)
matrix = matrix.fillna(0).astype(int)

# MultiIndex columns for clarity
matrix.columns = pd.MultiIndex.from_product([['Target'], matrix.columns])
matrix.index.name = 'Source'

# Total connections count (excluding self-loops)
connections_total = matrix.sum().sum()
print(f"Total connections (excluding self-loops): {connections_total}")

# Extract unique companies and categories from the filtered edges
src_df = collaboration_edges_user_level_inter[['src_company', 'src_company_category']].rename(
    columns={'src_company': 'company', 'src_company_category': 'category'}
)
target_df = collaboration_edges_user_level_inter[['target_company', 'target_company_category']].rename(
    columns={'target_company': 'company', 'target_company_category': 'category'}
)

# Concatenate, drop duplicates to get unique companies with categories
all_companies_with_categories = pd.concat([src_df, target_df]).drop_duplicates()

# Count companies per category
category_counts = all_companies_with_categories['category'].value_counts().sort_index()

print("\nNumber of unique companies per category in the matrix:")
print(category_counts)

# Make latex table from the matrix
latex_matrix = matrix.to_latex(
    caption="Collaboration Edges Matrix (Excluding Self-Loops)",
    label="tab:collaboration_matrix",
    column_format="l" + "c" * (len(matrix.columns)),
    index_names=False,
    float_format="%.0f"
)
# Print the LaTeX table
# print(latex_matrix)

# Show matrix
matrix

Total connections (excluding self-loops): 8

Number of unique companies per category in the matrix:
category
1    5
2    1
3    1
4    1
Name: count, dtype: int64


Unnamed: 0_level_0,Target,Target,Target,Target
Unnamed: 0_level_1,1 Digital and marketing consultancies,2 Bespoke app companies,3 Data-broker- and infrastructure companies,4 Companies with specific digital part/app as part of service/product
Source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1 Digital and marketing consultancies,3,0,0,2
2 Bespoke app companies,2,0,0,0
3 Data-broker- and infrastructure companies,1,0,0,0
4 Companies with specific digital part/app as part of service/product,0,0,0,0


## 2.5 Adjadency matrix (all connections, including intra)

In [10]:
labels = pd.Index(collaboration_edges_user_level['src_company_label'].tolist() + collaboration_edges_user_level['target_company_label'].to_list()).unique()

grouped = collaboration_edges_user_level.groupby(['src_company_label', 'target_company_label']).size().reset_index(name='count')

matrix = grouped.pivot(index='src_company_label', columns='target_company_label', values='count')

matrix = matrix.reindex(labels, axis=0).reindex(labels, axis=1).sort_index(axis=0).sort_index(axis=1)

matrix = matrix.fillna(0).astype(int)

matrix.columns = pd.MultiIndex.from_product([['Target'], matrix.columns])
matrix.index.name = 'Source'

# Connections in total
connections_total = matrix.sum().sum()
print(f"Total connections (including self-loops): {connections_total}")

matrix

Total connections (including self-loops): 106


Unnamed: 0_level_0,Target,Target,Target,Target
Unnamed: 0_level_1,1 Digital and marketing consultancies,2 Bespoke app companies,3 Data-broker- and infrastructure companies,4 Companies with specific digital part/app as part of service/product
Source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1 Digital and marketing consultancies,87,0,0,2
2 Bespoke app companies,2,2,0,0
3 Data-broker- and infrastructure companies,1,0,2,0
4 Companies with specific digital part/app as part of service/product,0,0,0,10


## 3 Attention Network Stats

## 3.1 General network statistics

In [None]:
# User level
no_users = len(pd.unique(attention_edges_user_level[['src', 'target']].values.ravel()))
no_unique_inter_user_to_user = attention_edges_user_level[attention_edges_user_level['d_inter_level'] == 1][['src', 'target']].drop_duplicates().shape[0]
no_unique_intra_user_to_user = attention_edges_user_level[attention_edges_user_level['d_intra_level'] == 1][['src', 'target']].drop_duplicates().shape[0]

# Company level
no_companies = len(set(attention_edges_user_level["src_company"]).union(attention_edges_user_level["target_company"]))
no_inter_company_edges_directed = len([(u,v) for u, v, d in attention_graph.edges(data=True) if d.get("d_inter_level") == 1])

# Total weight of inter-company edges (user-level, directed)
no_inter_gh = attention_edges_user_level[attention_edges_user_level['d_inter_level'] == 1].shape[0]

# Total weight of self-loop edges (src_company == tgt_company)z
no_intra_gh = attention_edges_user_level[attention_edges_user_level['d_intra_level'] == 1].shape[0]

# Calculate the weighted density of the collaboration graph
weighted_density_collaboration = calculate_weighted_density(attention_graph)

print(f"No. of users: {no_users}")
print(f"No. of companies: {no_companies}")
print(f"Inter-company GH actions: {no_inter_gh}")
print(f"Intra-company GH actions: {no_intra_gh}")
print(f"Unique inter-company edges (directed): {no_inter_company_edges_directed}")
print(f"Unique inter-company, user-to-user edges (directed): {no_unique_inter_user_to_user}")
print(f"Unique intra-company, user-to-user edges (directed): {no_unique_intra_user_to_user}")
print(f"Weighted density: {weighted_density_collaboration}")

No. of users: 357
No. of companies: 41
Unique inter-company, user-to-user edges (directed): 94
Unique intra-company, user-to-user edges (directed): 500
Inter-company GH actions: 306
Intra-company GH actions: 4321
Unique inter-company edges (directed): 53
Weighted density: 0.05731707317073171


## 3.2 Describing GitHub user's actions on company-level (incl. intra-company level)
- E.g. if *X*-user (working for company *Z*) has starred three repo's owned by *Y*-user (working for company *W*) this will give add three to the count of out-going edges for company *Z*, and three in-going edges for *W*. 

In [12]:
# Count outgoing and incoming edges and exclude self-loops
outgoing_counts = attention_edges_user_level.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = attention_edges_user_level.groupby('target_company').size().reset_index(name='incoming_edges')

# Merge counts to get a single table
company_stats = pd.merge(outgoing_counts, incoming_counts, left_on='src_company', right_on='target_company', how='outer')

# Fill NaN values (in case some companies only have incoming or outgoing edges)
company_stats = company_stats.fillna(0)

# Rename columns for clarity
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']]
company_stats.columns = ['Company', 'Outgoing Edges', 'Incoming Edges']

# Sort by total edges
company_stats['Total Edges'] = company_stats['Outgoing Edges'] + company_stats['Incoming Edges']
company_stats = company_stats.sort_values('Total Edges', ascending=False)

# Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts", label="tab:company_edges", column_format="lcc")

# Print the table
# print(latex_table)

# Display the company statis DataFrame
company_stats


Unnamed: 0,Company,Outgoing Edges,Incoming Edges,Total Edges
34,trifork,1197.0,1197.0,2394.0
21,netcompany,362.0,301.0,663.0
28,shape,299.0,294.0,593.0
24,pentia,194.0,273.0,467.0
15,jobindex,232.0,231.0,463.0
19,miracle,230.0,218.0,448.0
33,systematic,219.0,220.0,439.0
29,signifly,176.0,180.0,356.0
35,tv2,177.0,174.0,351.0
30,siteimprove,167.0,170.0,337.0


## 3.3 Describing GitHub user's actions on company-level (only inter-company level)

In [13]:
# Exclude self-loops
company_edges = attention_edges_user_level[attention_edges_user_level['src_company'] != attention_edges_user_level['target_company']]

# Count outgoing and incoming edges and exclude self-loops
outgoing_counts = company_edges.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = company_edges.groupby('target_company').size().reset_index(name='incoming_edges')

# Merge counts to get a single table
company_stats = pd.merge(outgoing_counts, incoming_counts, left_on='src_company', right_on='target_company', how='outer')

# Fill NaN values (in case some companies only have incoming or outgoing edges)
company_stats = company_stats.fillna(0)

# Rename columns for clarity
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']]
company_stats.columns = ['Company', 'Outgoing Edges', 'Incoming Edges']

# Sort by total edges
company_stats['Total Edges'] = company_stats['Outgoing Edges'] + company_stats['Incoming Edges']
company_stats = company_stats.sort_values('Total Edges', ascending=False)

# Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts", label="tab:company_edges", column_format="lcc")

# Print the table
# print(latex_table)

# Display the company statistics DataFrame
company_stats


Unnamed: 0,Company,Outgoing Edges,Incoming Edges,Total Edges
17,pentia,2.0,81.0,83.0
11,knowit,39.0,31.0,70.0
14,netcompany,63.0,2.0,65.0
13,miracle,36.0,24.0,60.0
8,house of code,58.0,1.0,59.0
30,0,0.0,52.0,52.0
9,jobindex,23.0,22.0,45.0
2,cbrain,20.0,4.0,24.0
15,nuuday,7.0,16.0,23.0
6,deondigital,4.0,13.0,17.0


## 3.4 Adjadency matrix (only inter-level conenctions)

In [14]:
# Filter the attention edges to exclude self-loops
mask = attention_edges_user_level['src_company'] != attention_edges_user_level['target_company']
attention_edges_user_level_inter = attention_edges_user_level[mask].copy()

# Get unique labels from source and target company labels
labels = pd.Index(
    attention_edges_user_level_inter['src_company_label'].tolist() +
    attention_edges_user_level_inter['target_company_label'].tolist()
).unique()

# Group by source and target labels, counting occurrences
grouped = attention_edges_user_level_inter.groupby(
    ['src_company_label', 'target_company_label']
).size().reset_index(name='count')

# Pivot into adjacency matrix
matrix = grouped.pivot(
    index='src_company_label',
    columns='target_company_label',
    values='count'
).fillna(0)

# Reindex matrix rows and columns to labels and sort
matrix = matrix.reindex(labels, axis=0).reindex(labels, axis=1).sort_index(axis=0).sort_index(axis=1)
matrix = matrix.fillna(0).astype(int)

# MultiIndex columns for clarity
matrix.columns = pd.MultiIndex.from_product([['Target'], matrix.columns])
matrix.index.name = 'Source'

# Total connections count (excluding self-loops)
connections_total = matrix.sum().sum()
print(f"Total connections (excluding self-loops): {connections_total}")

# Extract unique companies and categories from the filtered edges
src_df = attention_edges_user_level_inter[['src_company', 'src_company_category']].rename(
    columns={'src_company': 'company', 'src_company_category': 'category'}
)
target_df = attention_edges_user_level_inter[['target_company', 'target_company_category']].rename(
    columns={'target_company': 'company', 'target_company_category': 'category'}
)

# Concatenate, drop duplicates to get unique companies with categories
all_companies_with_categories = pd.concat([src_df, target_df]).drop_duplicates()

# Count companies per category
category_counts = all_companies_with_categories['category'].value_counts().sort_index()

print("\nNumber of unique companies per category in the matrix:")
print(category_counts)
print("\n")

# Make latex table from the matrix
latex_matrix = matrix.to_latex(
    caption="Collaboration Edges Matrix (Excluding Self-Loops)",
    label="tab:collaboration_matrix",
    column_format="l" + "c" * (len(matrix.columns)),
    index_names=False,
    float_format="%.0f"
)
# Print the LaTeX table
# print(latex_matrix)

# Show matrix
matrix

Total connections (excluding self-loops): 306

Number of unique companies per category in the matrix:
category
1    20
2     3
3     2
4     8
Name: count, dtype: int64




Unnamed: 0_level_0,Target,Target,Target,Target
Unnamed: 0_level_1,1 Digital and marketing consultancies,2 Bespoke app companies,3 Data-broker- and infrastructure companies,4 Companies with specific digital part/app as part of service/product
Source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1 Digital and marketing consultancies,58,24,6,74
2 Bespoke app companies,88,0,0,8
3 Data-broker- and infrastructure companies,3,0,0,4
4 Companies with specific digital part/app as part of service/product,15,1,13,12


## 3.5 Adjadency matrix (all connections, including intra-level)

In [15]:
labels = pd.Index(attention_edges_user_level['src_company_label'].tolist() + attention_edges_user_level['target_company_label'].to_list()).unique()

grouped = attention_edges_user_level.groupby(['src_company_label', 'target_company_label']).size().reset_index(name='count')

matrix = grouped.pivot(index='src_company_label', columns='target_company_label', values='count').fillna(0)

matrix = matrix.reindex(labels, axis=0).reindex(labels, axis=1).sort_index(axis=0).sort_index(axis=1)

matrix = matrix.fillna(0).astype(int)

matrix.columns = pd.MultiIndex.from_product([['Target'], matrix.columns])
matrix.index.name = 'Source'

# Connections in total
connections_total = matrix.sum().sum()
print(f"Total connections (excluding self-loops): {connections_total}")

matrix

Total connections (excluding self-loops): 4627


Unnamed: 0_level_0,Target,Target,Target,Target
Unnamed: 0_level_1,1 Digital and marketing consultancies,2 Bespoke app companies,3 Data-broker- and infrastructure companies,4 Companies with specific digital part/app as part of service/product
Source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1 Digital and marketing consultancies,3299,24,6,74
2 Bespoke app companies,88,208,0,8
3 Data-broker- and infrastructure companies,3,0,237,4
4 Companies with specific digital part/app as part of service/product,15,1,13,647
