# 0 Importing Packages

In [15]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 
# NB. uncomment the line above first time you run this cell
%autoreload 2

from pathlib import Path
import pandas as pd
import os
import networkx as nx

from resources.network_functions import GraphConstructor

# Set max row view for pandas to 100
pd.set_option('display.max_rows', 100)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0.1 File Paths

In [16]:
fp_main = Path('/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device')
fp_main_output = Path(fp_main / 'output')   

# 1 Load in edgelist data

In [17]:
# Read in the data
all_edges_user_level = pd.read_parquet(fp_main_output / 'all_edges_user_level.gzip.parquet')
attention_edges_user_level = pd.read_parquet(fp_main_output / 'attention_edges_user_level.gzip.parquet')
collaboration_edges_user_level = pd.read_parquet(fp_main_output / 'collaboration_edges_user_level.gzip.parquet')

In [18]:
## 1.1 Construct the graphs
gc_attention = GraphConstructor(all_edges_user_level, graph_type='attention')
attention_graph = gc_attention.get_graph()

gc_collaboration = GraphConstructor(all_edges_user_level, graph_type='collaboration')
collaboration_graph = gc_collaboration.get_graph()

# 2 Descriptive statistics

## 2.1 Collaboration Network Stats

**General network statistics**

- no_users: Number of GitHub user-profiles in the graph
- no_companies: Number of companies in the graph
- no_users_edges: Number of unique user-to-user edges in the graph
- no_inter_company_edges_directed: Number of unique company-to-company edges in the graph
- no_inter_weight: Number of forks between users from different companies in the graph
- no_selfloops: Number of forks between users from the same company in the graph
- no_companies: Number of companies in the graph

In [19]:
# Nodes and edges
no_users = len(pd.unique(collaboration_edges_user_level[['src', 'target']].values.ravel()))
no_companies = len(set(collaboration_edges_user_level["src_company"]).union(collaboration_edges_user_level["target_company"]))
no_users_edge = int(sum(d.get("weight", 1) for _, _, d in collaboration_graph.edges(data=True)))
no_inter_company_edges_directed = len([(u,v) for u, v, d in collaboration_graph.edges(data=True) if d.get("d_inter_level") == 1])

# Total weight of inter-company edges (user-level, directed)
inter_company_mask = collaboration_edges_user_level['src_company'] != collaboration_edges_user_level['target_company']
no_inter_weight = collaboration_edges_user_level[inter_company_mask]['action'].value_counts().sum()

# Total weight of self-loop edges (src_company == tgt_company)
selfloop_mask = collaboration_edges_user_level['src_company'] == collaboration_edges_user_level['target_company']
no_selfloops = collaboration_edges_user_level[selfloop_mask]['action'].value_counts().sum()

def calculate_weighted_density(directed_graph):
    # Create a copy of the graph to avoid modifying the original
    graph = directed_graph.copy()

    # Remove self-loops
    self_loops = list(nx.selfloop_edges(graph))
    graph.remove_edges_from(self_loops)

    # Calculate unweighted density
    number_of_nodes = graph.number_of_nodes()
    possible_edges = number_of_nodes * (number_of_nodes - 1)

    # Calculate weighted density
    sum_of_weights = sum(data['weight'] for u, v, data in graph.edges(data=True))
    weighted_density = sum_of_weights / possible_edges if possible_edges != 0 else 0

    return weighted_density

weighted_density_collaboration = calculate_weighted_density(collaboration_graph)

print(f"No. users: {no_users}")
print(f"No. companies: {no_companies}")
print(f"User edges (directed): {no_users_edge}")
print(f"Inter-company edges (directed): {no_inter_company_edges_directed}")
print(f"Inter-company GH actions: {no_inter_weight}")
print(f"Intra-company GH actions: {no_selfloops}")
print(f"Weighted density (attention graph): {weighted_density_collaboration}")

No. users: 63
No. companies: 18
User edges (directed): 58
Inter-company edges (directed): 4
Inter-company GH actions: 5
Intra-company GH actions: 92
Weighted density (attention graph): 0.013071895424836602


**Describing GitHub user's actions on company-level (with self-loops/intra-company level)** 
- E.g. if *X*-user (working for company *Z*) has starred three repo's owned by *Y*-user (working for company *W*) this will give add three to the count of out-going edges for company *Z*, and three in-going edges for *W*. 

In [20]:
# Make tex

# Count outgoing and incoming edges and exclude self-loops
outgoing_counts = collaboration_edges_user_level.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = collaboration_edges_user_level.groupby('target_company').size().reset_index(name='incoming_edges')

# Merge counts to get a single table
company_stats = pd.merge(outgoing_counts, incoming_counts, left_on='src_company', right_on='target_company', how='outer')

# Fill NaN values (in case some companies only have incoming or outgoing edges)
company_stats = company_stats.fillna(0)

# Rename columns for clarity
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']]
company_stats.columns = ['Company', 'Outgoing Edges', 'Incoming Edges']

# Sort by total edges
company_stats['Total Edges'] = company_stats['Outgoing Edges'] + company_stats['Incoming Edges']
company_stats = company_stats.sort_values('Total Edges', ascending=False)

# Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts", label="tab:company_edges", column_format="lcc")

# Print the table
print(latex_table)
company_stats


\begin{table}
\caption{Company Network Edge Counts}
\label{tab:company_edges}
\begin{tabular}{lcc}
\toprule
Company & Outgoing Edges & Incoming Edges & Total Edges \\
\midrule
trifork & 39 & 39 & 78 \\
netcompany & 8 & 8 & 16 \\
delegateas & 6 & 7 & 13 \\
abtion & 6 & 6 & 12 \\
signifly & 5 & 5 & 10 \\
shape & 5 & 5 & 10 \\
nuuday & 4 & 4 & 8 \\
eg a s & 4 & 4 & 8 \\
must & 3 & 3 & 6 \\
miracle & 4 & 2 & 6 \\
kmd & 2 & 2 & 4 \\
siteimprove & 2 & 2 & 4 \\
skat & 2 & 2 & 4 \\
tv2 & 2 & 2 & 4 \\
uptime & 2 & 2 & 4 \\
knowit & 1 & 2 & 3 \\
saxo bank & 1 & 1 & 2 \\
jobindex & 1 & 1 & 2 \\
\bottomrule
\end{tabular}
\end{table}



Unnamed: 0,Company,Outgoing Edges,Incoming Edges,Total Edges
15,trifork,39,39,78
8,netcompany,8,8,16
1,delegateas,6,7,13
0,abtion,6,6,12
12,signifly,5,5,10
11,shape,5,5,10
9,nuuday,4,4,8
2,eg a s,4,4,8
7,must,3,3,6
6,miracle,4,2,6


**Describing GitHub user's actions on company-level (only inter-company level)** 

In [21]:
# Step 1: Exclude self-loops
company_edges = collaboration_edges_user_level[collaboration_edges_user_level['src_company'] != collaboration_edges_user_level['target_company']]

# Step 2: Count outgoing and incoming edges
outgoing_counts = company_edges.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = company_edges.groupby('target_company').size().reset_index(name='incoming_edges')

# Step 3: Get all unique companies from both columns
all_companies = pd.DataFrame(pd.unique(company_edges[['src_company', 'target_company']].values.ravel()), columns=['src_company'])

# Step 4: Merge outgoing and incoming counts separately to ensure all companies are included
company_stats = all_companies.merge(outgoing_counts, on='src_company', how='left').merge(
    incoming_counts, left_on='src_company', right_on='target_company', how='left'
)

# Step 5: Drop duplicate target_company column and fill NaN with 0
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']].fillna(0)

# Step 6: Convert to integer type
company_stats[['outgoing_edges', 'incoming_edges']] = company_stats[['outgoing_edges', 'incoming_edges']].astype(int)

# Step 7: Sort by total edges
company_stats['total_edges'] = company_stats['outgoing_edges'] + company_stats['incoming_edges']
company_stats = company_stats.sort_values('total_edges', ascending=False).reset_index(drop=True)

# Step 7: Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts (Excluding Self-Loops)", 
                                     label="tab:collaboration", column_format="lcc")

# Print LaTeX table
print(latex_table)
company_stats

\begin{table}
\caption{Company Network Edge Counts (Excluding Self-Loops)}
\label{tab:collaboration}
\begin{tabular}{lcc}
\toprule
src_company & outgoing_edges & incoming_edges & total_edges \\
\midrule
trifork & 2 & 2 & 4 \\
delegateas & 1 & 2 & 3 \\
miracle & 2 & 0 & 2 \\
knowit & 0 & 1 & 1 \\
\bottomrule
\end{tabular}
\end{table}



Unnamed: 0,src_company,outgoing_edges,incoming_edges,total_edges
0,trifork,2,2,4
1,delegateas,1,2,3
2,miracle,2,0,2
3,knowit,0,1,1


In [22]:
# Step 1: Filter for edges where companies belong to different categories
filtered_edges = collaboration_edges_user_level[
    collaboration_edges_user_level['src_company'] != collaboration_edges_user_level['target_company']
]

# Step 1: Group by 'src_company_label', 'target_company_label' and 'action', and count occurrences
category_action_counts = (
    filtered_edges
    .groupby(['src_company_label', 'target_company_label', 'action'])
    .size()
    .reset_index(name='count')
)

# Step 2: Pivot the table to get counts of each action as columns
pivoted_df = category_action_counts.pivot_table(
    index=['src_company_label', 'target_company_label'],
    columns='action',
    values='count',
    aggfunc='sum',
    fill_value=0
).reset_index()
pivoted_df.rename(columns={'forks': 'no_of_forks'}, inplace=True)

pivoted_df

# Optional: Save to .tex file
#with open("collaboration_table.tex", "w") as f:
#    f.write(latex_table)

action,src_company_label,target_company_label,no_of_forks
0,1 Digital and marketing consultancies,1 Digital and marketing consultancies,3
1,2 Bespoke app companies,1 Digital and marketing consultancies,2


## 2.2 Attention Network Stats

**General network statistics**

- no_users: Number of GitHub user-profiles in the graph
- no_companies: Number of companies in the graph
- no_users_edges: Number of unique user-to-user edges in the graph
- no_inter_company_edges_directed: Number of unique company-to-company edges in the graph
- no_inter_weight: Number of all attention GH-actions between users from different companies in the graph
- no_selfloops: Number of all attention GH-actions between users from the same company in the graph
- no_companies: Number of companies in the graph

In [23]:
# Nodes and edges
no_users = len(pd.unique(attention_edges_user_level[['src', 'target']].values.ravel()))
no_companies = len(set(attention_edges_user_level["src_company"]).union(attention_edges_user_level["target_company"]))
no_users_edge = int(sum(d.get("weight", 1) for _, _, d in attention_graph.edges(data=True)))
no_inter_company_edges_directed = len([(u,v) for u, v, d in attention_graph.edges(data=True) if d.get("d_inter_level") == 1])

# Total weight of inter-company edges (user-level, directed)
inter_company_mask = attention_edges_user_level['src_company'] != attention_edges_user_level['target_company']
no_inter_weight = attention_edges_user_level[inter_company_mask]['action'].value_counts().sum()

# Total weight of self-loop edges (src_company == tgt_company)
selfloop_mask = attention_edges_user_level['src_company'] == attention_edges_user_level['target_company']
no_selfloops = attention_edges_user_level[selfloop_mask]['action'].value_counts().sum()

# Density
weighted_density_attention = calculate_weighted_density(attention_graph)


print(f"No. users: {no_users}")
print(f"No. companies: {no_companies}")
print(f"User edges (directed): {no_users_edge}")
print(f"Inter-company edges (directed): {no_inter_company_edges_directed}")
print(f"Inter-company GH actions: {no_inter_weight}")
print(f"Intra-company GH actions: {no_selfloops}")
print(f"Weighted density (attention graph): {weighted_density_attention}")

No. users: 298
No. companies: 38
User edges (directed): 482
Inter-company edges (directed): 32
Inter-company GH actions: 220
Intra-company GH actions: 3749
Weighted density (attention graph): 0.04054054054054054


**Describing GitHub user's actions on company-level (with self-loops/intra-company level)** 
- E.g. if *X*-user (working for company *Z*) has starred three repo's owned by *Y*-user (working for company *W*) this will give add three to the count of out-going edges for company *Z*, and three in-going edges for *W*. 

In [24]:
# Count outgoing and incoming edges and exclude self-loops
outgoing_counts = attention_edges_user_level.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = attention_edges_user_level.groupby('target_company').size().reset_index(name='incoming_edges')

# Merge counts to get a single table
company_stats = pd.merge(outgoing_counts, incoming_counts, left_on='src_company', right_on='target_company', how='outer')

# Fill NaN values (in case some companies only have incoming or outgoing edges)
company_stats = company_stats.fillna(0)

# Rename columns for clarity
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']]
company_stats.columns = ['Company', 'Outgoing Edges', 'Incoming Edges']

# Sort by total edges
company_stats['Total Edges'] = company_stats['Outgoing Edges'] + company_stats['Incoming Edges']
company_stats = company_stats.sort_values('Total Edges', ascending=False)

# Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts", label="tab:company_edges", column_format="lcc")

# Print the table
print(latex_table)
company_stats


\begin{table}
\caption{Company Network Edge Counts}
\label{tab:company_edges}
\begin{tabular}{lcc}
\toprule
Company & Outgoing Edges & Incoming Edges & Total Edges \\
\midrule
trifork & 1164.000000 & 1165.000000 & 2329.000000 \\
netcompany & 324.000000 & 298.000000 & 622.000000 \\
jobindex & 229.000000 & 231.000000 & 460.000000 \\
pentia & 190.000000 & 270.000000 & 460.000000 \\
miracle & 223.000000 & 217.000000 & 440.000000 \\
shape & 192.000000 & 188.000000 & 380.000000 \\
tv2 & 160.000000 & 155.000000 & 315.000000 \\
charlie tango & 156.000000 & 155.000000 & 311.000000 \\
systematic & 151.000000 & 151.000000 & 302.000000 \\
nuuday & 109.000000 & 118.000000 & 227.000000 \\
abtion & 111.000000 & 111.000000 & 222.000000 \\
must & 108.000000 & 110.000000 & 218.000000 \\
eg a s & 105.000000 & 105.000000 & 210.000000 \\
relatel & 96.000000 & 96.000000 & 192.000000 \\
delegateas & 90.000000 & 96.000000 & 186.000000 \\
knowit & 89.000000 & 93.000000 & 182.000000 \\
uptime & 83.000000 & 83.0

Unnamed: 0,Company,Outgoing Edges,Incoming Edges,Total Edges
32,trifork,1164.0,1165.0,2329.0
21,netcompany,324.0,298.0,622.0
15,jobindex,229.0,231.0,460.0
24,pentia,190.0,270.0,460.0
19,miracle,223.0,217.0,440.0
28,shape,192.0,188.0,380.0
33,tv2,160.0,155.0,315.0
3,charlie tango,156.0,155.0,311.0
31,systematic,151.0,151.0,302.0
22,nuuday,109.0,118.0,227.0


**Describing GitHub user's actions on company-level (only inter-company level)** 

In [25]:
# Step 1: Exclude self-loops
filtered_edges = attention_edges_user_level[attention_edges_user_level['src_company'] != attention_edges_user_level['target_company']]

# Step 2: Count outgoing and incoming edges
outgoing_counts = filtered_edges.groupby('src_company').size().reset_index(name='outgoing_edges')
incoming_counts = filtered_edges.groupby('target_company').size().reset_index(name='incoming_edges')

# Step 3: Get all unique companies from both columns
all_companies = pd.DataFrame(pd.unique(filtered_edges[['src_company', 'target_company']].values.ravel()), columns=['src_company'])

# Step 4: Merge outgoing and incoming counts separately to ensure all companies are included
company_stats = all_companies.merge(outgoing_counts, on='src_company', how='left').merge(
    incoming_counts, left_on='src_company', right_on='target_company', how='left'
)

# Step 5: Drop duplicate target_company column and fill NaN with 0
company_stats = company_stats[['src_company', 'outgoing_edges', 'incoming_edges']].fillna(0)

# Step 6: Convert to integer type
company_stats[['outgoing_edges', 'incoming_edges']] = company_stats[['outgoing_edges', 'incoming_edges']].astype(int)

# Step 7: Sort by total edges
company_stats['total_edges'] = company_stats['outgoing_edges'] + company_stats['incoming_edges']
company_stats = company_stats.sort_values('total_edges', ascending=False).reset_index(drop=True)

# Step 7: Convert to LaTeX table format
latex_table = company_stats.to_latex(index=False, caption="Company Network Edge Counts (Excluding Self-Loops)", 
                                     label="tab:collaboration", column_format="lcc")

# Print LaTeX table
print(latex_table)
company_stats

\begin{table}
\caption{Company Network Edge Counts (Excluding Self-Loops)}
\label{tab:collaboration}
\begin{tabular}{lcc}
\toprule
src_company & outgoing_edges & incoming_edges & total_edges \\
\midrule
pentia & 0 & 80 & 80 \\
house of code & 58 & 1 & 59 \\
knowit & 27 & 31 & 58 \\
miracle & 30 & 24 & 54 \\
jobindex & 20 & 22 & 42 \\
netcompany & 28 & 2 & 30 \\
cbrain & 20 & 4 & 24 \\
nuuday & 7 & 16 & 23 \\
deondigital & 4 & 13 & 17 \\
delegateas & 2 & 8 & 10 \\
shape & 6 & 2 & 8 \\
tv2 & 5 & 0 & 5 \\
uptime & 2 & 2 & 4 \\
yousee & 4 & 0 & 4 \\
creuna & 0 & 3 & 3 \\
signifly & 0 & 3 & 3 \\
siteimprove & 2 & 0 & 2 \\
uni-soft & 0 & 2 & 2 \\
commentor & 2 & 0 & 2 \\
oxygen & 1 & 1 & 2 \\
skat & 0 & 2 & 2 \\
abtion & 1 & 1 & 2 \\
must & 0 & 2 & 2 \\
charlie tango & 1 & 0 & 1 \\
trifork & 0 & 1 & 1 \\
\bottomrule
\end{tabular}
\end{table}



Unnamed: 0,src_company,outgoing_edges,incoming_edges,total_edges
0,pentia,0,80,80
1,house of code,58,1,59
2,knowit,27,31,58
3,miracle,30,24,54
4,jobindex,20,22,42
5,netcompany,28,2,30
6,cbrain,20,4,24
7,nuuday,7,16,23
8,deondigital,4,13,17
9,delegateas,2,8,10


**Describing actions based on company categories**

In [26]:
# Step 1: Filter for edges where companies belong to different categories
filtered_edges = attention_edges_user_level[
    attention_edges_user_level['src_company'] != attention_edges_user_level['target_company']
]

# Step 2: Group by 'src_company_label', 'target_company_label' and 'action', and count occurrences
category_action_counts = (
    filtered_edges
    .groupby(['src_company_label', 'target_company_label', 'action'])
    .size()
    .reset_index(name='count')
)

# Step 3: Pivot the table to get counts of each action as columns
pivoted_df = category_action_counts.pivot_table(
    index=['src_company_label', 'target_company_label'],
    columns='action',
    values='count',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Step 4: Sum the attention actions into a single column
pivoted_df['no_attention_actions'] = (
    pivoted_df.get('follows', 0) +
    pivoted_df.get('stars', 0) +
    pivoted_df.get('watches', 0)
)

# Generate and print LaTeX table
latex_table = pivoted_df.to_latex(index=False)
print(latex_table)

# Print table without index
pivoted_df

# Optional: Save to .tex file
#with open("attention_table.tex", "w") as f:
#    f.write(latex_table)

\begin{tabular}{llrrrr}
\toprule
src_company_label & target_company_label & follows & stars & watches & no_attention_actions \\
\midrule
1 Digital and marketing consultancies & 1 Digital and marketing consultancies & 7 & 10 & 23 & 40 \\
1 Digital and marketing consultancies & 2 Bespoke app companies & 2 & 0 & 22 & 24 \\
1 Digital and marketing consultancies & 4 Companies with specific digital part/app as part of service/product & 6 & 2 & 16 & 24 \\
2 Bespoke app companies & 1 Digital and marketing consultancies & 4 & 4 & 80 & 88 \\
2 Bespoke app companies & 4 Companies with specific digital part/app as part of service/product & 2 & 0 & 0 & 2 \\
3 Data-broker- and infrastructure companies & 1 Digital and marketing consultancies & 0 & 2 & 0 & 2 \\
3 Data-broker- and infrastructure companies & 4 Companies with specific digital part/app as part of service/product & 0 & 2 & 2 & 4 \\
4 Companies with specific digital part/app as part of service/product & 1 Digital and marketing consultancies

action,src_company_label,target_company_label,follows,stars,watches,no_attention_actions
0,1 Digital and marketing consultancies,1 Digital and marketing consultancies,7,10,23,40
1,1 Digital and marketing consultancies,2 Bespoke app companies,2,0,22,24
2,1 Digital and marketing consultancies,4 Companies with specific digital part/app as ...,6,2,16,24
3,2 Bespoke app companies,1 Digital and marketing consultancies,4,4,80,88
4,2 Bespoke app companies,4 Companies with specific digital part/app as ...,2,0,0,2
5,3 Data-broker- and infrastructure companies,1 Digital and marketing consultancies,0,2,0,2
6,3 Data-broker- and infrastructure companies,4 Companies with specific digital part/app as ...,0,2,2,4
7,4 Companies with specific digital part/app as ...,1 Digital and marketing consultancies,6,2,2,10
8,4 Companies with specific digital part/app as ...,2 Bespoke app companies,0,0,1,1
9,4 Companies with specific digital part/app as ...,3 Data-broker- and infrastructure companies,4,6,3,13
