# 0. Importing packages

In [2]:
# Load the autoreload extension so that imported modules are reloaded automatically before code is executed (avoids restarting the kernel after editing them)
%load_ext autoreload 
# NB. the extension has to be loaded (line above) before the %autoreload magic below is available
%autoreload 2

In [31]:
import os
from pathlib import Path
from typing import Dict

import pandas as pd
from tqdm import tqdm

from resources.filter_functions import look_company_up_in_edgelist
from resources.github_functions import GithubScraper
from resources.network_functions import NetworkEdgeListConstructor

# Instantiate the GithubScraper; `gs` is the shared client that get_followers() uses later to look up users
gs = GithubScraper()

GithubScraper initialized with 3 tokens.
First token in cycle. Initiating ACCESS_TOKEN_1.
GithubScraper initialized with 0 companies and 0 users already scraped.


## 0.1 File Paths

In [4]:
# Project root. Defaults to the original location on the mounted volume; set the
# GITHUB_MARKET_DEVICE_DIR environment variable to point the notebook at a copy
# of the project that lives elsewhere (an unset or empty variable keeps the default).
fp_main = Path(
    os.environ.get('GITHUB_MARKET_DEVICE_DIR')
    or '/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device'
)
# Output directory inside the project root (fp_main / 'output' is already a Path)
fp_main_output = fp_main / 'output'

# 1. Load the data

In [None]:
# Load the parquet file from the output directory into a DataFrame
github_data_path = fp_main_output / 'unique_users_first_second.gzip.parquet'
github_data = pd.read_parquet(github_data_path)

## 1.1 Create the edge list

In [8]:
# Build the edge lists from the loaded GitHub data.
# get_edge_lists() is unpacked into three values; only the first one is kept
# (stored as all_edges_user_level), the other two are discarded.
constructor = NetworkEdgeListConstructor(github_data)
all_edges_user_level, _, _ = constructor.get_edge_lists()

## 1.2 Splitting the edge list into two: collaboration and attention

In [11]:
# Collaboration edges: rows whose 'action' column equals 'forks'
is_fork = all_edges_user_level['action'] == 'forks'
collaboration_edgelist = all_edges_user_level[is_fork]

# Attention edges: rows whose 'action' is one of follows / watches / stars
# (a row with any other action value ends up in neither edge list)
attention_actions = ['follows', 'watches', 'stars']
is_attention = all_edges_user_level['action'].isin(attention_actions)
attention_edgelist = all_edges_user_level[is_attention]

# Report the (rows, columns) shape of each split
print(f'Attention edgelist: {attention_edgelist.shape}')
print(f'Collaboration edgelist: {collaboration_edgelist.shape}')

Attention edgelist: (2256, 13)
Collaboration edgelist: (65, 13)


# 2. Examining the networks

## 2.1 Collaboration

In [None]:
# Look 'trifork' up in the collaboration edge list, ignoring self-loops.
# NOTE(review): the exact meaning of alternative_company and direction='all' is
# defined in resources.filter_functions - confirm there.
result_trifork = look_company_up_in_edgelist(
    company='trifork', alternative_company='miracle',
    edgelist=collaboration_edgelist,
    direction='all', exclude_self_loops=True,
)
# Bare expression so the notebook renders the result
result_trifork

Unnamed: 0,src,target,src_company,target_company,src_company_category,src_company_label,target_company_category,target_company_label,d_intra_level,d_inter_level,edge_repo,action,created_at
610,miracle-as,chrklin,miracle,trifork,2,2 Bespoke app companies,1,1 Digital and marketing consultancies,0,1,kitos/miracle-as,forks,2016-05-03


## 2.2 Attention

In [None]:
# Look 'trifork' up in the attention edge list; the helper's remaining options
# are left at their defaults.
# Note: this rebinds result_trifork, replacing the collaboration lookup result.
result_trifork = look_company_up_in_edgelist(
    company='trifork',
    edgelist=attention_edgelist,
    direction='all',
)
# Bare expression so the notebook renders the result
result_trifork

Unnamed: 0,src,target,src_company,target_company,src_company_category,src_company_label,target_company_category,target_company_label,d_intra_level,d_inter_level,edge_repo,action,created_at
1280,AeroplaneMouse,anton-christensen,netcompany,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,0,1,xv6-docker/AeroplaneMouse,stars,2019-03-25


# 3. Get number of followers

In [28]:
# Every GitHub username that appears as either endpoint ('src' or 'target') of an edge, de-duplicated
edge_endpoints = pd.concat([all_edges_user_level[column] for column in ('src', 'target')])
unique_users = edge_endpoints.unique()

def get_followers(username: str) -> int | None:
    """
    Returns the number of followers for a given GitHub username.
    If the user does not exist or is private, returns None.
    """
    try:
        user_obj = gs.get_user(username)
        if user_obj is None:
            print(f"[!] User '{username}' not found.")
            return None
        return user_obj.followers
    except Exception as e:
        print(f"[!] Error retrieving user '{username}': {e}")
        return None

# Map each unique username to its follower count (None where the lookup failed),
# with a tqdm progress bar over the users
followers = dict((user, get_followers(user)) for user in tqdm(unique_users))

100%|██████████| 132/132 [01:00<00:00,  2.20it/s]


In [32]:
# Keep only the users for which a follower count was retrieved (drop None values)
valid_followers: Dict[str, int] = {
    name: n_followers
    for name, n_followers in followers.items()
    if n_followers is not None
}

# Rank (user, count) pairs by follower count, highest first, and keep that order in a dict
ranked_followers = sorted(valid_followers.items(), key=lambda pair: pair[1], reverse=True)
followers_sorted = dict(ranked_followers)

# Show the 20 most-followed users
print("Top 20 users by follower count:\n")
for user, count in ranked_followers[:20]:
    print(f"{user}: {count}")

Top 20 users by follower count:

EG-A-S: 375
kennylevinsen: 172
thebuilder: 160
trifork: 66
jkiddo: 50
jeme: 40
shapehq: 40
kawaiipantsu: 38
AndersSpringborg: 26
madsthom: 25
bjarkehs: 21
gugi9000: 20
HusseinElZein: 20
fredelundbeck: 18
hcarreras: 17
signifly: 17
jacobe: 16
TusharRoy23: 13
mathiasandresen: 12
anderslime: 12
