# 0. Importing packages

In [1]:
# Load the autoreload extension so that edited project modules are re-imported
# automatically before code is executed (avoids restarting the kernel).
%load_ext autoreload 
# NB. `%load_ext autoreload` must have run once in this kernel before the `%autoreload` magic below is available.
# Mode 2 = reload all modules (except those excluded via %aimport) before executing code.
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
from typing import Dict
from tqdm import tqdm

from resources.filter_functions import look_company_up_in_edgelist
from resources.github_functions import GithubScraper


# Instantiate the project's GithubScraper; `gs` is the shared client used by the
# follower lookup further down. Per the recorded cell output, construction
# collects the GitHub access token from config.
gs = GithubScraper()

GitHub access token collected from config
GithubScraper initialized with 0 companies and 0 users already scraped.


## 0.1 File Paths

In [3]:
# All file paths are defined in the project's `resources.filepaths` module
import resources.filepaths as fp

# Main path and main output path
fp_main, fp_main_output = fp.fp_main, fp.fp_main_output

# Paths for data that has to be written to the external s-drive
fp_main_external, fp_output_external = fp.fp_main_external, fp.fp_output_external

# 1. Load in the data

In [None]:
# Load the three user-level edge lists (all / attention / collaboration),
# in that order, from the external output folder
(
    all_edges_user_level,
    attention_edges_user_level,
    collaboration_edges_user_level,
) = (
    pd.read_parquet(fp_output_external / parquet_name)
    for parquet_name in (
        "all_edges_user_level.gzip.parquet",
        "attention_edges_user_level.gzip.parquet",
        "collaboration_edges_user_level.gzip.parquet",
    )
)

# 2. Examining the networks

## 2.1 Collaboration

In [None]:
# Collaboration edges for "trifork": direction "out" with self-loops excluded.
# NOTE(review): "out" presumably means edges originating from the company — confirm in filter_functions.
result_trifork = look_company_up_in_edgelist(
    edgelist=collaboration_edges_user_level,
    company="trifork",
    exclude_self_loops=True,
    direction="out",
)
# Bare last expression so the notebook renders the resulting table
result_trifork

Unnamed: 0,src,target,src_company,target_company,src_company_category,src_company_label,target_company_category,target_company_label,d_intra_level,d_inter_level,edge_repo,action,created_at


## 2.2 Attention

In [None]:
# Attention edges for "trifork": every direction, self-loops kept
# (so rows where src and target are the same user remain in the result).
result_trifork = look_company_up_in_edgelist(
    edgelist=attention_edges_user_level,
    company="trifork",
    exclude_self_loops=False,
    direction="all",
)
# Bare last expression so the notebook renders the resulting table
result_trifork

Unnamed: 0,src,target,src_company,target_company,src_company_category,src_company_label,target_company_category,target_company_label,d_intra_level,d_inter_level,edge_repo,action,created_at
43,AndersSpringborg,madsthom,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,Ruslan-Bot/AndersSpringborg,stars,2015-10-11
44,mathiasandresen,madsthom,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,mmr-project/mathiasandresen,stars,2013-10-26
45,AndersSpringborg,madsthom,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,mmr-project/AndersSpringborg,stars,2015-10-11
46,AndersSpringborg,madsthom,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,dbs_g14/AndersSpringborg,watches,2015-10-11
47,AndersSpringborg,madsthom,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,Flashcard/AndersSpringborg,watches,2015-10-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,jacobe,jacobe,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,spot-playground/jacobe,watches,2021-03-18
340,jacobe,jacobe,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,advent-of-code-2021/jacobe,watches,2021-12-06
341,jacobe,jacobe,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,advent-of-code-2020/jacobe,watches,2021-12-06
342,jacobe,jacobe,trifork,trifork,1,1 Digital and marketing consultancies,1,1 Digital and marketing consultancies,1,0,twirpy-experiment/jacobe,watches,2022-08-18


# 3. Get number of followers

In [None]:
# Every distinct GitHub username that occurs on either end of an edge:
# stack the "src" column on top of the "target" column, then de-duplicate
unique_users = pd.concat(
    [all_edges_user_level[endpoint] for endpoint in ("src", "target")]
).unique()


def get_followers(username: str) -> int | None:
    """
    Returns the number of followers for a given GitHub username.
    If the user does not exist or is private, returns None.
    """
    try:
        user_obj = gs.get_user(username)
        if user_obj is None:
            print(f"[!] User '{username}' not found.")
            return None
        return user_obj.followers
    except Exception as e:
        print(f"[!] Error retrieving user '{username}': {e}")
        return None


# Look up the follower count of every unique user (one lookup per user,
# with a tqdm progress bar); failed lookups are stored as None
followers = dict((login, get_followers(login)) for login in tqdm(unique_users))

  0%|          | 0/116 [00:00<?, ?it/s]

[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)


100%|██████████| 116/116 [00:57<00:00,  2.02it/s]


## 3.1 Filter the follower data

In [None]:
# Keep only the users for whom a follower count was actually retrieved
valid_followers: Dict[str, int] = {
    login: n_followers
    for login, n_followers in followers.items()
    if n_followers is not None
}

# Re-key the mapping from most-followed to least-followed user
followers_sorted = {
    login: valid_followers[login]
    for login in sorted(valid_followers, key=valid_followers.get, reverse=True)
}

# Show the 20 most-followed users
print("Top 20 users by follower count:\n")
for user, count in tuple(followers_sorted.items())[:20]:
    print(f"{user}: {count}")

Top 20 users by follower count:

EG-A-S: 375
kennylevinsen: 172
thebuilder: 160
trifork: 66
jkiddo: 50
jeme: 40
shapehq: 40
kawaiipantsu: 38
AndersSpringborg: 26
madsthom: 25
bjarkehs: 21
gugi9000: 20
HusseinElZein: 20
fredelundbeck: 18
hcarreras: 17
signifly: 17
jacobe: 16
TusharRoy23: 13
mathiasandresen: 12
anderslime: 12
