# 0. Importing packages

In [7]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 

# Enable autoreload for all modules
%autoreload 2

# Python 
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import json

# Custom Packages
from resources.github_functions import GithubScraper

GitHub access token collected from config: gith...
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0.1 Set file paths ###

In [None]:
# Path configuration is defined in resources/filepaths.py; re-export the
# entries used by this notebook under short module-level names.
import resources.filepaths as fp

# Main directory and main output directory, as defined in resources.filepaths
fp_main, fp_main_output = fp.fp_main, fp.fp_main_output

# Locations for data that has to be written to the external s-drive
fp_main_external, fp_output_external = fp.fp_main_external, fp.fp_output_external

# 1. Load in the initial company list

## 1.2 Processing the company list

In [3]:
# Read the initial company list from the main output folder
initial_list = pd.read_csv(fp_main_output / "initial_list.csv")

# Keep only the columns that later cells use
company_info_columns = [
    'company_search_keyword',
    'company_category',
    'without_location_filter',
    'company_label_name',
]
list_of_company_info = initial_list[company_info_columns]

# Map every search keyword to its company category.
# If a keyword occurs on several rows, the last row's category is kept.
company_category_map = {
    search_keyword: company_category
    for search_keyword, company_category in zip(
        list_of_company_info['company_search_keyword'],
        list_of_company_info['company_category'],
    )
}

# Persist the map as JSON Lines: one {keyword, category} object per line.
# ensure_ascii=False keeps non-ASCII characters readable in the file.
category_map_path = fp_main_output / "company_category_map.jsonl"
with open(category_map_path, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(
            {'company_search_keyword': search_keyword, 'company_category': company_category},
            ensure_ascii=False,
        ) + '\n'
        for search_keyword, company_category in company_category_map.items()
    )

In [4]:
# Build the (search keyword, without-location-filter flag, label name) triples
# that the scraping loop iterates over, one triple per row of the company list.
#
# The triples are materialised into a list instead of being left as a bare
# `zip` object: `zip` returns a one-shot iterator, so re-running the scraping
# cell (e.g. after an interruption) without first re-running this cell would
# silently iterate over nothing. A list can be looped over repeatedly.
company_location_filter_bool_zip = list(zip(
    list_of_company_info['company_search_keyword'],
    list_of_company_info['without_location_filter'],
    list_of_company_info['company_label_name']
))

# 2. Querying the company names and scraping users

## 2.1 Loading in scrapelogs

In [5]:
def _read_logged_values(log_path, key, label):
    """Collect the value stored under `key` on every line of a JSONL scrapelog.

    Parameters
    ----------
    log_path : Path
        JSON Lines file to read; it must already exist.
    key : str
        Field to extract from each JSON object.
    label : str
        Word inserted into the warning message (e.g. 'user' or 'company').

    Returns
    -------
    set
        All values found under `key`. Lines that cannot be used are skipped
        with a printed warning rather than aborting the load.
    """
    values = set()
    with open(log_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no record; skip them without a warning
            if not line.strip():
                continue
            try:
                values.add(json.loads(line)[key])
            # TypeError covers valid JSON that is not an object (list, string,
            # number) and values that cannot be stored in a set
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                print(f"[WARNING] Skipping malformed {label} line: {err}")
    return values


# Scrapelog locations: the user log lives in the external output folder,
# the company log in the main output folder
user_log_file_name = 'first_tier_user_scrapelog.jsonl'
company_log_file_name = 'company_scrapelog.jsonl'
user_log_path = fp_output_external / user_log_file_name
company_log_path = fp_main_output / company_log_file_name

# Ensure both log files exist (create them empty on a first run) and report
# the ones that were already present
for log_path, log_label in ((user_log_path, 'User'), (company_log_path, 'Company')):
    if log_path.exists():
        print(f"[INFO] {log_label} log exists: {log_path.name}")
    else:
        log_path.touch(exist_ok=True)

# Collect users and companies already scraped so the scraper can skip them
users_already_scraped = _read_logged_values(user_log_path, 'user_login', 'user')
companies_already_scraped = _read_logged_values(company_log_path, 'company_name', 'company')

[INFO] User log exists: first_tier_user_scrapelog.jsonl
[INFO] Company log exists: company_scrapelog.jsonl


## 2.2 Instantiating the GithubScraper and scrape

In [6]:
# 1. Set up the scraper with the logins/companies recovered from the scrapelogs
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    repo_limit=50
)
ratelimit_message = (
    f'GitHub REST API ratelimit reset time for token is {gs.reset_time_point}. '
    f'That will be in a little less than {gs.reset_time_in_minutes} minutes.'
)
print(ratelimit_message)

# 2. Base name handed to gs.save_file for the scraped user info
file_name = 'first_tier_userinfo'

# 3. Walk through every company query, showing progress per company
company_progress = tqdm(company_location_filter_bool_zip, total=len(initial_list), unit='company')
for search_query, skip_location_filter, company_label in company_progress:

    # 3.1 Companies found in the company scrapelog are not queried again
    if company_label in gs.companies_already_scraped:
        print(f'[INFO] Company {company_label} already scraped. Skipping.')
        continue

    print(f'[INFO] Scraping users for company: {company_label}')

    # 3.2 / 3.3 Query the users for this company and handle them one by one.
    # The second element of each pair is not used in this loop.
    for named_user, _unused_company in gs.get_gh_users(search_query, skip_location_filter):
        login = named_user.login

        # 3.3.1 Users seen before (in the log or earlier in this run) are skipped
        if login in gs.users_already_scraped:
            print(f'[INFO] User {login} already scraped. Skipping.')
            continue

        print(f'[INFO] Scraping user: {login}')
        # Registered before fetching details, so the login counts as seen for
        # the rest of this run even if get_user_info returns None below
        gs.users_already_scraped.add(login)

        # 3.3.2 Fetch user info; a None result means the user is not saved or logged
        # (original note: may happen if the repo limit is exceeded or there is no match)
        user_row = gs.get_user_info(named_user, company_label, company_filter=True)
        if user_row is None:
            continue

        # 3.3.3 Pull the match details off the row before it is saved
        location_match, inferred_company, matched_company_strings = (
            user_row.matched_location,
            user_row.inferred_company,
            user_row.matched_company_strings,
        )

        # 3.3.4 Store the row, then record the user in the user scrapelog
        # NOTE(review): remove_existing_file=True is passed on every save; confirm in
        # GithubScraper.save_file that this does not discard rows saved earlier.
        gs.save_file(user_row, file_name, remove_existing_file=True)
        gs.log_user_w_match(login, inferred_company, matched_company_strings, location_match, user_log_path)  # type: ignore

        print(f'[INFO] {gs.USERS_SCRAPED} users scraped so far.')

    # 3.4 Record the company only after all of its users have been processed
    gs.log_company(company_label, company_log_path)  # type: ignore

GithubScraper initialized with 8 companies and 38 users already scraped.
GitHub REST API ratelimit reset time for token is 2025-08-12 15:41:28. That will be in a little less than 55 minutes.


  0%|          | 0/33 [00:00<?, ?company/s]

[INFO] Company nodes already scraped. Skipping.
[INFO] Company abtion already scraped. Skipping.
[INFO] Company heyday already scraped. Skipping.
[INFO] Company trifork already scraped. Skipping.
[INFO] Company frontit already scraped. Skipping.
[INFO] Company holion already scraped. Skipping.
[INFO] Company kruso already scraped. Skipping.
[INFO] Company pandiweb already scraped. Skipping.
[INFO] Scraping users for company: uptime
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] User andersmandersen already scraped. Skipping.
[INFO] User uptime-development already scraped. Skipping.
[INFO] Scraping user: uptimedk
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
User match uptimedk logged.
[INFO] 39 users scraped so far.
[INFO] Scraping user: alertdesk
User match alertdesk logged.
[INFO] 40 users scraped so far.
[INFO] Scraping user: SubleGG


 27%|██▋       | 9/33 [00:34<01:32,  3.87s/company]

User match SubleGG logged.
[INFO] 41 users scraped so far.
Company uptime logged.
[INFO] Scraping users for company: charlie tango
[INFO] Scraping user: thebuilder
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: charlie-tango
User match charlie-tango logged.
[INFO] 42 users scraped so far.
[INFO] Scraping user: kristofferkjelde
User match kristofferkjelde logged.
[INFO] 43 users scraped so far.
[INFO] Scraping user: mariatlund
User match mariatlund logged.
[INFO] 44 users scraped so far.
[INFO] Scraping user: jacobcharlietango


 30%|███       | 10/33 [03:33<10:39, 27.79s/company]

User match jacobcharlietango logged.
[INFO] 45 users scraped so far.
Company charlie tango logged.
[INFO] Scraping users for company: ffw
[INFO] Scraping user: ffwagency
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)


 33%|███▎      | 11/33 [05:16<14:42, 40.11s/company]

User match ffwagency logged.
[INFO] 46 users scraped so far.
Company ffw logged.
[INFO] Scraping users for company: mysupport
[INFO] Scraping user: MySupport-aps
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)


 36%|███▋      | 12/33 [05:22<11:45, 33.61s/company]

User match MySupport-aps logged.
[INFO] 47 users scraped so far.
Company mysupport logged.
[INFO] Scraping users for company: shape
[INFO] Scraping user: shapehq
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: ShapeGames
User match ShapeGames logged.
[INFO] 48 users scraped so far.
[INFO] Scraping user: 3shape
User match 3shape logged.
[INFO] 49 users scraped so far.
[INFO] Scraping user: anderslime
User match anderslime logged.
[INFO] 50 users scraped so far.
[INFO] Scraping user: dkcas11
User match dkcas11 logged.
[INFO] 51 users scraped so far.
[INFO] Scraping user: sebastianlyserena
User match sebastianlyserena logged.
[INFO] 52 users scraped so far.
[INFO] Scraping user: flexshape
User match flexshape logged.
[INFO] 53 users scraped so far.
[INFO] Scraping user: kawaiipantsu
User match kawaiipantsu logged.
[INFO] 54 users scraped so far.
[INFO] Scraping user: TusharRoy23
[INFO] Scraping use

 39%|███▉      | 13/33 [17:16<59:32, 178.64s/company]

User match dolandinvest logged.
[INFO] 64 users scraped so far.
Company shape logged.
[INFO] Scraping users for company: makeable
[INFO] Scraping user: makeabledk
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)


 42%|████▏     | 14/33 [17:17<43:26, 137.18s/company]

[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
Company makeable logged.
[INFO] Scraping users for company: mustache
[INFO] Scraping user: mustachedk
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
User match mustachedk logged.
[INFO] 65 users scraped so far.
[INFO] Scraping user: mustachedkdev


 45%|████▌     | 15/33 [19:14<39:36, 132.03s/company]

User match mustachedkdev logged.
[INFO] 66 users scraped so far.
Company mustache logged.
[INFO] Scraping users for company: house of code


 48%|████▊     | 16/33 [19:14<27:36, 97.41s/company] 

0 - no users found
Company house of code logged.
[INFO] Scraping users for company: greener pastures
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[INFO] Scraping user: Husemeyer
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: cspagnesi
[INFO] Scraping user: Team-Greener-Pastures
[INFO] Scraping user: cwru-greener-pastures
[INFO] Scraping user: greener-pastures
User match greener-pastures logged.
[INFO] 67 users scraped so far.
[INFO] Scraping user: GreenerPasturesConsulting
[INFO] Scraping user: rs-sh


 52%|█████▏    | 17/33 [19:29<19:56, 74.81s/company]

Company greener pastures logged.
[INFO] Scraping users for company: axla


 55%|█████▍    | 18/33 [19:29<13:28, 53.87s/company]

0 - no users found
Company axla logged.
[INFO] Scraping users for company: snapp
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[INFO] Scraping user: snappdk
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)


 58%|█████▊    | 19/33 [19:44<09:56, 42.62s/company]

User match snappdk logged.
[INFO] 68 users scraped so far.
Company snapp logged.
[INFO] Scraping users for company: appscaptain


 61%|██████    | 20/33 [19:44<06:35, 30.40s/company]

0 - no users found
Company appscaptain logged.
[INFO] Scraping users for company: adtomic
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)


 64%|██████▎   | 21/33 [19:45<04:19, 21.65s/company]

0 - no users found
Company adtomic logged.
[INFO] Scraping users for company: signifly
[INFO] Scraping user: signifly
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: Zagoman


 67%|██████▋   | 22/33 [21:24<08:11, 44.65s/company]

User match Zagoman logged.
[INFO] 69 users scraped so far.
Company signifly logged.
[INFO] Scraping users for company: creuna
[INFO] Scraping user: arla-creuna
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)


 70%|██████▉   | 23/33 [21:28<05:23, 32.39s/company]

User match arla-creuna logged.
[INFO] 70 users scraped so far.
Company creuna logged.
[INFO] Scraping users for company: strømlin
[INFO] Scraping user: headnet
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)


 73%|███████▎  | 24/33 [21:29<03:27, 23.05s/company]

[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
Company strømlin logged.
[INFO] Scraping users for company: must
[INFO] Scraping user: kennylevinsen
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: mustass
[INFO] Scraping user: gugi9000
User match gugi9000 logged.
[INFO] 71 users scraped so far.
[INFO] Scraping user: mustafatemiz
[INFO] User mustachedk already scraped. Skipping.
[INFO] Scraping user: MustafaSidiqi
[INFO] Scraping user: morten-andersen
User match morten-andersen logged.
[INFO] 72 users scraped so far.
[INFO] Scraping user: mustafalani
[INFO] Scraping user: Musta-0
[INFO] Scraping user: mustafauskuplu
User match mustafauskuplu logged.
[INFO] 73 users scraped so far.
[INFO] Scraping user: sieTRIFORK
User match sieTRIFORK logged.
[INFO] 74 users scraped so far.
[INFO] User mustachedkdev already scraped. Skipping.
[INFO] Scraping user: mustiodk
[INFO] Scraping user: musti

 76%|███████▌  | 25/33 [24:15<08:45, 65.74s/company]

User match mbetrifork logged.
[INFO] 76 users scraped so far.
Company must logged.
[INFO] Scraping users for company: netcompany
[INFO] Scraping user: netcompany
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
User match netcompany logged.
[INFO] 77 users scraped so far.
[INFO] Scraping user: AeroplaneMouse
User match AeroplaneMouse logged.
[INFO] 78 users scraped so far.
[INFO] Scraping user: ninkaninus
User match ninkaninus logged.
[INFO] 79 users scraped so far.
[INFO] Scraping user: ansolesen
User match ansolesen logged.
[INFO] 80 users scraped so far.
[INFO] Scraping user: HusseinElZein
User match HusseinElZein logged.
[INFO] 81 users scraped so far.
[INFO] Scraping user: barth010may
User match barth010may logged.
[INFO] 82 users scraped so far.
[INFO] Scraping user: Rene4100
User match Rene4100 logged.
[INFO] 83 users scraped so far.
[INFO] Scraping user: D-Kuzin
User match D-Kuzin logged.
[INFO] 84 users scrap

 79%|███████▉  | 26/33 [1:02:38<1:25:41, 734.43s/company]

User match UlmerDK logged.
[INFO] 133 users scraped so far.
Company netcompany logged.
[INFO] Scraping users for company: systematic
[INFO] Scraping user: jeme
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
User match jeme logged.
[INFO] 134 users scraped so far.
[INFO] Scraping user: michaelbui99
User match michaelbui99 logged.
[INFO] 135 users scraped so far.
[INFO] Scraping user: JonasAxelsen
User match JonasAxelsen logged.
[INFO] 136 users scraped so far.
[INFO] Scraping user: andreasmalling


Request GET /repos/andreasmalling/iptv/subscribers failed with 403: Forbidden
Request GET /repos/andreasmalling/iptv failed with 403: Forbidden


User match andreasmalling logged.
[INFO] 137 users scraped so far.
[INFO] Scraping user: SkouRene
User match SkouRene logged.
[INFO] 138 users scraped so far.
[INFO] Scraping user: EfrinGonzalez
User match EfrinGonzalez logged.
[INFO] 139 users scraped so far.
[INFO] Scraping user: tobiasfrisenborg
User match tobiasfrisenborg logged.
[INFO] 140 users scraped so far.
[INFO] Scraping user: SvenNielsen
User match SvenNielsen logged.
[INFO] 141 users scraped so far.
[INFO] Scraping user: hasnaAlina
User match hasnaAlina logged.
[INFO] 142 users scraped so far.
[INFO] Scraping user: JacobBangSSE


 82%|████████▏ | 27/33 [1:10:26<1:05:27, 654.53s/company]

User match JacobBangSSE logged.
[INFO] 143 users scraped so far.
Company systematic logged.
[INFO] Scraping users for company: capgemini
[INFO] Scraping user: Capgemini-Denmark-I-D-MLOps
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
User match Capgemini-Denmark-I-D-MLOps logged.
[INFO] 144 users scraped so far.
[INFO] Scraping user: sravansamudrala
User match sravansamudrala logged.
[INFO] 145 users scraped so far.
[INFO] Scraping user: saberesf


 85%|████████▍ | 28/33 [1:10:48<38:47, 465.42s/company]  

User match saberesf logged.
[INFO] 146 users scraped so far.
Company capgemini logged.
[INFO] Scraping users for company: sas institute


 88%|████████▊ | 29/33 [1:10:49<21:44, 326.12s/company]

0 - no users found
Company sas institute logged.
[INFO] Scraping users for company: eg a/s
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[INFO] Scraping user: EG-A-S
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
User match EG-A-S logged.
[INFO] 147 users scraped so far.
[INFO] Scraping user: EGByg
User match EGByg logged.
[INFO] 148 users scraped so far.
[INFO] Scraping user: EG-A-S-TEST


 91%|█████████ | 30/33 [1:15:01<15:12, 304.01s/company]

User match EG-A-S-TEST logged.
[INFO] 149 users scraped so far.
Company eg a/s logged.
[INFO] Scraping users for company: kmd


 94%|█████████▍| 31/33 [1:15:02<07:06, 213.21s/company]

0 - no users found
Company kmd logged.
[INFO] Scraping users for company: adform
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)


 97%|█████████▋| 32/33 [1:15:04<02:29, 149.62s/company]

1 - no users found
Company adform logged.
[INFO] Scraping users for company: proactivedk


100%|██████████| 33/33 [1:15:05<00:00, 136.52s/company]

0 - no users found
Company proactivedk logged.



