# 0. Importing packages

In [1]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 

# Enable autoreload for all modules
%autoreload 2

# Python 
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import json

# Custom Packages
from resources.github_functions import GithubScraper

## 0.1 File paths ###

In [2]:
# Project root folder.
# NOTE(review): absolute, machine-specific path — the volume has to be mounted for the notebook to run.
fp_main = Path('/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device')

# Folder that receives the files this notebook writes (category dictionary and scrape logs).
fp_main_output = fp_main / 'output'

# 1. Scraping users from the initial company list

## 1.2 Processing the company list into:
1. Dictionary containing company categories --> company_category
2. A zipped sequence containing the company string to query when scraping, a location-filter dummy indicating whether the location filter should apply when querying that company, and the company label --> company_location_filter_bool_zip

In [3]:
# Read columns A–M of the company spreadsheet
company_list_path = fp_main / 'company_list' / 'company_info_list211022.xlsx'
company_list_full = pd.read_excel(company_list_path, usecols='A:M')

# Keep only the rows flagged as part of our sample and renumber them from 0
in_sample = company_list_full['part_of_firmaliste'] == 1
company_list_sample = company_list_full[in_sample].reset_index(drop=True)

# Column subset holding the search string, the company category and the 'uden lokation' flag
list_of_company_names = company_list_sample.loc[
    :, ['søgeord', 'new_company_category', 'uden lokation']
]

# Lower-cased search string -> company category
# (if two rows share the same lower-cased search string, the later row wins)
company_category = {
    search_term: category_label
    for search_term, category_label in zip(
        company_list_sample['søgeord'].str.lower(),
        company_list_sample['new_company_category'],
    )
}

# Persist the mapping as JSONL: one {'søgeord', 'new_company_category'} object per line.
# ensure_ascii=False keeps non-ASCII characters readable in the file instead of \u-escaping them.
with open(fp_main_output / 'company_category_dict.jsonl', 'w', encoding='utf-8') as f:
    for company, category in company_category.items():
        record = {'søgeord': company, 'new_company_category': category}
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

In [4]:
# Build (search string, 'uden lokation' flag, company label) triples for the scraping loop.
# The triples are materialised as a list instead of being left as a bare zip(): a zip object
# is a one-shot iterator, so once the scraping loop has consumed it, re-running that loop
# (e.g. after an interrupted scrape) would silently iterate over nothing until this cell
# was re-run as well. A list can be iterated any number of times.
company_location_filter_bool_zip = list(zip(
    list_of_company_names['søgeord'],
    list_of_company_names['uden lokation'],
    company_list_sample['company_label_name']
))

# 2. Querying the company names and scraping users

## 2.1 Loading the logs in case the scrape has been interrupted

In [5]:
def _ensure_log_file(log_path, label):
    """Report whether the log file exists and create it (empty) if it does not.

    Args:
        log_path: ``pathlib.Path`` of the JSONL log file.
        label: Capitalised log name used in the printed message (e.g. ``'User'``).
    """
    if log_path.exists():
        print(f"[INFO] {label} log exists: {log_path.name}")
    else:
        print(f"[INFO] {label} log does NOT exist. Creating new file: {log_path.name}")
        log_path.touch(exist_ok=True)


def _read_logged_values(log_path, key, label):
    """Collect the value stored under ``key`` on every line of a JSONL log.

    Lines that cannot be used are reported with a warning and skipped instead of
    aborting the read: invalid JSON, a JSON object without ``key``, or JSON that is
    not an object at all (e.g. a list or a number). Blank lines are skipped silently.

    Args:
        log_path: ``pathlib.Path`` of an existing JSONL log file.
        key: Name of the field to collect from each logged object.
        label: Lower-case log name used in the warning message (e.g. ``'user'``).

    Returns:
        A ``set`` with the collected values.
    """
    values = set()
    with open(log_path, 'r', encoding='utf-8') as f:
        for line in f:
            # An empty line (e.g. a trailing newline) carries no record and is not an error
            if not line.strip():
                continue
            try:
                values.add(json.loads(line)[key])
            # TypeError covers valid JSON that is not an object (list, number, string, null)
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                print(f"[WARNING] Skipping malformed {label} line: {err}")
    return values


# Locations of the two resume logs inside the output folder
user_log_file_name = 'first_tier_userinfo_user_log.jsonl'
company_log_file_name = 'first_tier_userinfo_company_log.jsonl'
user_log_path = fp_main_output / user_log_file_name
company_log_path = fp_main_output / company_log_file_name

# Ensure both files exist (and say so) before anything is read
_ensure_log_file(user_log_path, 'User')
_ensure_log_file(company_log_path, 'Company')

# Users and companies recorded by a previous (possibly interrupted) run
users_already_scraped = _read_logged_values(user_log_path, 'user_login', 'user')
companies_already_scraped = _read_logged_values(company_log_path, 'company_name', 'company')

[INFO] User log exists: first_tier_userinfo_user_log.jsonl
[INFO] Company log exists: first_tier_userinfo_company_log.jsonl


## 2.2 Instantiating the GithubScraper and scraping

In [None]:
# 1. Set up the scraper, handing it the users and companies read from the logs
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    repo_limit=300
)
rate_limit_message = (
    f'GitHub REST API ratelimit reset time for token {gs.current_token_key} is {gs.reset_time_point}. '
    f'That will be in a little less than {gs.reset_time_in_minutes} minutes.'
)
print(rate_limit_message)

# 2. Name handed to gs.save_file for the scraped user rows
file_name = 'first_tier_userinfo'

# 3. One iteration per company: (search string, 'uden lokation' flag, company label)
company_progress = tqdm(company_location_filter_bool_zip, total=len(company_list_sample))
for search_query, skip_location_filter, company_label in company_progress:

    # 3.1 Nothing to do for a company that is already registered as scraped
    if company_label in gs.companies_already_scraped:
        print(f'[INFO] Company {company_label} already scraped. Skipping.')
        continue

    print(f'[INFO] Scraping users for company: {company_label}')

    # 3.2 + 3.3 Fetch the users for this company and walk through them;
    # the second element of each pair is not used in this loop
    for named_user, _company in gs.get_gh_users(search_query, skip_location_filter):
        login = named_user.login

        # 3.3.1 A login seen before (from the log or earlier in this run) is not scraped again
        if login in gs.users_already_scraped:
            print(f'[INFO] User {login} already scraped. Skipping.')
            continue

        print(f'[INFO] Scraping user: {login}')
        # Registered before the fetch, so the login counts as seen even if no row comes back
        gs.users_already_scraped.add(login)

        # 3.3.2 Fetch the user's info; None means the user did not meet the scraping
        # criteria (repo limit exceeded or no match) and is left out
        user_row = gs.get_user_info(named_user, company_label)
        if user_row is None:
            continue

        # 3.3.3 Pull the match details off the row before it is saved
        location_match, inferred_company, matched_company_strings = (
            user_row.location,
            user_row.inferred_company,
            user_row.matched_company_strings,
        )

        # 3.3.4 Store the row, then record the user in the user log
        # NOTE(review): remove_existing_file=True is passed on every call — confirm in
        # GithubScraper.save_file that this does not discard rows saved earlier.
        gs.save_file(user_row, file_name, remove_existing_file=True)
        gs.log_user_w_match(login, inferred_company, matched_company_strings, location_match, user_log_path)

        print(f'[INFO] {gs.USERS_SCRAPED} users scraped so far.')

    # 3.4 Record the company in the company log only after all of its users were handled
    gs.log_company(company_label, company_log_path)

GithubScraper initialized with 3 tokens.
First token in cycle. Initiating ACCESS_TOKEN_1.
GithubScraper initialized with 35 companies and 185 users already scraped.
GitHub REST API ratelimit reset time for token ACCESS_TOKEN_1 is 2025-05-28 14:29:41. That will be in a little less than 30 minutes.


  0%|          | 0/64 [00:00<?, ?it/s]

[INFO] Company nodes already scraped. Skipping.
[INFO] Company abtion already scraped. Skipping.
[INFO] Company heyday already scraped. Skipping.
[INFO] Company trifork already scraped. Skipping.
[INFO] Company frontit already scraped. Skipping.
[INFO] Company holion already scraped. Skipping.
[INFO] Company kruso already scraped. Skipping.
[INFO] Company pandiweb already scraped. Skipping.
[INFO] Company uptime already scraped. Skipping.
[INFO] Company charlie tango already scraped. Skipping.
[INFO] Company ffw already scraped. Skipping.
[INFO] Company mysupport already scraped. Skipping.
[INFO] Company shape already scraped. Skipping.
[INFO] Company makeable already scraped. Skipping.
[INFO] Company mustache already scraped. Skipping.
[INFO] Company house of code already scraped. Skipping.
[INFO] Company greener pastures already scraped. Skipping.
[INFO] Company axla already scraped. Skipping.
[INFO] Company snapp already scraped. Skipping.
[INFO] Company appscaptain already scraped.

Request GET /users/elsewhat failed with 403: Forbidden
Setting next backoff to 581.257467s


User match PederHP logged.
[INFO] 186 users scraped so far.
[INFO] Scraping user: michelandresaxo
User match michelandresaxo logged.
[INFO] 187 users scraped so far.
[INFO] Scraping user: jorgeta
User match jorgeta logged.
[INFO] 188 users scraped so far.
[INFO] Scraping user: phccdk
User match phccdk logged.
[INFO] 189 users scraped so far.
[INFO] Scraping user: hannefolmer


 56%|█████▋    | 36/64 [31:24<24:25, 52.34s/it]

User match hannefolmer logged.
[INFO] 190 users scraped so far.
Company saxo bank logged.
[INFO] Scraping users for company: kabellmunk
[INFO] Scraping user: kabellmunk


 58%|█████▊    | 37/64 [31:27<22:42, 50.47s/it]

User match kabellmunk logged.
[INFO] 191 users scraped so far.
Company kabellmunk logged.
[INFO] Scraping users for company: dgi-it
[INFO] Scraping user: DGI-IT-zz


 59%|█████▉    | 38/64 [31:28<20:46, 47.93s/it]

Company dgi-it logged.
[INFO] Scraping users for company: ørsted
[INFO] Scraping user: HansOersted
[INFO] Scraping user: Orsted
User match Orsted logged.
[INFO] 192 users scraped so far.
[INFO] Scraping user: zlin
[INFO] Scraping user: 0rsted
[INFO] Scraping user: H-C-Orsted-Gym
[INFO] Scraping user: Programmering-B


 61%|██████    | 39/64 [31:42<18:59, 45.56s/it]

Company ørsted logged.
[INFO] Scraping users for company: nuuday
[INFO] Scraping user: nuuday
User match nuuday logged.
[INFO] 193 users scraped so far.
[INFO] Scraping user: HilderscheidNuuday
User match HilderscheidNuuday logged.
[INFO] 194 users scraped so far.
[INFO] Scraping user: niejo
User match niejo logged.
[INFO] 195 users scraped so far.
[INFO] Scraping user: A60753


 62%|██████▎   | 40/64 [34:22<22:19, 55.81s/it]

Company nuuday logged.
[INFO] Scraping users for company: yousee
[INFO] Scraping user: YouSee
User match YouSee logged.
[INFO] 196 users scraped so far.
[INFO] Scraping user: youseedk
User match youseedk logged.
[INFO] 197 users scraped so far.
[INFO] Scraping user: JakobGrosen
User match JakobGrosen logged.
[INFO] 198 users scraped so far.
[INFO] Scraping user: YouSeeThisName
User match YouSeeThisName logged.
[INFO] 199 users scraped so far.
[INFO] Scraping user: YouseeJenkinsCI
User match YouseeJenkinsCI logged.
[INFO] 200 users scraped so far.
[INFO] Scraping user: anirbantdc


 64%|██████▍   | 41/64 [39:15<31:41, 82.66s/it]

User match anirbantdc logged.
[INFO] 201 users scraped so far.
[INFO] User niejo already scraped. Skipping.
Company yousee logged.
[INFO] Scraping users for company: relatel
[INFO] Scraping user: relatel


 66%|██████▌   | 42/64 [42:08<34:56, 95.29s/it]

User match relatel logged.
[INFO] 202 users scraped so far.
Company relatel logged.
[INFO] Scraping users for company: cphapp
[INFO] Scraping user: cphapp


 67%|██████▋   | 43/64 [42:15<28:13, 80.65s/it]

User match cphapp logged.
[INFO] 203 users scraped so far.
Company cphapp logged.
[INFO] Scraping users for company: commentor
[INFO] Scraping user: commentorARM
User match commentorARM logged.
[INFO] 204 users scraped so far.
[INFO] Scraping user: MichaelBoPoulsen
User match MichaelBoPoulsen logged.
[INFO] 205 users scraped so far.
[INFO] Scraping user: PIHCommentor


 69%|██████▉   | 44/64 [42:25<22:21, 67.06s/it]

User match PIHCommentor logged.
[INFO] 206 users scraped so far.
Company commentor logged.
[INFO] Scraping users for company: nabto
[INFO] Scraping user: nabto


 70%|███████   | 45/64 [47:42<38:15, 120.82s/it]

User match nabto logged.
[INFO] 207 users scraped so far.
Company nabto logged.
[INFO] Scraping users for company: jobindex
[INFO] Scraping user: jobindex
User match jobindex logged.
[INFO] 208 users scraped so far.
[INFO] Scraping user: Eckankar
User match Eckankar logged.
[INFO] 209 users scraped so far.
[INFO] Scraping user: Jobindex-LH
[INFO] Scraping user: Jobindexworld


 72%|███████▏  | 46/64 [54:27<56:16, 187.56s/it]

Company jobindex logged.
[INFO] Scraping users for company: miracle
[INFO] User miracle-as already scraped. Skipping.
[INFO] User gahms already scraped. Skipping.
[INFO] Scraping user: WSAudiology
User match WSAudiology logged.
[INFO] 210 users scraped so far.
[INFO] Scraping user: Tahulrik


 73%|███████▎  | 47/64 [54:41<40:45, 143.86s/it]

User match Tahulrik logged.
[INFO] 211 users scraped so far.
Company miracle logged.
[INFO] Scraping users for company: immeo
[INFO] Scraping user: immeodk


 75%|███████▌  | 48/64 [54:48<28:43, 107.73s/it]

User match immeodk logged.
[INFO] 212 users scraped so far.
Company immeo logged.
[INFO] Scraping users for company: siteimprove
[INFO] Scraping user: Siteimprove
User match Siteimprove logged.
[INFO] 213 users scraped so far.
[INFO] Scraping user: henrikklarup
User match henrikklarup logged.
[INFO] 214 users scraped so far.
[INFO] Scraping user: mostergaard
User match mostergaard logged.
[INFO] 215 users scraped so far.
[INFO] Scraping user: dcamsiteimprove
User match dcamsiteimprove logged.
[INFO] 216 users scraped so far.
[INFO] Scraping user: martinatsiteimprove
User match martinatsiteimprove logged.
[INFO] 217 users scraped so far.
[INFO] Scraping user: SorenHarderQESI
User match SorenHarderQESI logged.
[INFO] 218 users scraped so far.
[INFO] Scraping user: platops-siteimprove


 77%|███████▋  | 49/64 [1:01:03<45:16, 181.08s/it]

User match platops-siteimprove logged.
[INFO] 219 users scraped so far.
Company siteimprove logged.
[INFO] Scraping users for company: cbrain
[INFO] Scraping user: cBrain-dk
User match cBrain-dk logged.
[INFO] 220 users scraped so far.
[INFO] Scraping user: cBrainAI


 78%|███████▊  | 50/64 [1:01:33<32:21, 138.65s/it]

User match cBrainAI logged.
[INFO] 221 users scraped so far.
Company cbrain logged.
[INFO] Scraping users for company: deon digital
[INFO] Scraping user: deondigital
[INFO] Scraping user: DeonDigitalMarketingandWebAgency


 80%|███████▉  | 51/64 [1:01:36<21:35, 99.67s/it] 

Company deon digital logged.
[INFO] Scraping users for company: pwc
[INFO] Scraping user: pwcdk-emil


 81%|████████▏ | 52/64 [1:01:39<14:20, 71.67s/it]

User match pwcdk-emil logged.
[INFO] 222 users scraped so far.
Company pwc logged.
[INFO] Scraping users for company: studiesandme
[INFO] Scraping user: StudiesAndMe
[INFO] Scraping user: studiesandme-machine


 83%|████████▎ | 53/64 [1:01:42<09:25, 51.37s/it]

Company studiesandme logged.
[INFO] Scraping users for company: tv2
[INFO] Scraping user: tv2
User match tv2 logged.
[INFO] 223 users scraped so far.
[INFO] Scraping user: quartercastle
Token cycled to ACCESS_TOKEN_2.
Cycle


Request GET /users/m110 failed with 403: Forbidden
Setting next backoff to 1222.600527s
