# 0. Importing packages

In [3]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 

# Enable autoreload for all modules
%autoreload 2

# Python 
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import json

# Custom Packages
from resources.github_functions import GithubScraper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0.1 File paths ###

In [4]:
# Root of the project on the mounted volume; every other path is derived from it.
fp_main = Path('/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device')
# Directory that receives the logs and scraped output written by later cells.
fp_main_output = fp_main / 'output'

# 1. Scraping users from the initial company list

## 1.2 Processing the company list into:
1. Dictionary containing company categories --> company_category
2. A zipped sequence of (company search string, location-filter flag, company label) triples, where the flag controls whether the location filter is applied when querying that company --> company_location_filter_bool_zip

In [5]:
# Read columns A:M of the company workbook
company_list_path = fp_main / 'company_list' / 'company_info_list211022.xlsx'
company_list_full = pd.read_excel(company_list_path, usecols='A:M')

# Keep only the rows flagged as part of the sample, renumbering them from 0
in_sample = company_list_full['part_of_firmaliste'] == 1
company_list_sample = company_list_full.loc[in_sample].reset_index(drop=True)

# Column subset used when building the scraping queries
list_of_company_names = company_list_sample[['søgeord', 'new_company_category', 'uden lokation']]

# Lower-cased search term -> company category (a repeated term keeps its last category)
company_category = {
    search_term: category
    for search_term, category in zip(
        company_list_sample['søgeord'].str.lower(),
        company_list_sample['new_company_category'],
    )
}

# Persist the mapping as JSONL: one {'søgeord', 'new_company_category'} object per line
category_dict_path = fp_main_output / 'company_category_dict.jsonl'
with open(category_dict_path, 'w', encoding='utf-8') as jsonl_file:
    for search_term, category in company_category.items():
        record = {'søgeord': search_term, 'new_company_category': category}
        jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')

In [6]:
# Build the (search query, location-filter flag, company label) triples that the
# scraping loop iterates over.
#
# The triples are materialised as a list on purpose: a bare zip() is a one-shot
# iterator, so once the scraping loop has consumed it, re-running that loop in the
# same kernel (e.g. to resume an interrupted scrape) would iterate over nothing
# and silently scrape no companies.
company_location_filter_bool_zip = list(zip(
    list_of_company_names['søgeord'],
    list_of_company_names['uden lokation'],
    company_list_sample['company_label_name']
))

# 2. Querying the company names and scraping users

## 2.1 Loading the logs in case a previous scrape was interrupted

In [7]:
# Recover the users and companies handled by earlier runs from the JSONL logs,
# so an interrupted scrape can be resumed without repeating work.

def _ensure_log_file(log_path, label):
    """Report whether the log file at `log_path` exists, creating it empty if not.

    `label` ('User' / 'Company') is only used in the printed message.
    """
    if log_path.exists():
        print(f"[INFO] {label} log exists: {log_path.name}")
    else:
        print(f"[INFO] {label} log does NOT exist. Creating new file: {log_path.name}")
        log_path.touch(exist_ok=True)


def _read_logged_values(log_path, key, label):
    """Return the set of `key` values found in the JSONL file at `log_path`.

    Blank lines are ignored. Lines that are not valid JSON, are not JSON
    objects, or lack `key` are reported with their line number and skipped,
    so one corrupt entry cannot abort the resume.
    """
    values = set()
    with open(log_path, 'r', encoding='utf-8') as log_file:
        for line_number, line in enumerate(log_file, start=1):
            if not line.strip():
                continue  # e.g. a trailing newline at the end of the file
            try:
                values.add(json.loads(line)[key])
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                print(f"[WARNING] Skipping malformed {label.lower()} line {line_number}: {err}")
    return values


user_log_file_name = 'first_tier_userinfo_user_log.jsonl'
company_log_file_name = 'first_tier_userinfo_company_log.jsonl'
user_log_path = fp_main_output / user_log_file_name
company_log_path = fp_main_output / company_log_file_name

# Make sure both logs exist before reading them
_ensure_log_file(user_log_path, 'User')
_ensure_log_file(company_log_path, 'Company')

# Logins and company names already covered by previous runs
users_already_scraped = _read_logged_values(user_log_path, 'user_login', 'User')
companies_already_scraped = _read_logged_values(company_log_path, 'company_name', 'Company')

[INFO] User log exists: first_tier_userinfo_user_log.jsonl
[INFO] Company log exists: first_tier_userinfo_company_log.jsonl


## 2.2 Instantiating the GithubScraper and scraping

In [8]:
# Build the scraper, seeding it with the users/companies recovered from the logs
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    repo_limit=300,
)
print(
    f'GitHub REST API ratelimit reset time for token {gs.current_token_key} is {gs.reset_time_point}. '
    f'That will be in a little less than {gs.reset_time_in_minutes} minutes.'
)

# Base name handed to gs.save_file for the scraped user rows
file_name = 'first_tier_userinfo'

# One iteration per company: query its users, scrape each new one, then log the company
company_progress = tqdm(company_location_filter_bool_zip, total=len(company_list_sample))
for search_query, skip_location_filter, company_label in company_progress:

    # Companies found in the company log were completed by an earlier run
    if company_label in gs.companies_already_scraped:
        print(f'[INFO] Company {company_label} already scraped. Skipping.')
        continue

    print(f'[INFO] Scraping users for company: {company_label}')

    users = gs.get_gh_users(search_query, skip_location_filter)

    # Second element of each pair is not used in this loop
    for named_user, _ in users:
        login = named_user.login

        # A login may already be known from the user log or from an earlier company
        if login in gs.users_already_scraped:
            print(f'[INFO] User {login} already scraped. Skipping.')
            continue

        print(f'[INFO] Scraping user: {login}')
        gs.users_already_scraped.add(login)

        # get_user_info returns None for users that should not be kept
        # (per the original note: repo limit exceeded or no match)
        user_row = gs.get_user_info(named_user, company_label)
        if user_row is not None:
            # Read the match details before saving, in the same order as before
            location_match = user_row.location
            inferred_company = user_row.inferred_company
            matched_company_strings = user_row.matched_company_strings

            # NOTE(review): remove_existing_file=True is passed on every save; confirm in
            # GithubScraper.save_file that this does not discard rows saved earlier.
            gs.save_file(user_row, file_name, remove_existing_file=True)
            gs.log_user_w_match(login, inferred_company, matched_company_strings, location_match, user_log_path)

            print(f'[INFO] {gs.USERS_SCRAPED} users scraped so far.')
        # NOTE(review): users yielding None are not written to the user log by this
        # loop, so presumably they are fetched again if the run is resumed - confirm.

    # The company is logged only after all of its users were processed
    gs.log_company(company_label, company_log_path)

GithubScraper initialized with 3 tokens.
First token in cycle. Initiating ACCESS_TOKEN_1.
GithubScraper initialized with 53 companies and 223 users already scraped.
GitHub REST API ratelimit reset time for token ACCESS_TOKEN_1 is 2025-06-02 11:55:13. That will be in a little less than 37 minutes.


  0%|          | 0/64 [00:00<?, ?it/s]

[INFO] Company nodes already scraped. Skipping.
[INFO] Company abtion already scraped. Skipping.
[INFO] Company heyday already scraped. Skipping.
[INFO] Company trifork already scraped. Skipping.
[INFO] Company frontit already scraped. Skipping.
[INFO] Company holion already scraped. Skipping.
[INFO] Company kruso already scraped. Skipping.
[INFO] Company pandiweb already scraped. Skipping.
[INFO] Company uptime already scraped. Skipping.
[INFO] Company charlie tango already scraped. Skipping.
[INFO] Company ffw already scraped. Skipping.
[INFO] Company mysupport already scraped. Skipping.
[INFO] Company shape already scraped. Skipping.
[INFO] Company makeable already scraped. Skipping.
[INFO] Company mustache already scraped. Skipping.
[INFO] Company house of code already scraped. Skipping.
[INFO] Company greener pastures already scraped. Skipping.
[INFO] Company axla already scraped. Skipping.
[INFO] Company snapp already scraped. Skipping.
[INFO] Company appscaptain already scraped.

 84%|████████▍ | 54/64 [14:38<02:42, 16.26s/it]

User match ahorTV2 logged.
[INFO] 239 users scraped so far.
Company tv2 logged.
[INFO] Scraping users for company: pentia
[INFO] Scraping user: JakobChristensen
User match JakobChristensen logged.
[INFO] 240 users scraped so far.
[INFO] Scraping user: PentiaLabs
User match PentiaLabs logged.
[INFO] 241 users scraped so far.
[INFO] Scraping user: pentia-mobile
User match pentia-mobile logged.
[INFO] 242 users scraped so far.
[INFO] Scraping user: chhoejgaard


 86%|████████▌ | 55/64 [22:06<04:06, 27.39s/it]

User match chhoejgaard logged.
[INFO] 243 users scraped so far.
Company pentia logged.
[INFO] Scraping users for company: zervme
[INFO] Scraping user: ZervMe


 88%|████████▊ | 56/64 [22:11<03:32, 26.62s/it]

User match ZervMe logged.
[INFO] 244 users scraped so far.
Company zervme logged.
[INFO] Scraping users for company: skat
[INFO] Scraping user: skat
Token cycled to ACCESS_TOKEN_2.
Cycle
User match skat logged.
[INFO] 245 users scraped so far.
[INFO] Scraping user: skat-lab
User match skat-lab logged.
[INFO] 246 users scraped so far.
[INFO] Scraping user: skat-lj
User match skat-lj logged.
[INFO] 247 users scraped so far.
[INFO] Scraping user: nc-tpe


Request GET /users/nc-tpe/followers failed with 403: Forbidden
Setting next backoff to 622.094251s


User match nc-tpe logged.
[INFO] 248 users scraped so far.
[INFO] Scraping user: skatvaamsi
User match skatvaamsi logged.
[INFO] 249 users scraped so far.
[INFO] Scraping user: Skatteministeriet
[INFO] Scraping user: nc-law
User match nc-law logged.
[INFO] 250 users scraped so far.
[INFO] Scraping user: nc-llh


 89%|████████▉ | 57/64 [36:25<07:45, 66.56s/it]

User match nc-llh logged.
[INFO] 251 users scraped so far.
Company skat logged.
[INFO] Scraping users for company: codefort
[INFO] Scraping user: CodeForTravel
User match CodeForTravel logged.
[INFO] 252 users scraped so far.
[INFO] Scraping user: codefort-io
User match codefort-io logged.
[INFO] 253 users scraped so far.
[INFO] Scraping user: codefortbot


 91%|█████████ | 58/64 [37:50<06:46, 67.73s/it]

User match codefortbot logged.
[INFO] 254 users scraped so far.
Company codefort logged.
[INFO] Scraping users for company: reepay
[INFO] Scraping user: reepay


 92%|█████████▏| 59/64 [39:16<05:46, 69.30s/it]

User match reepay logged.
[INFO] 255 users scraped so far.
Company reepay logged.
[INFO] Scraping users for company: diviso
[INFO] User MySupport-aps already scraped. Skipping.
[INFO] Scraping user: lassewq


 94%|█████████▍| 60/64 [39:19<04:08, 62.18s/it]

User match lassewq logged.
[INFO] 256 users scraped so far.
Company diviso logged.
[INFO] Scraping users for company: uni-soft
[INFO] Scraping user: uni-soft
[INFO] Scraping user: uni-software
[INFO] Scraping user: UNI-Software-House
[INFO] Scraping user: UNI-Software-II


 95%|█████████▌| 61/64 [39:24<02:43, 54.47s/it]

Company uni-soft logged.
[INFO] Scraping users for company: delegateas
[INFO] Scraping user: delegateas
User match delegateas logged.
[INFO] 257 users scraped so far.
[INFO] Scraping user: JonasGLund99
User match JonasGLund99 logged.
[INFO] 258 users scraped so far.
[INFO] Scraping user: mkholt


 97%|█████████▋| 62/64 [49:16<04:41, 140.53s/it]

User match mkholt logged.
[INFO] 259 users scraped so far.
Company delegateas logged.
[INFO] Scraping users for company: proactivedk
[INFO] Scraping user: proactivedk


 98%|█████████▊| 63/64 [49:22<01:55, 115.50s/it]

User match proactivedk logged.
[INFO] 260 users scraped so far.
Company proactivedk logged.
[INFO] Scraping users for company: monstarlab


100%|██████████| 64/64 [49:23<00:00, 46.30s/it] 

0 - no users found
Company monstarlab logged.



