# 0. Importing packages

In [1]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 

# Enable autoreload for all modules
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from itertools import chain
from tqdm import tqdm

# Custom functions
from resources.github_functions import GithubScraper

## 0.1 File Paths

In [3]:
fp_main = Path('/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device')
fp_main_output = Path(fp_main / 'output')

# 1 Filtering users and making (named-user,company)-list

## 1.1 Loading in the data on first tier users
> For scrape and sorting of first tier see the notebooks:
* 1. Collecting first-tier users.ipynb
* 2. Sorting first-tier users.ipynb


In [4]:
first_tier_info = pd.read_parquet(fp_main_output / 'first_tier_ties_sorted.parquet.gzip')

### 1.2 Creating a dataframe where a row is a company with a list of potential second tier users

In [5]:
# Aggregate potential second users for each company in the second tier
second_tier_users_and_company = (
    first_tier_info.groupby('search_with_company', as_index=False)['unique_ties']
    .agg(lambda x: list(chain.from_iterable(x)))
)

# Calculate total number of potential second-tier users
numb_of_second_tier_users = second_tier_users_and_company['unique_ties'].str.len().sum()

print(numb_of_second_tier_users)

27752


# 2.2 Instantiating the GithubScraper and scrape

In [6]:
import json
from pathlib import Path

# Initialize sets for tracking
users_already_scraped = set()
companies_already_scraped = set()
users_attempted_scraped = set()

# Paths
first_tier_user_log_file = 'first_tier_userinfo_user_log.jsonl'
second_tier_user_log_file = 'second_tier_userinfo_user_log.jsonl'
users_attempted_scrape_file = 'users_attempted_scrape.jsonl'

first_tier_user_log_path = fp_main_output / first_tier_user_log_file
second_tier_user_log_path = fp_main_output / second_tier_user_log_file
users_attempted_scrape_path = fp_main_output / users_attempted_scrape_file

def ensure_file_exists(path: Path):
    if not path.exists():
        print(f"[INFO] File does not exist. Creating: {path.name}")
        path.touch(exist_ok=True)
    else:
        print(f"[INFO] File exists: {path.name}")

def load_users_from_log(path: Path):
    users = set()
    if path.exists():
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    user_info = json.loads(line)
                    users.add(user_info["user_login"])
                except (json.JSONDecodeError, KeyError) as err:
                    print(f"[WARNING] Skipping malformed user line in {path.name}: {err}")
    return users

# Ensure all files exist
for path in [first_tier_user_log_path, second_tier_user_log_path, users_attempted_scrape_path]:
    ensure_file_exists(path)

# Populate sets
users_already_scraped |= load_users_from_log(first_tier_user_log_path)
users_already_scraped |= load_users_from_log(second_tier_user_log_path)
users_already_attempted = load_users_from_log(users_attempted_scrape_path)

[INFO] File exists: first_tier_userinfo_user_log.jsonl
[INFO] File exists: second_tier_userinfo_user_log.jsonl
[INFO] File exists: users_attempted_scrape.jsonl


In [None]:
# 1. Create instance of GithubScraper
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    users_already_attempted=users_already_attempted,
    repo_limit=300
)

second_tier_users_to_scrape = {
    user: row['search_with_company']
    for _, row in second_tier_users_and_company.iterrows()
    for user in row['unique_ties']
}

print(f'GitHub REST API ratelimit reset time for token {gs.current_token_key} is {gs.reset_time_point}. '
      f'That will be in a little less than {gs.reset_time_in_minutes} minutes.')

# 2. Define output file name
file_name = 'second_tier_userinfo'

# 3. Loop through company queries
for user, search_with_company in tqdm(second_tier_users_to_scrape.items()):

    # 3.3 Check if user is already scraped
    if user in gs.users_already_attempted:
        print(f'[INFO] User {user} already scraped. Skipping.')
        continue

    # Log user to the set of already attempted users
    gs.log_user_scrape_attempt(user, users_attempted_scrape_path)
    gs.users_already_attempted.add(user)

    # 3.1 Get user from the flattened dictionary
    named_user = gs.get_user(user)

    # 3.2 Check if user is None (e.g., if user is not found)
    if named_user is None:
        continue

    # 3.5 Check if user is a relevant user (DK + company)
    user_row = gs.get_user_info(named_user, search_with_company)
    if user_row is None:
        continue  # Skip user if they don't meet scraping criteria

    # 3.3.3 Extract match data
    location_match = user_row.location
    inferred_company = user_row.inferred_company
    matched_company_strings = user_row.matched_company_strings

    # 3.3.4 Save user info and log result
    gs.save_file(user_row, file_name, remove_existing_file=True)
    gs.log_user_w_match(named_user.login, inferred_company, matched_company_strings, location_match, second_tier_user_log_path)
    
    print(f'[INFO] {gs.USERS_SCRAPED} users scraped so far.')

GithubScraper initialized with 3 tokens.
First token in cycle. Initiating ACCESS_TOKEN_1.
GithubScraper initialized with 0 companies and 426 users already scraped.
GitHub REST API ratelimit reset time for token ACCESS_TOKEN_1 is 2025-06-16 12:08:17. That will be in a little less than 39 minutes.


  0%|          | 0/25924 [00:00<?, ?it/s]

[INFO] User karuncs already scraped. Skipping.
[INFO] User heatherm already scraped. Skipping.
[INFO] User runephilosof already scraped. Skipping.
[INFO] User jekyll already scraped. Skipping.
[INFO] User GeoffAbtion already scraped. Skipping.
[INFO] User Aberen already scraped. Skipping.
[INFO] User runephilosof-abtion already scraped. Skipping.
[INFO] User RobWu already scraped. Skipping.
[INFO] User jeppester already scraped. Skipping.
[INFO] User synth already scraped. Skipping.
[INFO] User slamidtfyn already scraped. Skipping.
[INFO] User adionditsak already scraped. Skipping.
[INFO] User bokh already scraped. Skipping.
[INFO] User namuit already scraped. Skipping.
[INFO] User GuldbekLEGO already scraped. Skipping.
[INFO] User martinvintherp already scraped. Skipping.
[INFO] User allowishus-dev already scraped. Skipping.
[INFO] User finnpedersenkazes already scraped. Skipping.
[INFO] User ozf already scraped. Skipping.
[INFO] User MadsZeneli already scraped. Skipping.
[INFO] User 

 75%|███████▌  | 19486/25924 [00:23<00:10, 588.75it/s] 

User ai-ml-architect has more than 300 repos, skipping.


 75%|███████▌  | 19495/25924 [00:53<00:41, 155.20it/s]

User fkorotkov has more than 300 repos, skipping.


 75%|███████▌  | 19528/25924 [02:12<10:17, 10.36it/s] 

User match adrian-shape logged.
[INFO] 427 users scraped so far.


 75%|███████▌  | 19529/25924 [02:14<10:46,  9.89it/s]

User boxen has more than 300 repos, skipping.


 75%|███████▌  | 19551/25924 [03:21<1:24:15,  1.26it/s]

User bbqsrc has more than 300 repos, skipping.


 75%|███████▌  | 19560/25924 [03:42<1:59:47,  1.13s/it]

User xiamuguizhi has more than 300 repos, skipping.


 76%|███████▌  | 19587/25924 [04:55<3:36:28,  2.05s/it]

User kauandotnet has more than 300 repos, skipping.


 76%|███████▌  | 19597/25924 [05:09<2:08:55,  1.22s/it]

User haikusw has more than 300 repos, skipping.


 76%|███████▌  | 19628/25924 [06:35<8:14:46,  4.72s/it]

User match shape-matheusfaleiro logged.
[INFO] 428 users scraped so far.


 76%|███████▌  | 19630/25924 [06:42<6:43:33,  3.85s/it]

User shuxiaokai has more than 300 repos, skipping.


 76%|███████▌  | 19636/25924 [06:55<3:23:18,  1.94s/it]

User ducky007 has more than 300 repos, skipping.


 76%|███████▌  | 19640/25924 [07:02<3:29:26,  2.00s/it]

User match gconde-shape logged.
[INFO] 429 users scraped so far.


 76%|███████▌  | 19674/25924 [08:35<3:01:49,  1.75s/it]

User tmm1 has more than 300 repos, skipping.


 76%|███████▌  | 19700/25924 [09:32<3:41:11,  2.13s/it]

User TomasHubelbauer has more than 300 repos, skipping.


 76%|███████▌  | 19717/25924 [10:34<10:32:52,  6.12s/it]

User match AresL logged.
[INFO] 430 users scraped so far.


 76%|███████▋  | 19772/25924 [15:42<75:36:00, 44.24s/it]

User match ulrikandersen logged.
[INFO] 431 users scraped so far.


 76%|███████▋  | 19801/25924 [16:45<2:15:27,  1.33s/it] 