# 0. Importing packages

In [5]:
# Load the autoreload extension so that edited project modules (such as the
# `resources` package imported in the next cell) are re-imported before code
# is executed, which avoids restarting the kernel after every change.
%load_ext autoreload 

# Mode 2: enable autoreload for all modules
%autoreload 2

In [6]:
import json
import pandas as pd
from pathlib import Path
from itertools import chain
from tqdm import tqdm

# Custom functions
from resources.github_functions import GithubScraper
from resources.filter_functions import filter_ties

GitHub access token collected from config


## 0.1 File Paths

In [7]:
# All directory locations are taken from the resources.filepaths module
import resources.filepaths as fp

# Paths used for regular project input/output
fp_main, fp_main_output = fp.fp_main, fp.fp_main_output

# Paths used for data that has to go to the external s-drive
fp_main_external, fp_output_external = fp.fp_main_external, fp.fp_output_external

# 1. Load in the second tier company list

## 1.2 Processing the company list

In [None]:
# Load the list of companies to query
second_tier_companies = pd.read_csv(fp_main_output / "second_tier_companies.csv")

# Keep only the columns used for querying and categorising companies
list_of_company_info = second_tier_companies[
    [
        "company_search_keyword",
        "company_category",
        "without_location_filter",
        "company_label_name",
    ]
]

# Map each search keyword to its company category
company_category_map = dict(
    zip(
        list_of_company_info["company_search_keyword"],
        list_of_company_info["company_category"],
    )
)

# Output the company category map.
# The file is still opened in append mode so that lines written earlier are
# kept, but a line that is already present is not written again. Without this
# check every re-run of the cell duplicated the whole map in the file.
company_category_map_path = fp_main_output / "company_category_map.jsonl"

if company_category_map_path.exists():
    with open(company_category_map_path, "r", encoding="utf-8") as f:
        lines_already_written = {line.rstrip("\n") for line in f}
else:
    lines_already_written = set()

with open(company_category_map_path, "a", encoding="utf-8") as f:
    for keyword, category in company_category_map.items():
        json_line = json.dumps(
            {"company_search_keyword": keyword, "company_category": category},
            ensure_ascii=False,
        )
        # Compare the serialised line itself so only exact duplicates are skipped
        if json_line in lines_already_written:
            continue
        f.write(json_line + "\n")
        lines_already_written.add(json_line)

In [None]:
# Pair up (search keyword, without_location_filter flag, label name) for each
# company to loop through when querying companies.
# The pairs are materialised as a list: a bare zip object is a one-shot
# iterator, so re-running the scraping cell (e.g. after an interruption or a
# rate-limit pause) without re-running this cell would loop over nothing.
company_location_filter_bool_zip = list(
    zip(
        list_of_company_info["company_search_keyword"],
        list_of_company_info["without_location_filter"],
        list_of_company_info["company_label_name"],
    )
)

# 2. Repeating the querying of second-tier company names and scraping users

## 2.1 Loading in scrapelogs

In [None]:
# Logins and company names recorded by earlier runs
users_already_scraped = set()
companies_already_scraped = set()

# Locations of the two JSONL scrape logs
user_log_file_name = "first_tier_user_scrapelog.jsonl"
company_log_file_name = "company_scrapelog.jsonl"
user_log_path = fp_output_external / user_log_file_name
company_log_path = fp_main_output / company_log_file_name

# Create a missing log as an empty file; otherwise report that it was found
if not user_log_path.exists():
    user_log_path.touch(exist_ok=True)
else:
    print(f"[INFO] User log exists: {user_log_path.name}")

if not company_log_path.exists():
    company_log_path.touch(exist_ok=True)
else:
    print(f"[INFO] Company log exists: {company_log_path.name}")

# User log: one JSON object per line, the login is stored under "user_login"
with open(user_log_path, "r", encoding="utf-8") as log_file:
    for log_line in log_file:
        try:
            users_already_scraped.add(json.loads(log_line)["user_login"])
        except (json.JSONDecodeError, KeyError) as err:
            print(f"[WARNING] Skipping malformed user line: {err}")

# Company log: one JSON object per line, the name is stored under "company_name"
with open(company_log_path, "r", encoding="utf-8") as log_file:
    for log_line in log_file:
        try:
            companies_already_scraped.add(json.loads(log_line)["company_name"])
        except (json.JSONDecodeError, KeyError) as err:
            print(f"[WARNING] Skipping malformed company line: {err}")

[INFO] User log exists: first_tier_user_scrapelog.jsonl
[INFO] Company log exists: company_scrapelog.jsonl


## 2.2 Instantiating the GithubScraper and scrape

In [None]:
# Build the scraper, handing it the users and companies read from the scrape logs
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    repo_limit=50,
)
print(
    f"GitHub REST API ratelimit reset time for token is {gs.reset_time_point}. "
    f"That will be in a little less than {gs.reset_time_in_minutes} minutes."
)

# Output file name passed to gs.save_file for every saved user row
file_name = "first_tier_userinfo"

# Walk through the company queries with a progress bar
company_queries = tqdm(
    company_location_filter_bool_zip, total=len(second_tier_companies)
)
for search_query, skip_location_filter, company_label in company_queries:
    # Companies found in the company log are not queried again
    if company_label in gs.companies_already_scraped:
        print(f"[INFO] Company {company_label} already scraped. Skipping.")
        continue

    print(f"[INFO] Scraping users for company: {company_label}")

    # Query the users for this company and handle them one at a time
    for named_user, company in gs.get_gh_users(search_query, skip_location_filter):
        # Users seen before (in the log or earlier in this run) are skipped
        if named_user.login in gs.users_already_scraped:
            print(f"[INFO] User {named_user.login} already scraped. Skipping.")
            continue

        print(f"[INFO] Scraping user: {named_user.login}")
        # The login is registered before the user info is fetched, so the user
        # is not revisited in this run even when no row comes back
        gs.users_already_scraped.add(named_user.login)

        # get_user_info may return None (repo limit exceeded or no match)
        user_row = gs.get_user_info(named_user, company_label, company_filter=True)
        if user_row is None:
            continue

        # Pull out the match details that go into the user log
        location_match, inferred_company, matched_company_strings = (
            user_row.matched_location,
            user_row.inferred_company,
            user_row.matched_company_strings,
        )

        # Save the row, then log the user together with the match details
        # NOTE(review): remove_existing_file=True is passed on every call;
        # confirm that save_file does not discard rows saved earlier.
        gs.save_file(user_row, file_name, remove_existing_file=True)
        gs.log_user_w_match(
            named_user.login,
            inferred_company,
            matched_company_strings,
            location_match,
            user_log_path,
        )  # type: ignore

        print(f"[INFO] {gs.USERS_SCRAPED} users scraped so far.")

    # The company is logged once the loop over its users has finished
    gs.log_company(company_label, company_log_path)  # type: ignore

GithubScraper initialized with 33 companies and 149 users already scraped.
GitHub REST API ratelimit reset time for token is 2025-08-13 17:11:54. That will be in a little less than 49 minutes.


  0%|          | 0/31 [00:00<?, ?it/s]

[INFO] Scraping users for company: knowit
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: miracle-as
[NEW] GitHub ratelimit threshold set to 5 (max rate: 30)
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)
[INFO] Scraping user: gahms


# 3 Adding new first-tier users to first-tier dataset

*Note: Done by repeating notebook "2. Creating first-tier dataset.ipynb"*

## 3.1 Opening the file

In [11]:
# Location of the first-tier user file
first_tier_file_name = "first_tier_userinfo.jsonl"
fp_first_tier = fp_output_external / first_tier_file_name

# Load the first tier data, one JSON record per line.
# The encoding is stated explicitly, as for the other JSONL files in this
# notebook; otherwise open() falls back to the platform's locale encoding and
# non-ASCII characters may be decoded incorrectly or raise an error.
# Blank lines are skipped because json.loads cannot parse an empty string.
with open(fp_first_tier, "r", encoding="utf-8") as f:
    first_tier_data = [json.loads(line) for line in f if line.strip()]

# Convert to DataFrame
first_tier_userinfo = pd.DataFrame(first_tier_data)

## 3.2 Aggregating unique user connections for each first-tier user

In [None]:
# Columns holding the in- and outgoing ties that are handed to filter_ties
fetch_ties_columns = [
    f"{tie_type}_{direction}"
    for tie_type in ("follows", "watches", "stars", "forks")
    for direction in ("in", "out")
]

# Call filter_ties once per row; the column list is passed as the extra argument
first_tier_userinfo["unique_ties"] = first_tier_userinfo.apply(
    filter_ties, axis=1, args=(fetch_ties_columns,)
)

## 3.3 Save the sorted DataFrame to a parquet file

In [None]:
# Print number of users (one row per user record in the DataFrame)
print(f"Number of unique users in dataset: {len(first_tier_userinfo)}")

# Output the first-tier users, including the list-valued "unique_ties" column,
# as a gzip-compressed parquet file. The compression is requested explicitly:
# to_parquet does not derive it from the ".gzip" file suffix, so without the
# argument the file would be written with the default codec instead.
first_tier_userinfo.to_parquet(
    fp_output_external / "first_tier_ties_extended.parquet.gzip",
    compression="gzip",
)

Number of unique users in dataset: 149


# 4 Repeating second-tier collection 

*Note: Done by repeating notebook "2. Scraping second-tier users.ipynb"*

## 4.1 Load in first-tier users extended version

In [None]:
# Read back the first-tier users that were saved with their "unique_ties" column
first_tier_extended_path = fp_output_external.joinpath(
    "first_tier_ties_extended.parquet.gzip"
)
first_tier_extended_info = pd.read_parquet(first_tier_extended_path)

## 4.2 Creating a dataframe where a row is a company with a list of potential second tier users

In [None]:
# One row per "search_with_company" value; the "unique_ties" lists of all
# first-tier users in that group are concatenated into a single list
ties_by_company = first_tier_extended_info.groupby(
    "search_with_company", as_index=False
)["unique_ties"]
second_tier_extended_users_and_company = ties_by_company.agg(
    lambda ties_per_user: [tie for ties in ties_per_user for tie in ties]
)

# Total number of potential second-tier users: sum of the per-company list lengths
ties_per_company = second_tier_extended_users_and_company["unique_ties"].str.len()
numb_of_second_tier_users = ties_per_company.sum()

print(numb_of_second_tier_users)

4530


## 4.3 Instantiating the GithubScraper and scrape

In [None]:
# Fresh tracking sets for the second-tier run
users_already_scraped = set()
companies_already_scraped = set()
# NOTE(review): users_attempted_scraped is not read anywhere in the visible
# notebook code (the scraper is given users_already_attempted) - confirm it is needed.
users_attempted_scraped = set()

# File names of the JSONL logs
first_tier_user_log_file = "first_tier_userinfo_user_log.jsonl"
second_tier_user_log_file = "second_tier_userinfo_user_log.jsonl"
users_attempted_scrape_file = "users_attempted_scrape.jsonl"

# All three logs are located in the external output directory
first_tier_user_log_path = fp_output_external.joinpath(first_tier_user_log_file)
second_tier_user_log_path = fp_output_external.joinpath(second_tier_user_log_file)
users_attempted_scrape_path = fp_output_external.joinpath(users_attempted_scrape_file)


def ensure_file_exists(path: Path):
    """Create ``path`` as an empty file when it is missing.

    An ``[INFO]`` message stating whether the file already existed or is being
    created is printed in both cases. An existing file is left untouched.

    Args:
        path: Location of the file to check.
    """
    if path.exists():
        print(f"[INFO] File exists: {path.name}")
        return

    print(f"[INFO] File does not exist. Creating: {path.name}")
    path.touch(exist_ok=True)


def load_users_from_log(path: Path) -> set:
    """Collect the ``user_login`` values stored in a JSONL log file.

    Every non-blank line is expected to be a JSON object with a
    ``"user_login"`` key. Lines that cannot be used are reported with a
    ``[WARNING]`` message and skipped, so a single bad line never aborts the
    load.

    Args:
        path: Location of the log file. A missing file yields an empty set.

    Returns:
        The set of logins found in the file.
    """
    users = set()
    if not path.exists():
        return users

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line carries no record, so it is skipped without a warning
            if not line.strip():
                continue
            try:
                user_info = json.loads(line)
                users.add(user_info["user_login"])
            # TypeError covers lines that are valid JSON but not an object
            # (e.g. a list or a number) and logins that are not hashable;
            # these would otherwise crash the load.
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                print(
                    f"[WARNING] Skipping malformed user line in {path.name}: {err}"
                )
    return users


# Make sure each of the three log files is present before it is read
log_paths = (
    first_tier_user_log_path,
    second_tier_user_log_path,
    users_attempted_scrape_path,
)
for path in log_paths:
    ensure_file_exists(path)

# Users logged in either the first-tier or the second-tier log count as scraped
users_already_scraped.update(
    load_users_from_log(first_tier_user_log_path),
    load_users_from_log(second_tier_user_log_path),
)

# Users for which a scrape attempt has been logged
users_already_attempted = load_users_from_log(users_attempted_scrape_path)

[INFO] File exists: first_tier_userinfo_user_log.jsonl
[INFO] File exists: second_tier_userinfo_user_log.jsonl
[INFO] File exists: users_attempted_scrape.jsonl


In [None]:
# Build the scraper with the sets populated from the log files
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    users_already_attempted=users_already_attempted,
    repo_limit=50,
)

# Flatten the per-company tie lists into {user: search_with_company}.
# A user that appears under several companies keeps the company of the last
# row in which it occurs.
second_tier_users_to_scrape = {
    user: search_with_company
    for search_with_company, unique_ties in zip(
        second_tier_extended_users_and_company["search_with_company"],
        second_tier_extended_users_and_company["unique_ties"],
    )
    for user in unique_ties
}

print(
    f"GitHub REST API ratelimit reset time for token is {gs.reset_time_point}. "
    f"That will be in a little less than {gs.reset_time_in_minutes} minutes."
)

# Output file name passed to gs.save_file for every saved user row
file_name = "second_tier_userinfo"

# Loop through the potential second-tier users
for user, search_with_company in tqdm(second_tier_users_to_scrape.items()):
    # Users with a logged scrape attempt are not tried again
    if user in gs.users_already_attempted:
        print(f"[INFO] User {user} already scraped. Skipping.")
        continue

    # Record the attempt, in the log file and in memory, before any lookup is made
    gs.log_user_scrape_attempt(user, users_attempted_scrape_path)
    gs.users_already_attempted.add(user)

    # Look the user up; None is returned e.g. when the user is not found
    named_user = gs.get_user(user)
    if named_user is None:
        continue

    # Fetch the user info; None means the user does not meet the scraping criteria
    user_row = gs.get_user_info(named_user, search_with_company, company_filter=False)
    if user_row is None:
        continue

    # Pull out the match details that go into the user log
    location_match, inferred_company, matched_company_strings = (
        user_row.matched_location,
        user_row.inferred_company,
        user_row.matched_company_strings,
    )

    # Save the row, then log the user together with the match details
    # NOTE(review): remove_existing_file=True is passed on every call;
    # confirm that save_file does not discard rows saved earlier.
    gs.save_file(user_row, file_name, remove_existing_file=True)
    gs.log_user_w_match(
        named_user.login,
        inferred_company,
        matched_company_strings,
        location_match,
        second_tier_user_log_path,
    )  # type: ignore

    print(f"[INFO] {gs.USERS_SCRAPED} users scraped so far.")

GithubScraper initialized with 0 companies and 3 users already scraped.
GitHub REST API ratelimit reset time for token is 2025-08-12 14:39:58. That will be in a little less than 7 minutes.


  0%|          | 0/483 [00:00<?, ?it/s]

[INFO] User karuncs already scraped. Skipping.
[NEW] GitHub ratelimit threshold set to 300 (max rate: 5000)


  0%|          | 1/483 [00:06<55:40,  6.93s/it]


KeyboardInterrupt: 

## 4.4 Save the extended second-tiers

In [None]:
# Load in second-tier users, one JSON record per line.
# The encoding is stated explicitly, as for the other JSONL files in this
# notebook, so decoding does not depend on the platform's locale encoding.
# Blank lines are skipped because json.loads cannot parse an empty string.
with open(fp_output_external / "second_tier_userinfo.jsonl", "r", encoding="utf-8") as f:
    second_tier_users = pd.DataFrame([json.loads(line) for line in f if line.strip()])

# Print number of users (one row per user record in the DataFrame)
print(f"Number of unique users in dataset: {len(second_tier_users)}")

# Output the second-tier users as a gzip-compressed parquet file. The
# compression is requested explicitly: to_parquet does not derive it from the
# ".gzip" file suffix, so without the argument the default codec would be used.
second_tier_users.to_parquet(
    fp_output_external / "second_tier_ties_extended.parquet.gzip",
    compression="gzip",
)

Number of unique users in dataset: 116
