# 0. Importing packages

In [1]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 

# Enable autoreload for all modules
%autoreload 2

In [2]:
import json
import pandas as pd
from pathlib import Path
from itertools import chain
from tqdm import tqdm

# Custom functions
from resources.github_functions import GithubScraper

GitHub access token collected from config


## 0.1 File Paths

In [3]:
# File paths: path objects come from the project's resources.filepaths module and
# are aliased here so the cells below can use short names.
import resources.filepaths as fp

fp_main = fp.fp_main
fp_main_output = fp.fp_main_output

# To output data that has to go to external s-drive.
# The data files that the cells below load and save directly all live under
# fp_output_external.
fp_main_external = fp.fp_main_external
fp_output_external = fp.fp_output_external

# 1 Filtering users and making (named-user, company)-list

## 1.1 Loading in the data on first tier users

In [None]:
# Load the first-tier ties; the aggregation below relies on its
# "search_with_company" and "unique_ties" columns.
first_tier_info = pd.read_parquet(fp_output_external / "first_tier_ties.parquet.gzip")

## 1.2 Creating a dataframe where a row is a company with a list of potential second tier users

In [None]:
# One row per company search term, holding every tie listed for that company in a
# single flat list (the per-row lists are concatenated; duplicates are not removed)
second_tier_users_and_company = first_tier_info.groupby(
    "search_with_company", as_index=False
)["unique_ties"].agg(
    lambda tie_lists: [login for ties in tie_lists for login in ties]
)

# Total number of potential second-tier users, summed over the per-company lists
ties_per_company = second_tier_users_and_company["unique_ties"].str.len()
numb_of_second_tier_users = ties_per_company.sum()

print(numb_of_second_tier_users)

4530


# 2 Instantiating the GithubScraper and scraping

## 2.1 Loading in scrapelogs

In [None]:
# Tracking sets, all starting out empty.
# NOTE: `users_attempted_scraped` is not read by the cells visible in this notebook;
# the attempted-users set that is actually used is `users_already_attempted`.
users_already_scraped, companies_already_scraped, users_attempted_scraped = (
    set(),
    set(),
    set(),
)

# Log files: each file name is paired with its full path on the external output folder
first_tier_user_log_file = "first_tier_userinfo_user_log.jsonl"
first_tier_user_log_path = fp_output_external / first_tier_user_log_file

second_tier_user_log_file = "second_tier_userinfo_user_log.jsonl"
second_tier_user_log_path = fp_output_external / second_tier_user_log_file

users_attempted_scrape_file = "users_attempted_scrape.jsonl"
users_attempted_scrape_path = fp_output_external / users_attempted_scrape_file


def ensure_file_exists(path: Path):
    """Create an empty file at ``path`` when none exists and report which case applied.

    Args:
        path: Location of the file to check. An existing file is left untouched.
    """
    if path.exists():
        print(f"[INFO] File exists: {path.name}")
        return

    print(f"[INFO] File does not exist. Creating: {path.name}")
    path.touch(exist_ok=True)


def load_users_from_log(path: Path):
    """Collect the ``user_login`` values stored in a JSON Lines log file.

    Args:
        path: Log file with one JSON object per line.

    Returns:
        Set of the ``user_login`` values found. Empty when the file does not exist
        or holds no usable line.

    Blank lines are skipped silently. A line that is not valid JSON, is valid JSON
    but not an object (for example a list, a number or ``null``), lacks the
    ``user_login`` key, or carries an unhashable ``user_login`` value is reported
    with a warning and skipped, so one bad line never aborts the whole load.
    """
    users = set()
    if not path.exists():
        return users

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                users.add(json.loads(line)["user_login"])
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                # TypeError covers JSON values that cannot be indexed by key and
                # user_login values that cannot be stored in a set.
                print(
                    f"[WARNING] Skipping malformed user line in {path.name}: {err}"
                )
    return users


# Every log file must be present before it is read
for path in (
    first_tier_user_log_path,
    second_tier_user_log_path,
    users_attempted_scrape_path,
):
    ensure_file_exists(path)

# Users recorded in either tier's user log are treated as already scraped
users_already_scraped.update(
    load_users_from_log(first_tier_user_log_path),
    load_users_from_log(second_tier_user_log_path),
)

# Users from the attempt log, whatever the outcome of the attempt was
users_already_attempted = load_users_from_log(users_attempted_scrape_path)

[INFO] File exists: first_tier_userinfo_user_log.jsonl
[INFO] File exists: second_tier_userinfo_user_log.jsonl
[INFO] File exists: users_attempted_scrape.jsonl


## 2.2 Instantiating the GithubScraper and scraping

In [None]:
# 1. Create instance of GithubScraper, seeded with the sets loaded from the scrape logs
gs = GithubScraper(
    users_already_scraped=users_already_scraped,
    companies_already_scraped=companies_already_scraped,
    users_already_attempted=users_already_attempted,
    repo_limit=50,
)

# Flatten the per-company tie lists into {user login: company search term}.
# A login listed under several companies keeps only the last company encountered,
# so this dict can be shorter than the total tie count printed in section 1.2.
second_tier_users_to_scrape = {
    user: row["search_with_company"]
    for _, row in second_tier_users_and_company.iterrows()  # type: ignore
    for user in row["unique_ties"]
}

print(
    f"GitHub REST API ratelimit reset time for token is {gs.reset_time_point}. "
    f"That will be in a little less than {gs.reset_time_in_minutes} minutes."
)

# 2. Define output file name
file_name = "second_tier_userinfo"

# 3. Loop through the candidate users (one iteration per user login)
for user, search_with_company in tqdm(second_tier_users_to_scrape.items(), unit="user"):
    # 3.1 Skip users with an earlier scrape attempt. The check is against the
    # "attempted" set, even though the message below says "scraped".
    if user in gs.users_already_attempted:
        print(f"[INFO] User {user} already scraped. Skipping.")
        continue

    # 3.2 Record the attempt (log file and in-memory set) before fetching anything,
    # so the attempt is registered whether or not the steps below succeed
    gs.log_user_scrape_attempt(user, users_attempted_scrape_path)
    gs.users_already_attempted.add(user)

    # 3.3 Fetch the user by login
    named_user = gs.get_user(user)

    # 3.4 Check if user is None (e.g., if user is not found)
    if named_user is None:
        continue

    # 3.5 Build the user's info row; None means the user is skipped.
    # NOTE(review): the earlier comment described this as a "DK + company" check,
    # but company_filter=False is passed — confirm which criteria get_user_info
    # actually applies here.
    user_row = gs.get_user_info(named_user, search_with_company, company_filter=False)
    if user_row is None:
        continue  # Skip user if they don't meet scraping criteria

    # 3.6 Extract match data
    location_match = user_row.matched_location
    inferred_company = user_row.inferred_company
    matched_company_strings = user_row.matched_company_strings

    # 3.7 Save user info and log result.
    # NOTE(review): remove_existing_file=True is passed on every iteration; if
    # save_file deletes the existing output file on each call, only the last saved
    # user would remain — confirm against GithubScraper.save_file.
    gs.save_file(user_row, file_name, remove_existing_file=True)
    gs.log_user_w_match(
        named_user.login,
        inferred_company,
        matched_company_strings,
        location_match,
        second_tier_user_log_path,
    )  # type: ignore

    print(f"[INFO] {gs.USERS_SCRAPED} users scraped so far.")

GithubScraper initialized with 0 companies and 105 users already scraped.
GitHub REST API ratelimit reset time for token is 2025-08-13 16:11:04. That will be in a little less than 5 minutes.


  0%|          | 0/4132 [00:00<?, ?user/s]

[INFO] User hcarreras already scraped. Skipping.
[INFO] User aboedker already scraped. Skipping.
[INFO] User adminabtion already scraped. Skipping.
[INFO] User martinvintherp already scraped. Skipping.
[INFO] User GuldbekLEGO already scraped. Skipping.
[INFO] User AngelleAbtion already scraped. Skipping.
[INFO] User finnpedersenkazes already scraped. Skipping.
[INFO] User karuncs already scraped. Skipping.
[INFO] User eemailme already scraped. Skipping.
[INFO] User Aberen already scraped. Skipping.
[INFO] User reinisla already scraped. Skipping.
[INFO] User AskeLange already scraped. Skipping.
[INFO] User morgenhaar already scraped. Skipping.
[INFO] User nauman already scraped. Skipping.
[INFO] User djuric already scraped. Skipping.
[INFO] User RobWu already scraped. Skipping.
[INFO] User ozf already scraped. Skipping.
[INFO] User MikkelHansenAbtion already scraped. Skipping.
[INFO] User Kosai106 already scraped. Skipping.
[INFO] User bohme already scraped. Skipping.
[INFO] User Tejs-A

 19%|█▉        | 804/4132 [06:10<26:05,  2.13user/s]

User match mnexo logged.
[INFO] 106 users scraped so far.


 19%|█▉        | 805/4132 [08:16<43:40,  1.27user/s]

User match jenswilly logged.
[INFO] 107 users scraped so far.


 20%|█▉        | 808/4132 [08:34<50:09,  1.10user/s]

User match nielslbeck logged.
[INFO] 108 users scraped so far.


 20%|██        | 838/4132 [09:54<11:36:30, 12.69s/user]

User match nielsdaw logged.
[INFO] 109 users scraped so far.


 20%|██        | 846/4132 [22:57<214:12:02, 234.67s/user]

User match ecederstrand logged.
[INFO] 110 users scraped so far.


 21%|██        | 854/4132 [23:53<26:42:42, 29.34s/user]  

User match rkk logged.
[INFO] 111 users scraped so far.


 21%|██        | 874/4132 [24:15<59:46,  1.10s/user]   Request GET /users/HeroMeiKong failed with 403: Forbidden
Setting next backoff to 1036.793931s


# 3.0 Scouting New Danish App Companies

## 3.1 Saving the second tiers

In [None]:
# Load in second-tier users: one JSON object per line, blank lines ignored.
# The encoding is given explicitly so the result does not depend on the platform default.
with open(fp_output_external / "second_tier_userinfo.jsonl", "r", encoding="utf-8") as f:
    second_tier_users = [json.loads(line) for line in f if line.strip()]

# Print number of users (records are counted as loaded; no de-duplication happens here)
print(f"Number of unique users in dataset: {len(second_tier_users)}")

# Outputting second-tier-user list with gzip (because of list within the dataframe).
# `second_tier_users` is a plain list of dicts and has no `.to_parquet()`, so a
# DataFrame is built just for the export; the list itself is kept unchanged because
# the filtering cell below iterates over it as dicts.
pd.DataFrame(second_tier_users).to_parquet(
    fp_output_external / "second_tier_ties.parquet.gzip", compression="gzip"
)

Number of second-tier users filtered: 3


## 3.2 Make a filtered copy of second-tier users

In [None]:
# Shallow copy of the loaded list, used as the basis for subsetting
second_tier_users_subset = second_tier_users.copy()

# Company mask: True where the user's "listed_company" field is present and non-empty
company_mask = [
    bool(user.get("listed_company")) for user in second_tier_users_subset
]

# Keep the users at the positions flagged by the company mask
second_tier_users_filtered = [
    second_tier_users_subset[idx] for idx, keep in enumerate(company_mask) if keep
]

print(f"Number of second-tier users filtered: {len(second_tier_users_filtered)}")

## 3.3 Output for manual inspection

**NOTE ON MANUAL INSPECTION**

The output file "second_tier_users_filtered_subset.csv" is reviewed manually in a CSV reader. From that review we produce the list of second-tier companies that we choose to include in our sample; it is stored in the file "./resources/second_tier_companies.csv".

In [None]:
# Profile fields exported for the manual inspection of new companies
variables = [
    "user_login",
    "listed_company",
    "inferred_company",
    "email",
    "bio",
    "blog",
    "github_location",
    "matched_location",
]

# One record per filtered user, restricted to the fields above (a missing field
# becomes None), plus an empty "new_company" field for the reviewer to fill in
second_tier_users_filtered_subset = [
    {**{var: user.get(var) for var in variables}, "new_company": None}
    for user in second_tier_users_filtered
]

# Output the subset for review as csv. The columns are passed explicitly so the
# header row is still written when no user passed the company filter.
df = pd.DataFrame(
    second_tier_users_filtered_subset, columns=variables + ["new_company"]
)
df.to_csv(fp_output_external / "second_tier_users_filtered_subset.csv", index=False)
