# 0. Importing packages

In [4]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 
# NB. uncomment the line above first time you run this cell
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import json
import pandas as pd
from pathlib import Path

# Custom functions
import resources.filter_functions as filter

## 0.1 File Paths

In [None]:
# File paths
import resources.filepaths as fp

fp_main = fp.fp_main
fp_main_output = fp.fp_main_output

# To output data that has to go to external s-drive
fp_main_external = fp.fp_main_external
fp_output_external = fp.fp_output_external

# 1. Importing data

*Loading in first and second tier data*

In [11]:
# Opening the files
first_tier_file_name = "first_tier_ties_extended.parquet.gzip"
second_tier_file_name = "second_tier_userinfo.jsonl"
fp_first_tier = fp_output_external / first_tier_file_name
fp_second_tier = fp_output_external / second_tier_file_name

# Load the first tier data
first_tier_data_clean = pd.read_parquet(fp_first_tier)

with open(fp_second_tier, 'r') as f:
    second_tier_list = [json.loads(line) for line in f]

# Type hint so Pylance can infer the type of the variable
second_tier_data: pd.DataFrame = pd.DataFrame(second_tier_list)


# 2. Filter second-tier users on company sample list

In [14]:
# Make a copy
second_tier_users_clean = second_tier_data.copy()

# Find matched company strings
second_tier_users_clean["matched_company_strings"] = second_tier_users_clean.apply(
    lambda user_row: filter.search_for_company([
        v for v in [
            user_row.get("user_login"),
            user_row.get("listed_company"),
            user_row.get("email"),
            user_row.get("bio"),
            user_row.get("blog"),
        ] if v and str(v).strip()
    ]),
    axis=1
)  # type: ignore

# Get the keys of the element in the dictionary of matched company strings
second_tier_users_clean['inferred_company'] = second_tier_users_clean['matched_company_strings'].apply(
    lambda x: list(x.keys()) if isinstance(x, dict) else []
)

# Filter on rows that have an inferred company
second_tier_data_clean = second_tier_users_clean[
    second_tier_users_clean['inferred_company'].apply(len) > 0
]

# 3. Merging first-tier and second-tier

In [15]:
# Assign tier labels to each dataset
first_tier_data_clean['tier'] = 1
second_tier_data_clean['tier'] = 2

# Combine both tiers into a single dataframe
merged_users = pd.concat(
    [first_tier_data_clean, second_tier_data_clean],
    ignore_index=True
)
print(f"[INFO] Merged user data shape: {merged_users.shape}")

# Remove duplicate users based on GitHub login
unique_users_data = merged_users.drop_duplicates(subset='user_login').reset_index(drop=True)
print(f"[INFO] Deduplicated user data shape: {unique_users_data.shape}")

[INFO] Merged user data shape: (16, 22)
[INFO] Deduplicated user data shape: (16, 22)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  second_tier_data_clean['tier'] = 2


# 4. Resolve multiple matches

In [18]:
# Access output path
output_path = '../resources/resolved_multicompany_cases.jsonl'

# Resolve multiple companies
unique_users_data_clean = filter.resolve_multiple_companies(
    unique_users_data,
    output_path=output_path,
)

# 5. Output file

In [19]:
# Output the filtered users
unique_users_data_clean.to_parquet(fp_main_output / 'final_dataset.gzip.parquet')

ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column inferred_company with type object')