# 0. Importing packages

In [None]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 
# NB. the line above is currently active, so it runs every time this cell is executed
# NOTE(review): an earlier note asked for it to be uncommented only on the first run — confirm which behaviour is wanted
%autoreload 2

In [None]:
import json
import pandas as pd
from pathlib import Path

# Custom functions
import resources.filter_functions as filter

## 0.1 File Paths

In [2]:
# Project root directory; every other path in this notebook is derived from it
fp_main = Path('/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device')
# 'output' sub-folder of the project root
fp_main_output = fp_main.joinpath('output')

# 1. Importing data

*Loading the first- and second-tier data*

In [3]:
# Build the input paths from the output directory
first_tier_file_name = "first_tier_ties_sorted.parquet.gzip"
second_tier_file_name = "second_tier_userinfo.jsonl"
fp_first_tier = fp_main_output / first_tier_file_name
fp_second_tier = fp_main_output / second_tier_file_name

# Load the first tier data
first_tier_data_clean = pd.read_parquet(fp_first_tier)

# Load the second tier data (JSON Lines: one JSON object per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform default,
# and blank lines (e.g. a trailing empty line at the end of the file) are skipped
# instead of making json.loads raise a JSONDecodeError.
with open(fp_second_tier, 'r', encoding='utf-8') as f:
    second_tier_list = [json.loads(line) for line in f if line.strip()]

# Type hint so Pylance can infer the type of the variable
second_tier_data: pd.DataFrame = pd.DataFrame(second_tier_list)


# 2. Resolve multiple matches

In [4]:
# File name handed to the helper as `output_path`.
# NOTE(review): this is a relative path, so it is presumably resolved against the
# kernel's working directory rather than `fp_main_output` — confirm that is intended.
output_path = 'resolved_multicompany_cases.jsonl'

# Resolve multiple companies.
# `filter` here is the project module `resources.filter_functions`, not the builtin.
# A copy of the loaded frame is passed in, so `second_tier_data` stays untouched even
# if the helper modifies or returns its argument; previously a copy was created but
# immediately overwritten, and the original frame was passed instead.
second_tier_data_clean = filter.resolve_multiple_companies(
    second_tier_data.copy(),
    output_path=output_path,
)

# 3. Merging first-tier and second-tier data

In [5]:
# Tag every row with the tier it came from (1 = first tier, 2 = second tier)
first_tier_data_clean['tier'] = 1
second_tier_data_clean['tier'] = 2

# Stack the two tiers, first tier on top, and renumber the rows from zero
merged_users = pd.concat([first_tier_data_clean, second_tier_data_clean], ignore_index=True)
print(f"[INFO] Merged user data shape: {merged_users.shape}")

# Keep one row per GitHub login. The first occurrence wins, so a login that
# appears in both tiers keeps its first-tier row; the index is renumbered afterwards.
unique_users_data = merged_users.drop_duplicates(
    subset='user_login',
    keep='first',
    ignore_index=True,
)
print(f"[INFO] Deduplicated user data shape: {unique_users_data.shape}")

[INFO] Merged user data shape: (238, 21)
[INFO] Deduplicated user data shape: (228, 21)


In [6]:
# Show the merged (not yet deduplicated) frame for a visual check
# NOTE(review): the rendered output includes e-mail addresses — consider clearing it before sharing the notebook
merged_users

Unnamed: 0,user_login,search_with_company,listed_company,inferred_company,matched_company_strings,usertype,email,location,bio,blog,...,follows_in,follows_out,watches_in,watches_out,stars_in,stars_out,forks_in,forks_out,unique_ties,tier
0,NodesAMS,nodes,Nodes Agency,nodes,"{'abtion': None, 'capgemini': None, 'charlie t...",User,,[CPH],Nodes \r\nApplication Management Service,https://www.nodesagency.com,...,[],[],[],[],[],[],[],"[{'created_at': '2022-10-14', 'owner_login': '...",[ml-opensource],1
1,nodes-checkout,nodes,Nodes.dk,nodes,"{'abtion': None, 'capgemini': None, 'charlie t...",User,cw@nodes.dk,"[dk, dk, dk, Copenhagen, Denmark]",,http://www.nodes.dk/,...,[],[],[],"[{'created_at': '2015-01-29', 'owner_login': '...",[],[],[],[],[ml-archive],1
2,nodes-team,nodes,Nodes Agency,nodes,"{'abtion': None, 'capgemini': None, 'charlie t...",User,,[Copenhagen],,http://nodesagency.com,...,[],[],[],"[{'created_at': '2016-03-27', 'owner_login': '...",[],[],[],[],[ml-archive],1
3,abtion,abtion,,abtion,"{'abtion': ['abtion', 'iwanttowork@abtion.com'...",Organization,iwanttowork@abtion.com,[Copenhagen],Not another digital agency,https://abtion.com,...,"[{'created_at': '2014-01-12', 'owner_login': '...",[],"[{'created_at': '2012-02-23', 'owner_login': '...",[],"[{'created_at': '2015-08-26', 'owner_login': '...",[],"[{'created_at': '2025-05-05', 'owner_login': '...","[{'created_at': '2020-12-10', 'owner_login': '...","[Dynastig, magnusfriis, jeppester, djuric, Ale...",1
4,hcarreras,abtion,Abtion,abtion,"{'abtion': ['abtion', 'hc@abtion.com', 'abtion...",User,hc@abtion.com,[Copenhagen],,abtion.com,...,"[{'created_at': '2009-07-02', 'owner_login': '...","[{'created_at': '2008-03-31', 'owner_login': '...","[{'created_at': '2014-11-04', 'owner_login': '...","[{'created_at': '2010-10-26', 'owner_login': '...","[{'created_at': '2014-01-01', 'owner_login': '...","[{'created_at': '2018-05-31', 'owner_login': '...","[{'created_at': '2015-05-06', 'owner_login': '...","[{'created_at': '2018-04-13', 'owner_login': '...","[Citizen2028, apneadiving, IanAbildskou, LoneK...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,tfctrifork,trifork,Trifork,trifork,"{'trifork': ['tfctrifork', 'trifork', 'tfc@tri...",User,tfc@trifork.com,"[Aarhus, Denmark]",,,...,[],[],"[{'repo_name': 'PorscheLeMansVP', 'owner_login...","[{'repo_name': 'TriforkSwiftExtensions', 'owne...",[],[],[],[],,2
234,hcnp,trifork,@trifork,trifork,{'trifork': ['@trifork']},User,,"[Aarhus, Denmark]",,,...,"[{'repo_name': None, 'owner_login': 'ask0ldd',...","[{'repo_name': None, 'owner_login': 'kjuulh', ...","[{'repo_name': 'implementation-guide', 'owner_...","[{'repo_name': 'trireg2-import-clockify', 'own...","[{'repo_name': 'implementation-guide', 'owner_...","[{'repo_name': 'dagger', 'owner_login': 'dagge...","[{'repo_name': 'implementation-guide', 'owner_...","[{'repo_name': 'db-operator-charts', 'owner_lo...",,2
235,twantrifork,trifork,@trifork,trifork,"{'trifork': ['twantrifork', '@trifork']}",User,,[Aalborg],,,...,[],"[{'repo_name': None, 'owner_login': 'trifork',...",[],[],[],[],[],[],,2
236,aabl-trifork,trifork,Trifork,trifork,"{'trifork': ['aabl-trifork', 'trifork']}",User,,[Aarhus],,,...,[],"[{'repo_name': None, 'owner_login': 'trifork',...",[],[],[],"[{'repo_name': 'ai-testimonials', 'owner_login...",[],"[{'repo_name': 'smallrye-mutiny', 'owner_login...",,2


# 4. Output file

In [7]:
# Write the deduplicated users to the output directory.
# The codec is set explicitly so the file content matches the "gzip" in its name;
# without it pandas falls back to its default codec (snappy).
unique_users_data.to_parquet(
    fp_main_output / 'unique_users_first_second.gzip.parquet',
    compression='gzip',
)