# 0. Importing packages

In [1]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 
# NB. the line above only needs to run once per kernel session; it can be commented out on later runs of this cell
%autoreload 2

import json
import pandas as pd
from pathlib import Path

# Custom functions
import resources.filter_functions as filter

## 0.1 File Paths

In [2]:
# Project root (absolute path) and its `output` sub-directory.
# `fp_main` is already a Path, so the `/` operator yields a Path directly.
fp_main = Path("/Volumes/SAM-SODAS-DISTRACT/Coding Distraction/github_as_a_market_device")
fp_main_output = fp_main / "output"

# 1. Importing data

In [3]:
# Locate the first-tier user-info file inside the output folder
first_tier_file_name = "first_tier_userinfo.jsonl"
fp_first_tier = fp_main_output / first_tier_file_name

# Load the first tier data (JSON Lines: one JSON document per line).
# - The encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's locale default.
# - Blank / whitespace-only lines (e.g. a trailing newline at the end of the
#   file) are skipped instead of making json.loads raise.
with open(fp_first_tier, "r", encoding="utf-8") as f:
    first_tier_data = [json.loads(line) for line in f if line.strip()]

# Convert the list of parsed records to a DataFrame
first_tier_userinfo = pd.DataFrame(first_tier_data)

# 2. Resolve multiple matches

In [4]:
# Path handed to the resolver as `output_path`.
# NOTE(review): this is relative to the kernel's working directory — confirm the
# notebook is always started from the folder this path assumes.
output_path = '../resources/resolved_multicompany_cases.jsonl'

# Resolve multiple companies.
# The resolver receives a *copy* of the raw frame, so `first_tier_userinfo`
# stays as loaded regardless of what the resolver does to its argument.
# (Previously a copy was made but then overwritten, and the original frame was
# the one passed in, so the copy protected nothing.)
first_tier_userinfo_clean = filter.resolve_multiple_companies(
    first_tier_userinfo.copy(),
    output_path=output_path,
)

[nc-llh] has multiple company matches:
Inferred Companies: ['netcompany', 'skat']

Matched Strings:
  netcompany: ['@netcompany']
  skat: ['part of the technical onboarding team for ici at skat. responsible for assisisting public claimants with their integration with the new psrm system.']

Bio Information:
  user_login: nc-llh
  search_with_company: skat
  usertype: User
  listed_company: @Netcompany
  email: None
  bio: Part of the Technical Onboarding team for ICI at SKAT. Responsible for assisisting public claimants with their integration with the new PSRM system.
  blog: 


# 3. Aggregating unique users for each first-tier user

In [5]:
# Tie columns handed to filter.filter_ties, listed as in/out pairs per tie type
fetch_ties_columns = [
    "follows_in", "follows_out",
    "watches_in", "watches_out",
    "stars_in", "stars_out",
    "forks_in", "forks_out",
]

# Row-wise: call filter.filter_ties(row, fetch_ties_columns) for every row and
# store the result in a new "unique_ties" column.
first_tier_userinfo_clean["unique_ties"] = first_tier_userinfo_clean.apply(
    filter.filter_ties,
    axis=1,
    args=(fetch_ties_columns,),
)

# 4. Save the sorted DataFrame to a parquet file

In [6]:
# Write the first-tier user frame (including the "unique_ties" column) to parquet.
# The gzip codec is requested explicitly: without `compression=...` pandas uses
# its default codec, which would not match the ".parquet.gzip" file name.
first_tier_userinfo_clean.to_parquet(
    fp_main_output / "first_tier_ties_sorted.parquet.gzip",
    compression="gzip",
)