# 0. Importing packages

In [1]:
# Load the autoreload extension to automatically reload modules before executing code (to avoid restarting the kernel)
%load_ext autoreload 
# NB. the extension only needs to be loaded once per kernel session; re-running the line above is harmless (IPython just reports it is already loaded)
%autoreload 2

import json
import pandas as pd
from pathlib import Path

# Custom functions
from resources.filter_functions import filter_ties

## 0.1 File Paths

In [2]:
# Path configuration is read from the project's `resources.filepaths` module
import resources.filepaths as fp

# Re-expose the main and main-output paths under notebook-level names
fp_main, fp_main_output = fp.fp_main, fp.fp_main_output

# Paths for data that has to be written to the external s-drive
fp_main_external, fp_output_external = fp.fp_main_external, fp.fp_output_external

# 1. Importing data

In [3]:
# Location of the first-tier user info file (JSON Lines: one JSON document per line)
first_tier_file_name = "first_tier_userinfo.jsonl"
fp_first_tier = fp_output_external / first_tier_file_name

# Load the first tier data, one record per line.
# - The encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default codec.
# - Whitespace-only lines are skipped, because json.loads raises on them.
with open(fp_first_tier, "r", encoding="utf-8") as f:
    first_tier_data = [json.loads(line) for line in f if line.strip()]

# Convert the parsed records to a DataFrame (one row per record)
first_tier_userinfo = pd.DataFrame(first_tier_data)

# 2. Aggregating unique user connections for each first-tier user

In [4]:
# Tie columns handed to filter_ties: an `_in`/`_out` pair for each of
# follows, watches, stars and forks.
fetch_ties_columns = [
    "follows_in",
    "follows_out",
    "watches_in",
    "watches_out",
    "stars_in",
    "stars_out",
    "forks_in",
    "forks_out",
]

# Call filter_ties once per row (axis=1); the column list is forwarded as its
# second positional argument through `args`. The per-row result is stored in
# the new "unique_ties" column.
first_tier_userinfo["unique_ties"] = first_tier_userinfo.apply(
    filter_ties,
    axis=1,
    args=(fetch_ties_columns,),
)

# 3. Save the sorted DataFrame to a parquet file

In [5]:
# Print number of users (the row count of the first-tier table)
print(f"Number of unique users in dataset: {len(first_tier_userinfo)}")

# Write the first-tier table to parquet on the external output path.
# to_parquet defaults to snappy compression, so gzip has to be requested
# explicitly for the file content to match its ".parquet.gzip" name.
first_tier_userinfo.to_parquet(
    fp_output_external / "first_tier_ties.parquet.gzip",
    compression="gzip",
)

Number of unique users in dataset: 149
