In [3]:
# 01_data_cleaning.ipynb
# filepath: notebooks/01_data_cleaning.ipynb

# ---
# # StrongTies Data Cleaning Notebook
# This notebook demonstrates loading, inspecting, and sanitizing LinkedIn-style connection data for analysis.
# ---

In [4]:
# Load dependencies
import sys
import os
# Put the parent of the current working directory on the import path so the
# `src.*` imports below resolve when the kernel is started from this notebook's folder.
# NOTE(review): this appends again on every re-run of the cell — harmless, but sys.path grows.
sys.path.append(os.path.abspath(".."))

import pandas as pd
# Project-local helpers: the connection loader and the privacy/sanitization utilities.
from src.data_loader import load_all_connections
from src.privacy_sanitizer import sanitize_csv, validate_csv_columns

Matplotlib is building the font cache; this may take a moment.


In [5]:
# 1. Load all sample connection CSVs

# Location of the raw connection files, relative to the notebook's working directory.
data_dir = "../data"

# load_all_connections is the project loader from src.data_loader; presumably it
# merges every CSV under data_dir into one frame — verify against the helper.
df_raw = load_all_connections(data_dir)

# Report (rows, columns) as a quick sanity check, then preview the first rows.
print(f"Raw combined data shape: {df_raw.shape}")
df_raw.head()

Raw combined data shape: (100, 4)


Unnamed: 0,first_name,last_name,company,position
0,Debra,Kim,Ryan PLC,Software Engineer
1,Zachary,Martinez,Doyle Ltd,Consultant
2,Jeffrey,Morgan,Galloway-Wyatt,Analyst
3,Brianna,Hall,Garcia-James,Software Engineer
4,Jeffrey,Jones,Allen-Allen,Project Coordinator


In [6]:
# 2. Inspect columns and missing values

# Column labels exactly as produced by the loader.
print("Columns:", df_raw.columns.tolist())

# Per-column count of null entries. The label and the Series are joined with
# sep="\n" so the Series starts on its own line; embedding "\n" in the label and
# relying on print()'s default " " separator put a stray space before the first
# row and misaligned it against the others.
print("Missing values per column:", df_raw.isnull().sum(), sep="\n")

Columns: ['first_name', 'last_name', 'company', 'position']
Missing values per column:
 first_name    0
last_name     0
company       0
position      0
dtype: int64


In [7]:
# 3. Validate columns against allowed schema

# The loaded frame uses snake_case labels (e.g. "first_name"); convert them to the
# Title Case form (e.g. "First Name") before handing them to validate_csv_columns.
candidate_headers = [col.replace('_', ' ').title() for col in df_raw.columns]
allowed = validate_csv_columns(candidate_headers)
print("Columns valid for StrongTies:", allowed)

# Fail fast: without this check a schema mismatch was only printed, and the
# following cells would still sanitize and save data with unexpected columns.
if not allowed:
    raise ValueError(
        f"Columns not accepted by validate_csv_columns: {candidate_headers}"
    )

Columns valid for StrongTies: True


In [8]:
# 4. Sanitize and anonymize data

# Map the loader's snake_case labels to the Title Case headers passed to sanitize_csv.
title_case_headers = {
    'first_name': 'First Name',
    'last_name': 'Last Name',
    'company': 'Company',
    'position': 'Position',
}
df_titled = df_raw.rename(columns=title_case_headers)

# NOTE(review): with obfuscate_names=False the cleaned frame still carries
# First Name / Last Name in plain text next to the generated UserID column —
# confirm this is intended before the result is written to disk.
df_clean = sanitize_csv(df_titled, hash_ids=True, obfuscate_names=False)

# Shape check followed by a preview of the sanitized rows.
print(f"Cleaned data shape: {df_clean.shape}")
df_clean.head()

Cleaned data shape: (100, 5)


Unnamed: 0,First Name,Last Name,Company,Position,UserID
0,Debra,Kim,Ryan PLC,Software Engineer,5fec4150f53f6d3e13b8d4830bb0a26149f25a7de4ea2f...
1,Zachary,Martinez,Doyle Ltd,Consultant,5eed9ec756ccbfabcfb926ecc6471c5a6af43e1b834fb0...
2,Jeffrey,Morgan,Galloway-Wyatt,Analyst,a2ffffe830b690777f6f50b56822ceb9a08e8c29e63fa9...
3,Brianna,Hall,Garcia-James,Software Engineer,b046658f1dee7b07974854e74da8c968c0f7e0a1c413db...
4,Jeffrey,Jones,Allen-Allen,Project Coordinator,0802a63bbcf94d0decd757986ffa8aea7b725dce2a23ac...


In [10]:
# 5. Save cleaned data for downstream analysis

# Single source of truth for the destination, reused by the write and the message.
output_path = "../results/reports/cleaned_connections.csv"

# DataFrame.to_csv does not create missing parent folders; on a fresh checkout the
# report directory may not exist yet, so create it first (no-op if already present).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False keeps the positional row index out of the file.
df_clean.to_csv(output_path, index=False)
print(f"✅ Cleaned data saved to {output_path}")

✅ Cleaned data saved to ../results/reports/cleaned_connections.csv
