In [1]:
# 01_data_cleaning.ipynb
# filepath: notebooks/01_data_cleaning.ipynb

# ---
# # StrongTies Data Cleaning Notebook
# This notebook demonstrates loading, inspecting, and sanitizing LinkedIn-style connection data for analysis.
# ---

In [2]:
# Load dependencies
import sys
import os
# Put the parent directory on the import path so the `src` package imported
# below can be resolved. This line must run before the `from src...` imports.
# NOTE(review): assumes the kernel's working directory is notebooks/ — confirm.
sys.path.append(os.path.abspath(".."))

import pandas as pd
# Project-local helpers used by the cells below: CSV loading, privacy
# sanitization/validation, and value-cleaning / filesystem utilities.
from src.data_loader import load_all_connections
from src.privacy_sanitizer import sanitize_csv, validate_csv_columns
from src.utils import clean_company_name, standardize_position_title, ensure_dir, save_dataframe

In [3]:
# 1. Load all sample connection CSVs
# Everything under `data_dir` is handed to the project loader, which returns
# the combined frame used by the rest of the notebook as `df_raw`.

data_dir = "../data"
df_raw = load_all_connections(data_dir)

# Report the combined size, then preview the first rows as the cell output.
raw_shape = df_raw.shape
print("Raw combined data shape:", raw_shape)
df_raw.head()

Raw combined data shape: (100, 4)


Unnamed: 0,first_name,last_name,company,position
0,Debra,Kim,Ryan PLC,Software Engineer
1,Zachary,Martinez,Doyle Ltd,Consultant
2,Jeffrey,Morgan,Galloway-Wyatt,Analyst
3,Brianna,Hall,Garcia-James,Software Engineer
4,Jeffrey,Jones,Allen-Allen,Project Coordinator


In [4]:
# 2. Inspect columns and missing values
# Compute both summaries first, then print them, so each is available by name
# for further inspection.

column_names = df_raw.columns.tolist()
missing_per_column = df_raw.isnull().sum()

print("Columns:", column_names)
print("Missing values per column:\n", missing_per_column)

Columns: ['first_name', 'last_name', 'company', 'position']
Missing values per column:
 first_name    0
last_name     0
company       0
position      0
dtype: int64


In [5]:
# 3. Validate columns against allowed schema
# The frame uses snake_case headers; they are converted to Title Case
# (e.g. "first_name" -> "First Name") before being passed to the validator.

title_cased_columns = [name.replace('_', ' ').title() for name in df_raw.columns]
allowed = validate_csv_columns(title_cased_columns)
print("Columns valid for StrongTies:", allowed)

Columns valid for StrongTies: True


In [7]:
# Clean company and position columns
# Each value is passed through the project helper for its column and the
# result overwrites that same column, so the later rename/sanitize step
# receives the cleaned values.
# Fix: the standardized titles were previously written to a misspelled
# 'cosition' column, leaving 'position' uncleaned and adding a stray column.

df_raw['company'] = df_raw['company'].apply(clean_company_name)
df_raw['position'] = df_raw['position'].apply(standardize_position_title)

In [8]:
# 4. Sanitize and anonymize data
# The snake_case headers are renamed to their Title Case equivalents before
# the frame is handed to sanitize_csv (rename returns a new frame; df_raw's
# own headers are left as they are).

title_case_headers = {
    'first_name': 'First Name',
    'last_name': 'Last Name',
    'company': 'Company',
    'position': 'Position',
}
df_titled = df_raw.rename(columns=title_case_headers)

# hash_ids / obfuscate_names are passed through to the project sanitizer;
# NOTE(review): their exact effect is defined in src.privacy_sanitizer — confirm there.
df_clean = sanitize_csv(df_titled, hash_ids=True, obfuscate_names=False)

print("Cleaned data shape:", df_clean.shape)
df_clean.head()

Cleaned data shape: (100, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).apply(_normalize_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).apply(_normalize_name)


Unnamed: 0,First Name,Last Name,Company,Position,UserID
0,Debra,Kim,ryan plc,Software Engineer,5fec4150f53f6d3e13b8d4830bb0a26149f25a7de4ea2f...
1,Zachary,Martinez,doyle ltd,Consultant,5eed9ec756ccbfabcfb926ecc6471c5a6af43e1b834fb0...
2,Jeffrey,Morgan,gallowaywyatt,Analyst,2e65f27aec6c2564bc7147177efff2c39276cc427b5cd2...
3,Brianna,Hall,garciajames,Software Engineer,3af65f1f0da1d00d1bdde8243f755d6348116da22bb076...
4,Jeffrey,Jones,allenallen,Project Coordinator,73ea3f9564bb7fc627458114f25475d083eeb2adbabc40...


In [9]:
# Ensure output directory exists
# The reports folder is named once and passed to the project helper.
reports_dir = "../results/reports"
ensure_dir(reports_dir)

INFO:strongties:Ensured directory exists: ../results/reports


In [10]:
# 5. Save cleaned data for downstream analysis
# Write the sanitized frame to CSV without the index column, then confirm
# the destination to the reader.

cleaned_csv_path = "../results/reports/cleaned_connections.csv"
df_clean.to_csv(cleaned_csv_path, index=False)
print("✅ Cleaned data saved to ../results/reports/cleaned_connections.csv")

✅ Cleaned data saved to ../results/reports/cleaned_connections.csv
