# Notebook 01: SWOW Data Loader
This notebook ingests the *Small World of Words* (SWOW) dataset, building a directed word-association graph.
- **Input:** Raw SWOW data files (e.g., `data/raw_public/swow/en/` for English, containing cue-response records).
- **Output:** Graph structures for each language (networkx graphs), summary statistics, and provenance (e.g., `provenance.yaml`).
- **Steps:** Load SWOW responses, construct a directed graph of word associations, and compute basic metrics (node count, edge count, degree distribution) for validation.
- *Reproducibility:* Set a fixed random seed for any sampling (if applicable) and log dataset version/URL.

In [1]:
# Import required libraries
import pandas as pd
import networkx as nx
import numpy as np
import os

# Parameters
LANG = 'en'  # example: English language
raw_dir = os.path.join(os.getcwd(), '..', 'data', 'raw_public', 'swow', LANG)
file_name = 'SWOW-EN.complete.20180827.csv'  # raw data file (all three responses)

# Load SWOW raw data
swow_df = pd.read_csv(os.path.join(raw_dir, file_name), sep=',')
print(f"Loaded {len(swow_df)} association responses from SWOW-{LANG.upper()}.")

Loaded 1356362 association responses from SWOW-EN.


In [2]:
# Build directed graph from SWOW data
G = nx.DiGraph()
# Assuming swow_df has columns: Cue, R1, R2, R3 for responses
for idx, row in swow_df.iterrows():
    cue = row['cue'] if 'cue' in row else row[0]
    if pd.isna(cue):
        continue
    cue = str(cue).strip()
    # Add edges for each response (first response R1 given most weight perhaps)
    for resp_col in ['R1', 'R2', 'R3']:
        if resp_col in row and not pd.isna(row[resp_col]):
            response = str(row[resp_col]).strip()
            if response:
                # Increment weight if edge exists
                if G.has_edge(cue, response):
                    G[cue][response]['weight'] += 1
                else:
                    G.add_edge(cue, response, weight=1)

# Graph summary
print(f"Graph constructed: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")
degree_sequence = [deg for _, deg in G.degree()]
print(f"Average degree: {np.mean(degree_sequence):.2f}")

Graph constructed: 166540 nodes, 1537892 edges.
Average degree: 18.47


In [3]:
# Save the graph for use in other notebooks
import pickle
processed_dir = os.path.join(os.getcwd(), '..', 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)
graph_file = os.path.join(processed_dir, f'swow_graph_{LANG}.pkl')
with open(graph_file, 'wb') as f:
    pickle.dump(G, f)
print(f"Graph saved to {graph_file}")

Graph saved to /home/agourakis82/workspace/pcs-meta-repo/notebooks/../data/processed/swow_graph_en.pkl


In [4]:
# Save the graph for use in other notebooks
import pickle
processed_dir = os.path.join(os.getcwd(), '..', 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)
graph_file = os.path.join(processed_dir, f'swow_graph_{LANG}.gpickle')
nx.write_gpickle(G, graph_file)
print(f"Graph saved to {graph_file}")

AttributeError: module 'networkx' has no attribute 'write_gpickle'