In [16]:
# ✅ Cell 1: Imports
from pathlib import Path
import pandas as pd
import networkx as nx


In [17]:
# ✅ Cell 2: Load frequency data and initialize graph
langs = ["kurmanci", "zazaki", "turkish"]

PROJECT_ROOT = Path.cwd().parents[0]
FREQ_DIR = PROJECT_ROOT / "outputs" / "frequencies"

graph = nx.Graph()

for lang in langs:
    freq_path = FREQ_DIR / lang / "gt_word_freq.csv"
    if freq_path.exists():
        df_freq = pd.read_csv(freq_path).dropna()
        for _, row in df_freq.iterrows():
            word = row[0]
            freq = int(row["gt_freq"]) if "gt_freq" in row else row[1]
            graph.add_node(word, label="Word", language=lang, frequency=freq)
            graph.add_node(lang, label="Language")
            graph.add_edge(word, lang, type="spoken_in")
    else:
        print(f"⚠️ Missing file: {freq_path}")


  word = row[0]
  freq = int(row["gt_freq"]) if "gt_freq" in row else row[1]
  word = row[0]
  freq = int(row["gt_freq"]) if "gt_freq" in row else row[1]
  word = row[0]


In [18]:
# ✅ Cell 3: Load co-occurrence graphs
COOCCURRENCE_DIR = PROJECT_ROOT / "outputs" / "visualizations"

for lang in langs:
    gexf_path = COOCCURRENCE_DIR / f"{lang}_gt_word_cooccurrence.gexf"
    if gexf_path.exists():
        subgraph = nx.read_gexf(gexf_path)
        for u, v, d in subgraph.edges(data=True):
            graph.add_edge(u, v, type="related_to", weight=d.get("weight", 1))
    else:
        print(f"⚠️ Missing co-occurrence file: {gexf_path}")

In [19]:
# ✅ Cell 4: Save Knowledge Graph
OUTPUT_DIR = PROJECT_ROOT / "outputs" / "knowledge_graph"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

output_path = OUTPUT_DIR / "dialectal_word_graph.gexf"
nx.write_gexf(graph, output_path)

print(f"✅ Knowledge graph saved to: {output_path}")
print(f"🔢 Nodes: {len(graph.nodes)}, Edges: {len(graph.edges)}")

✅ Knowledge graph saved to: c:\Users\berfi\Documents\GitHub\dialectalwordmining\outputs\knowledge_graph\dialectal_word_graph.gexf
🔢 Nodes: 24807, Edges: 106561
