In [None]:
import os
from pathlib import Path

# Project layout.
# ROOT defaults to the original checkout location; export WILDCHAT_PROJECT_ROOT
# before starting the kernel to run the notebook against a different checkout.
# An unset or empty variable falls back to the default below.
_DEFAULT_ROOT = '/Users/adam/Documents/GitHub/AI_Chat_Conversation_Analysis_Take_Home_Interview'
ROOT = Path(os.environ.get('WILDCHAT_PROJECT_ROOT') or _DEFAULT_ROOT).expanduser()

# Data directories, one per pipeline stage.
DATA = ROOT / 'data'
RAW = DATA / '01_raw'
INTERIM = DATA / '02_interim'
PROC = DATA / '03_processed'
ANAL = DATA / '04_analysis'

# Report artefacts. IMAGES sits under REPORTS, so creating IMAGES with
# parents=True below also creates REPORTS.
REPORTS = ROOT / 'reports'
IMAGES = REPORTS / 'images'

# Create the directory tree up front; exist_ok=True keeps this cell safe to re-run.
for p in [RAW, INTERIM, PROC, ANAL, IMAGES]:
    p.mkdir(parents=True, exist_ok=True)

print('Dirs ready:', RAW, INTERIM, PROC, ANAL, IMAGES, sep='\n')


In [None]:
# Step 1: pull the WildChat sample (language='English', toxic=False, capped by
# `limit`) and persist it as JSONL in the raw-data directory.
from src.hf_loader import load_filter_to_dataframe, save_dataframe_jsonl

limit = 1000

raw_df = load_filter_to_dataframe(
    language='English',
    toxic=False,
    limit=limit,
)
print('Loaded rows:', len(raw_df))

# Later steps read the raw data back from this path.
raw_path = str(RAW / 'wildchat_english_notoxic_1000.jsonl')
save_dataframe_jsonl(raw_df, raw_path)
print('Saved:', raw_path)


In [None]:
# Step 2: text preprocessing (only relevant if the pipeline has a cleaning step).
# Feeds the raw JSONL written in step 1 through the project's preprocessing
# routine, targeting the interim directory.
from src.preprocess import preprocess_and_save_from_raw

# NOTE(review): the return value is used downstream as the path of the cleaned
# JSONL file - confirm against src.preprocess.
cleaned_jsonl = preprocess_and_save_from_raw(
    raw_path,
    output_dir=str(INTERIM),
)
print('Cleaned:', cleaned_jsonl)


In [None]:
# 3) BERTopic topic modelling over the cleaned documents
from src.bertopic_pipeline import load_docs_from_jsonl, run_bertopic, save_outputs

# Load user-visible docs from cleaned jsonl; the loader also returns
# `identities`, which is handed through to save_outputs below.
identities, docs = load_docs_from_jsonl(cleaned_jsonl)
print('Docs:', len(docs))

# `probs` is not used in this cell; it stays bound for interactive inspection.
model, topics, probs = run_bertopic(docs)

# Write into the shared REPORTS directory constant instead of re-deriving
# ROOT / 'reports', so the reports location is defined in exactly one place.
paths = save_outputs(model, docs, topics, identities, output_dir=str(REPORTS))
print('BERTopic outputs:', paths)


In [None]:
# Step 4: rule-based sentiment scoring (VADER/TextBlob), written as CSV into
# the analysis directory.
from src.sentiment_rule_based import batch_process_subset

# Destination for the rule-based scores.
rule_csv = str(ANAL / 'sentiment_rule_based_1000.csv')

# No separate subset file is built: the cleaned JSONL is scored as-is (IDs used).
subset_jsonl = cleaned_jsonl

batch_process_subset(subset_jsonl, rule_csv)
print('Rule-based sentiment saved:', rule_csv)


In [None]:
# Step 5: LLM analysis results.
# Nothing is generated here. The results file is assumed to already exist at
# PROC / 'llm_analysis_results_1000_v4.jsonl'. If it does not, plug your LLM
# pipeline in here to produce it (same schema used by _load_llm_df).
# This cell only reports whether the file exists; it does not stop the run.
llm_jsonl = str(PROC / 'llm_analysis_results_1000_v4.jsonl')
print(
    'LLM JSONL expected at:',
    llm_jsonl,
    '\nExists:',
    os.path.exists(llm_jsonl),
)


In [None]:
# 6) Build canonical unified table (entity-level)
# Combines the cleaned JSONL, the BERTopic CSVs, the rule-based sentiment CSV
# and the LLM JSONL into a single CSV under the analysis directory.
from src.synthesis import build_unified_table

unified_csv = str(ANAL / 'unified_table.csv')

# Locate the BERTopic CSVs through the shared REPORTS constant instead of
# re-deriving ROOT / 'reports', so the reports location is defined once.
# NOTE(review): the file names presumably match what save_outputs wrote in the
# BERTopic step - confirm against src.bertopic_pipeline.
paths_bertopic = {
    'doc_topics': str(REPORTS / 'bertopic_doc_topics.csv'),
    'topics': str(REPORTS / 'bertopic_topics.csv'),
}

build_unified_table(
    cleaned_jsonl_path=cleaned_jsonl,
    doc_topics_csv_path=paths_bertopic['doc_topics'],
    topics_csv_path=paths_bertopic['topics'],
    rule_based_csv_path=rule_csv,
    llm_jsonl_path=llm_jsonl,
    output_csv_path=unified_csv,
)
print('Unified table:', unified_csv)


In [None]:
# Step 7: cluster entities and attach labels.
from src.entity_clustering import cluster_entities

# The output path is the same file as the input.
# NOTE(review): presumably this overwrites the unified table in place, so
# re-running this cell would operate on already-clustered data - confirm.
clustered = cluster_entities(
    unified_csv,
    output_csv_path=unified_csv,
    min_cluster_size=4,
)
print('Clustered unified table:', clustered)


In [None]:
# Step 8: generate the visuals from the unified table, targeting the report
# images directory.
from src.visualizations import generate_all

results = generate_all(
    unified_csv,
    out_dir=str(IMAGES),
)
# Bare last expression so the notebook displays whatever generate_all returned.
results
