In [7]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
import nltk

# Load structured dataset from Layer 1.
# The scoring loop further down reads the "transcript_id", "turn_id",
# "speaker" and "text" columns from this frame.
df = pd.read_csv("structured_transcripts.csv")

# Single VADER analyzer instance, reused for every turn scored below.
analyzer = SentimentIntensityAnalyzer()

def repetition_score(text):
    """Return the share of tokens in ``text`` that belong to heavily repeated tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Every
    token that occurs three or more times contributes all of its
    occurrences to the numerator. The denominator is the token count plus
    one, so an empty text yields ``0.0`` instead of dividing by zero.

    Args:
        text: The utterance to score.

    Returns:
        A float: repeated occurrences divided by ``len(tokens) + 1``.
    """
    tokens = nltk.word_tokenize(text.lower())

    heavy_occurrences = 0
    for occurrences in Counter(tokens).values():
        # Only tokens seen more than twice count as "repetition".
        if occurrences >= 3:
            heavy_occurrences += occurrences

    return heavy_occurrences / (1 + len(tokens))

def urgency_score(text):
    """Count how many distinct urgency keywords appear in ``text`` as whole words.

    Matching is case-insensitive and done on whole words, not substrings,
    so "know" or "snow" does not trigger "now" and "unimportant" does not
    trigger "important". Each keyword counts at most once no matter how
    often it occurs. Inflected forms (e.g. "complaints", "urgently") are
    not matched.

    Args:
        text: The utterance to score.

    Returns:
        An int between 0 and the number of keywords (7).
    """
    urgent_words = {
        "urgent", "immediately", "asap", "now",
        "today", "important", "complaint",
    }
    # Turn every non-alphanumeric character into a space so punctuation
    # attached to a word ("now!", "today's") does not hide the keyword.
    normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    return len(urgent_words.intersection(normalized.split()))

rows = []

for _, row in df.iterrows():
    # Empty CSV cells are read as NaN; str(NaN) would be the literal "nan",
    # which would then be scored and written out as if it were real speech.
    raw_text = row["text"]
    text = "" if pd.isna(raw_text) else str(raw_text)

    # VADER's "compound" value is the single overall polarity score.
    sentiment = analyzer.polarity_scores(text)["compound"]
    repetition = repetition_score(text)
    urgency = urgency_score(text)

    # Strong feeling in either direction counts, hence abs() on sentiment.
    importance = abs(sentiment) + repetition + urgency

    rows.append({
        "transcript_id": row["transcript_id"],
        "turn_id": row["turn_id"],
        "speaker": row["speaker"],
        "text": text,
        "sentiment": sentiment,
        "repetition_score": repetition,
        "urgency_score": urgency,
        "importance": importance
    })

causal_df = pd.DataFrame(rows)

# Rank turns inside each transcript (1 = most important).
# method="first" breaks ties by row order so ranks are the distinct values
# 1..n. The default "average" hands tied turns a shared fractional rank,
# which makes the "<= 3" cut-off below flag anywhere from zero to all of
# the tied turns.
causal_df["rank"] = (
    causal_df.groupby("transcript_id")["importance"]
    .rank(method="first", ascending=False)
)

# Mark top evidence: at most three turns per transcript.
causal_df["is_top_evidence"] = causal_df["rank"] <= 3

# Save output
causal_df.to_csv("layer3_causal_scores.csv", index=False)

print("Layer 3 complete: causal signals generated.")

Layer 3 complete: causal signals generated.
