# In [None]:
import os, re, io, json, requests, zipfile, nltk
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

# Tokenizer models used by nltk.word_tokenize in the sentiment step.
nltk.download('punkt')

# --- 1️⃣ Fetch performers folder from GitHub ---
url = "https://github.com/ahp9/social_graphs_02805_ass_2/archive/refs/heads/main.zip"
# Without a timeout, requests waits indefinitely on a stalled connection.
r = requests.get(url, timeout=60)
r.raise_for_status()
# The archive is unpacked into the current working directory; the context
# manager closes the in-memory zip handle once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# Folder created by the extraction above; holds one .txt file per performer.
DATA_DIR = "social_graphs_02805_ass_2-main/performers"

# --- 2️⃣ Load LabMT sentiment dataset ---
labmt_url = "https://raw.githubusercontent.com/ahp9/social_graphs_02805_ass_2/main/Data_Set_S1.txt"
# Tab-separated file; the first three lines are skipped before the header row.
labmt = pd.read_csv(labmt_url, sep="\t", skiprows=3)
# word -> {column name: value}; the sentiment step reads "happiness_average".
word_to_scores = labmt.set_index("word").to_dict(orient="index")

# --- 3️⃣ Define helper functions ---
# Runs of word characters.
WORD_RE = re.compile(r"\w+")
# Captures the target of a wiki link: the text after "[[" up to the first
# "]", "|" or "#". The lookahead (instead of a literal "]") lets piped links
# ("[[Target|label]]") and section links ("[[Target#Section]]") match too;
# requiring "]" right after the target silently dropped both forms.
LINK_RE = re.compile(r"\[\[([^\]|#]+)(?=[\]|#])")
# Template names accepted as the page's infobox (matched case-insensitively
# by extract_infobox).
INFOBOX_NAMES = (
    "Infobox musical artist",
    "Infobox musical artist2",
    "Infobox musical artist/Person",
    "Infobox person",
)

def file_to_title(fname):
    """Turn a performer file name into a page title.

    Every ".txt" occurrence is removed and every underscore becomes a space.
    """
    without_ext = "".join(fname.split(".txt"))
    return " ".join(without_ext.split("_"))
def normalize(title):
    """Return a lookup key for *title*: lower-cased, keeping only a-z and 0-9."""
    kept = [
        ch
        for ch in title.lower()
        if "a" <= ch <= "z" or "0" <= ch <= "9"
    ]
    return "".join(kept)

# --- Infobox extraction helpers ---
def extract_infobox(text, infobox_names=None):
    """Return the first recognised infobox template in *text*, braces included.

    Args:
        text: Raw wikitext of a page.
        infobox_names: Optional iterable of template names to look for
            (matched case-insensitively). Defaults to the module-level
            ``INFOBOX_NAMES``.

    Returns:
        The substring from the opening ``{{`` of the infobox to its matching
        ``}}``, or ``None`` when no infobox is found or its braces never
        balance.
    """
    names = INFOBOX_NAMES if infobox_names is None else tuple(infobox_names)
    if not names:
        return None
    start_pat = r"\{\{\s*(?:" + "|".join(map(re.escape, names)) + r")\b"
    m = re.search(start_pat, text, flags=re.IGNORECASE)
    if not m:
        return None

    start = m.start()
    pos = start
    depth = 0
    last = len(text) - 1
    while pos < last:
        two = text[pos:pos + 2]
        if two == "{{":
            depth += 1
            pos += 2
        elif two == "}}":
            depth -= 1
            pos += 2
            if depth == 0:
                return text[start:pos]
            # Do NOT advance an extra character here: a nested template is
            # often followed directly by more braces ("}}}}" or "}}{{"), and
            # skipping one would desynchronise the depth counter.
        else:
            pos += 1
    return None

def split_top_level_params(infobox):
    """Split an infobox template into its top-level ``key = value`` fields.

    Fields are separated by a newline followed by ``|`` (optionally indented
    with spaces or tabs) outside any nested ``{{...}}`` template. Keys are
    case-folded; a later duplicate key overwrites an earlier one. Chunks
    without ``=`` are ignored.

    Args:
        infobox: Wikitext of one template, normally ``{{Infobox ...}}``.

    Returns:
        Dict mapping case-folded parameter names to their stripped raw values;
        empty when the template has no ``|`` after its opening braces.
    """
    first_bar = infobox.find("|", infobox.find("{{") + 2)
    if first_bar == -1:
        return {}
    body = infobox[first_bar:]

    # Drop the template's own closing braces so they do not end up inside the
    # value of the last parameter.
    if infobox.lstrip().startswith("{{"):
        trimmed = body.rstrip()
        if trimmed.endswith("}}"):
            body = trimmed[:-2]

    params = {}

    def flush_field(chunk):
        chunk = chunk.lstrip("|").strip()
        if "=" in chunk:
            key, value = chunk.split("=", 1)
            params[key.strip().casefold()] = value.strip()

    buf = []
    depth = 0
    pos = 0
    size = len(body)
    while pos < size:
        two = body[pos:pos + 2]
        if two == "{{":
            depth += 1
            buf.append(two)
            pos += 2
            continue
        if two == "}}":
            depth = max(0, depth - 1)
            buf.append(two)
            pos += 2
            continue
        if depth == 0 and body[pos] == "\n":
            # A new field starts at "\n|"; tolerate indentation before the
            # pipe ("\n  | key = value"), which is a common infobox layout.
            nxt = pos + 1
            while nxt < size and body[nxt] in " \t":
                nxt += 1
            if nxt < size and body[nxt] == "|":
                flush_field("".join(buf))
                buf = []
                pos = nxt + 1
                continue
        buf.append(body[pos])
        pos += 1
    flush_field("".join(buf))
    return params

def extract_genre_block_from_infobox(infobox_text):
    """Return the raw value of the infobox's ``genre`` field, or "" if absent."""
    fields = split_top_level_params(infobox_text)
    if "genre" in fields:
        return fields["genre"]
    return ""

# Template names that merely wrap a list of values.
LIST_WRAPPERS = ("hlist", "flatlist", "ubl", "unbulleted list", "plainlist", "plain list")

# Opening of a list-wrapper template, e.g. "{{hlist|" or "{{Flatlist |".
# Template names are commonly capitalised in wikitext, hence IGNORECASE.
_LIST_WRAPPER_OPEN_RE = re.compile(
    r"\{\{\s*(?:" + "|".join(map(re.escape, LIST_WRAPPERS)) + r")\s*\|",
    flags=re.IGNORECASE,
)


def clean_genre_text(s):
    """Strip markup noise from a raw infobox genre value.

    Removes HTML comments, turns ``<br>`` (any case, optional slash) into a
    comma, drops remaining HTML tags, removes list-wrapper openings and all
    ``{{`` / ``}}`` pairs, and strips leading ``*`` bullets from each line.
    Wiki links (``[[...]]``) are left untouched.

    Args:
        s: Raw wikitext of the genre field; may be empty or ``None``.

    Returns:
        The cleaned text, or "" for falsy input.
    """
    if not s:
        return ""
    s = re.sub(r"<!--.*?-->", "", s, flags=re.DOTALL)
    # Case-insensitive so "<BR>" / "<Br />" become separators instead of being
    # deleted by the generic tag rule below (which would glue values together).
    s = re.sub(r"<br\s*/?>", ",", s, flags=re.IGNORECASE)
    s = re.sub(r"<[^>]+>", "", s)
    s = _LIST_WRAPPER_OPEN_RE.sub("", s)
    s = s.replace("{{", "").replace("}}", "")
    s = re.sub(r"^\s*\*+\s*", "", s, flags=re.MULTILINE)
    # Catches a second bullet exposed by the substitution above ("* * x").
    s = s.replace("\n*", "\n")
    return s

def extract_genres(genre_block):
    """Return the genre labels linked in an infobox genre value.

    The text is cleaned with ``clean_genre_text``, citation residue is removed,
    and the display label of every ``[[...]]`` link (the part after the last
    ``|``, or the whole link text) is collected. Labels are whitespace-collapsed
    and de-duplicated case-insensitively, keeping first-seen order and spelling.
    """
    cleaned = clean_genre_text(genre_block)
    # Drop "cite web ..." runs up to the next link (or the end of the text).
    cleaned = re.sub(r"cite\s+web[\s\S]*?(?=(\[\[|$))", "", cleaned)
    # Drop leftover citation parameters such as "url = ..." or "title = ...".
    cleaned = re.sub(
        r"\b(?:url|title|publisher|access-date|work|last|first)\s*=\s*[^|}\n]*",
        "",
        cleaned,
    )

    noise_markers = ("http", "cite", "url", "access-date")
    unique = {}
    for link in re.findall(r"\[\[([^\]]+)\]\]", cleaned):
        label = link.rpartition("|")[2].strip()
        if not label:
            continue
        if any(marker in label for marker in noise_markers):
            continue
        collapsed = re.sub(r"\s+", " ", label).strip()
        # setdefault keeps the first spelling seen for each case-folded key.
        unique.setdefault(collapsed.casefold(), collapsed)
    return list(unique.values())

def canonicalize_genre(g):
    """Map a genre label to a canonical lower-case spelling.

    Curly apostrophes and ``&amp;`` are normalised, whitespace is collapsed and
    the text is case-folded. Then common variants are unified:
    "rock 'n' roll" / "rock n roll" / "rock & roll" -> "rock and roll",
    "hip-hop" / "hiphop" -> "hip hop", and
    "rhythm and blues" / "rhythm & blues" -> "r&b".

    Args:
        g: Genre label as extracted from wikitext.

    Returns:
        The canonical, case-folded genre string.
    """
    s = (
        g.strip()
        .replace("\u2018", "'")  # left single quotation mark
        .replace("\u2019", "'")  # right single quotation mark
        .replace("&amp;", "&")
    )
    s = re.sub(r"\s+", " ", s).strip().casefold()
    # Apostrophes are already straight here, so one optional-quote pattern
    # covers 'n', 'n, n' and a bare n.
    s = re.sub(r"\brock\s*(?:'?n'?|&|and)\s*roll\b", "rock and roll", s)
    s = re.sub(r"\bhip[-\s]?hop\b", "hip hop", s)
    s = re.sub(r"\brhythm\s*(?:&|and)\s*blues\b", "r&b", s)
    return s.strip()

# --- 4️⃣ Build the artist graph ---
# Sorted so node order, title-collision winners and edge insertion order do
# not depend on the directory listing order (which is arbitrary).
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".txt"))
performers = [file_to_title(f) for f in files]
# normalised title -> canonical title, used to resolve link targets.
norm2canon = {normalize(t): t for t in performers}

graph = nx.DiGraph()
graph.add_nodes_from(performers)

for fname in files:
    src = file_to_title(fname)
    path = os.path.join(DATA_DIR, fname)
    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read()
    graph.nodes[src]["text"] = text

    infobox = extract_infobox(text)
    if infobox:
        genre_block = extract_genre_block_from_infobox(infobox)
        genres = [canonicalize_genre(g) for g in extract_genres(genre_block)]
        if genres:
            # dict.fromkeys de-duplicates while keeping infobox order;
            # list(set(...)) produced a different order on every run.
            graph.nodes[src]["genres"] = list(dict.fromkeys(genres))

    # find outgoing links: keep only targets that are known performers,
    # without duplicates or self-links
    unique_targets = set()
    for raw in LINK_RE.findall(text):
        tgt_norm = normalize(raw)
        if tgt_norm in norm2canon:
            dst = norm2canon[tgt_norm]
            if dst != src:
                unique_targets.add(dst)
    for dst in sorted(unique_targets):
        graph.add_edge(src, dst)

# Link direction is dropped for the rest of the analysis.
graph = graph.to_undirected()
print(f"Graph: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")

# --- 5️⃣ Sentiment analysis ---
def calculate_sentiment(tokens, lexicon=None, default=0):
    """Return the mean ``happiness_average`` of the tokens found in a lexicon.

    Args:
        tokens: Iterable of word strings; each is lower-cased before lookup.
        lexicon: Mapping ``word -> {"happiness_average": float, ...}``.
            Defaults to the module-level ``word_to_scores``.
        default: Value returned when no token is present in the lexicon
            (``0`` keeps the historical behaviour).

    Returns:
        The arithmetic mean over all matched tokens (repeats count each time),
        or *default* when nothing matched.
    """
    if lexicon is None:
        lexicon = word_to_scores
    matched = [
        lexicon[word]["happiness_average"]
        for word in (token.lower() for token in tokens)
        if word in lexicon
    ]
    return sum(matched) / len(matched) if matched else default

# Score every page: tokenize its lower-cased text and store the result on the
# node as "sentiment" (nodes without text are scored on an empty string).
for node, attrs in graph.nodes(data=True):
    page_text = attrs.get("text", "")
    attrs["sentiment"] = calculate_sentiment(nltk.word_tokenize(page_text.lower()))

# Collect the stored scores, skipping any that are None.
sentiments = [
    score
    for _, score in graph.nodes(data="sentiment")
    if score is not None
]

# Summary statistics over all collected scores.
mean_sent = np.mean(sentiments)
median_sent = np.median(sentiments)
variance_sent = np.var(sentiments)
q25, q75 = np.percentile(sentiments, [25, 75])

print(f"Mean: {mean_sent:.2f}, Median: {median_sent:.2f}, Var: {variance_sent:.2f}")

# --- 6️⃣ Plot the sentiment histogram ---
# Histogram of per-page sentiment with the mean and median marked.
fig, ax = plt.subplots()
ax.hist(sentiments, bins=20, color="lightgray", edgecolor="black")

mean_label = f"Mean = {mean_sent:.2f}"
median_label = f"Median = {median_sent:.2f}"
ax.axvline(mean_sent, color="darkred", linestyle="--", linewidth=2, label=mean_label)
ax.axvline(median_sent, color="black", linestyle="-", linewidth=2, label=median_label)

# Axes.set applies the axis labels and the title in one call.
ax.set(xlabel="Sentiment", ylabel="Frequency", title="Sentiment Histogram")
ax.legend()
plt.tight_layout()
plt.show()

# --- 7️⃣ (Optional) Save graph with genres + sentiment ---
# The GEXF writer interprets list-valued attributes as dynamic
# (value, start, end) triples, so a plain list of genre strings cannot be
# written as-is. Export a copy whose sequence attributes are flattened to
# "; "-joined strings; the in-memory `graph` keeps its original lists.
export_graph = graph.copy()
for _, attrs in export_graph.nodes(data=True):
    for key, value in list(attrs.items()):
        if isinstance(value, (list, tuple, set)):
            attrs[key] = "; ".join(str(item) for item in value)
nx.write_gexf(export_graph, "artist_graph_with_genres_sentiment.gexf")
print("✅ Saved artist_graph_with_genres_sentiment.gexf")
