# In [5]:
# Imports
import pandas as pd
import networkx as nx
from pathlib import Path
import community.community_louvain as louvain

# 1) Load the dataset (path is relative to the current working directory)
df = pd.read_csv("netflix_titles.csv")

# 2) Filter: countries
# Allow-list of country names; mapear_paises intersects each title's
# country list with this set, so any other country is ignored.
PAISES_PERMITIDOS = {
    "Australia", "Brazil", "Canada", "France", "Italy", "Japan",
    "South Korea", "Spain", "United Kingdom", "United States"
}

# 3) Filter: genres
# Allow-list of values from the "listed_in" column; preparar_df keeps only
# these genres and drops titles that have none of them. The same set is used
# later to tell genre nodes apart from person nodes in the graph.
GENEROS_PERMITIDOS = {
    "Action & Adventure",
    "Classic Movies",
    "Cult Movies",
    "Comedies",
    "Dramas",
    "Horror Movies",
    "International Movies",
    "Romantic Movies",
    "Sci-Fi & Fantasy"
}

# 4) Prepare the DataFrame
def preparar_df(df, col_pessoa, generos_permitidos=None):
    """Return a copy of *df* with list columns for people, genres and countries.

    Rows missing ``col_pessoa``, ``"listed_in"`` or ``"country"`` are dropped.
    Three columns are added:

    * ``pessoa_list``  - names parsed from ``col_pessoa``
    * ``genre_list``   - genres parsed from ``"listed_in"``, restricted to the
      allowed genres
    * ``country_list`` - countries parsed from ``"country"``

    Rows whose ``genre_list`` ends up empty are removed.

    Args:
        df: DataFrame with ``col_pessoa``, ``"listed_in"`` and ``"country"``
            columns holding comma-separated strings.
        col_pessoa: Name of the column that lists the people of each title
            (the callers in this file pass ``"cast"`` and ``"director"``).
        generos_permitidos: Optional collection of genres to keep. Defaults
            to the module-level ``GENEROS_PERMITIDOS``.

    Returns:
        The filtered DataFrame with the three list columns added.
    """
    generos = GENEROS_PERMITIDOS if generos_permitidos is None else generos_permitidos

    def _split(valor):
        # Split on the bare comma and strip each token. Splitting on ", "
        # leaves malformed tokens for values such as "United States," or
        # ", France" ("United States,", ""), which would then never match
        # the allow-lists.
        return [token.strip() for token in valor.split(",") if token.strip()]

    df2 = df.dropna(subset=[col_pessoa, "listed_in", "country"]).copy()
    df2["pessoa_list"] = df2[col_pessoa].apply(_split)
    df2["country_list"] = df2["country"].apply(_split)
    df2["genre_list"] = df2["listed_in"].apply(
        lambda valor: [g for g in _split(valor) if g in generos]
    )

    # Keep only titles that still have at least one allowed genre.
    return df2[df2["genre_list"].apply(len) > 0]

# 5) Build the overall bipartite graph
def criar_grafo(df_prep):
    """Build an undirected person-genre graph from a prepared DataFrame.

    For every row, each name in ``pessoa_list`` is linked to each genre in
    ``genre_list``. People and genres are stored as plain string nodes in the
    same graph.

    Args:
        df_prep: DataFrame with ``pessoa_list`` and ``genre_list`` columns.

    Returns:
        A ``networkx.Graph`` containing one edge per (person, genre) pair.
    """
    grafo = nx.Graph()
    for pessoas, generos in zip(df_prep["pessoa_list"], df_prep["genre_list"]):
        grafo.add_edges_from(
            (pessoa, genero) for pessoa in pessoas for genero in generos
        )
    return grafo


# 6) Map person -> countries
def mapear_paises(df_prep, paises_permitidos=None):
    """Map each person to the allowed countries of the titles they appear in.

    Args:
        df_prep: DataFrame with ``pessoa_list`` and ``country_list`` columns.
        paises_permitidos: Optional collection of country names to keep.
            Defaults to the module-level ``PAISES_PERMITIDOS``.

    Returns:
        A dict ``{person: set_of_countries}``. People whose titles have no
        allowed country do not appear in the dict.
    """
    permitidos = PAISES_PERMITIDOS if paises_permitidos is None else paises_permitidos
    mapping = {}
    for pessoas, paises in zip(df_prep["pessoa_list"], df_prep["country_list"]):
        # The intersection depends only on the row, so compute it once per
        # title rather than once per person.
        paises_validos = set(paises).intersection(permitidos)
        if not paises_validos:
            continue
        for pessoa in pessoas:
            mapping.setdefault(pessoa, set()).update(paises_validos)
    return mapping


# 7) Top N per country
def topN_por_pais(G, pais_map, N=10):
    """Select the N highest-PageRank people of every country.

    PageRank is computed on the whole graph *G*. Each scored node that has an
    entry in *pais_map* is listed under every country mapped to it; nodes
    without an entry are ignored.

    Args:
        G: Graph on which PageRank is computed.
        pais_map: Dict ``{person: iterable_of_countries}``.
        N: How many people to keep per country.

    Returns:
        The set of all people that rank in the top N of at least one country.
    """
    scores = nx.pagerank(G)

    # Group (person, score) pairs by country.
    por_pais = {}
    for nome, valor in scores.items():
        for pais in pais_map.get(nome, []):
            if pais not in por_pais:
                por_pais[pais] = []
            por_pais[pais].append((nome, valor))

    # Keep the N best-scored people of each country.
    escolhidos = set()
    for candidatos in por_pais.values():
        candidatos.sort(key=lambda par: par[1], reverse=True)
        escolhidos |= {nome for nome, _ in candidatos[:N]}

    return escolhidos

# 8) Top N per genre (N defaults to 10)
def top10_por_genero(G, pessoas_sel, N=10, generos_permitidos=None):
    """Rank, for each genre node, its selected neighbours by degree.

    Args:
        G: Graph whose nodes include genres and people.
        pessoas_sel: Collection of people eligible for the ranking.
        N: Maximum number of people kept per genre. Defaults to 10, the
            cutoff that used to be hard-coded.
        generos_permitidos: Optional collection of genre names that identifies
            the genre nodes. Defaults to the module-level
            ``GENEROS_PERMITIDOS``.

    Returns:
        A dict ``{genre: [(person, degree), ...]}`` sorted by descending
        degree and truncated to N entries. Genres with no selected neighbour
        are omitted.
    """
    generos = GENEROS_PERMITIDOS if generos_permitidos is None else generos_permitidos
    resultados = {}

    for genero in G.nodes():
        if genero not in generos:
            continue
        vizinhos = [n for n in G.neighbors(genero) if n in pessoas_sel]
        if not vizinhos:
            continue
        # The sort is stable, so people with equal degree keep the order in
        # which G.neighbors() yields them.
        ranking = sorted(
            ((p, G.degree(p)) for p in vizinhos),
            key=lambda par: par[1],
            reverse=True,
        )
        resultados[genero] = ranking[:N]

    return resultados

# 9) Build the final subgraph
def criar_subgrafo_final(G, ranking):
    """Return a copy of *G* restricted to the ranked genres and people.

    Args:
        G: Full graph.
        ranking: Dict ``{genre: [(person, score), ...]}``.

    Returns:
        An independent copy of the subgraph induced by every genre key in
        *ranking* and every person listed under any genre.
    """
    manter = set(ranking)  # the genre keys
    for lista in ranking.values():
        manter |= {pessoa for pessoa, _ in lista}
    return G.subgraph(manter).copy()


# 10) Write the CSV files for Flourish
def exportar_flourish(G, pessoas_set, country_map, pasta):
    """Export graph *G* as CSV files into the folder *pasta*.

    Files written (the folder is created if needed):

    * ``edges.csv``         - one ``source,target`` row per edge
    * ``nodes.csv``         - node ``id`` and ``type`` (``"A"`` when the node
      is in *pessoas_set*, ``"B"`` otherwise)
    * ``graph_metrics.csv`` - degree, closeness, PageRank and betweenness per
      node, their min-max normalized ``*_norm`` versions, plus ``country``
      and ``genres_list``
    * ``points.csv``        - ``id``, ``country``, ``genres_list`` and the
      normalized metrics only

    For nodes in *pessoas_set*, ``genres_list`` holds the neighbouring genres
    and ``country`` holds the countries from *country_map*. For every other
    node, ``country`` is empty and ``genres_list`` holds the neighbouring
    people instead.

    Args:
        G: Graph to export.
        pessoas_set: Collection of node ids that are people.
        country_map: Dict ``{person: iterable_of_countries}``.
        pasta: Output directory path.
    """
    out = Path(pasta)
    out.mkdir(parents=True, exist_ok=True)

    nodes = list(G.nodes())

    # edges.csv - materialize the edge view so an empty graph still yields a
    # well-formed, header-only file.
    pd.DataFrame(list(G.edges()), columns=["source", "target"]).to_csv(
        out / "edges.csv", index=False
    )

    # nodes.csv
    pd.DataFrame({
        "id": nodes,
        "type": ["A" if n in pessoas_set else "B" for n in nodes],
    }).to_csv(out / "nodes.csv", index=False)

    # Metrics
    degree = dict(G.degree())
    closeness = nx.closeness_centrality(G)
    pagerank = nx.pagerank(G)

    # Betweenness: exact for small graphs, sampled from a fixed number of
    # source nodes for larger ones. Sampling is seeded so repeated exports of
    # the same graph give identical files, and k is never passed for graphs
    # with fewer nodes than the sample size (including the empty graph).
    max_pivots = 30
    if len(nodes) > max_pivots:
        betweenness = nx.betweenness_centrality(G, k=max_pivots, seed=42)
    else:
        betweenness = nx.betweenness_centrality(G)

    metrics = pd.DataFrame({
        "id": nodes,
        "Degree": [degree[n] for n in nodes],
        "Closeness": [closeness[n] for n in nodes],
        "PageRank": [pagerank[n] for n in nodes],
        "Betweenness": [betweenness[n] for n in nodes],
    })

    # Min-max normalization; a constant column maps to 0.0 to avoid 0/0.
    for col in ["Degree", "Closeness", "PageRank", "Betweenness"]:
        mn, mx = metrics[col].min(), metrics[col].max()
        metrics[col + "_norm"] = (metrics[col] - mn) / (mx - mn) if mx != mn else 0.0

    # country and genres_list
    country = []
    genres_list = []
    for n in nodes:
        if n in pessoas_set:
            country.append(", ".join(sorted(country_map.get(n, []))))
            neigh = [x for x in G.neighbors(n) if x in GENEROS_PERMITIDOS]
        else:
            country.append("")
            neigh = [x for x in G.neighbors(n) if x in pessoas_set]
        genres_list.append(", ".join(sorted(neigh)))

    metrics["country"] = country
    metrics["genres_list"] = genres_list

    metrics.to_csv(out / "graph_metrics.csv", index=False)

    points = metrics[[
        "id", "country", "genres_list",
        "Degree_norm", "PageRank_norm", "Closeness_norm", "Betweenness_norm",
    ]]
    points.to_csv(out / "points.csv", index=False)

    print("Exportado para:", pasta)


# 11) Run the pipeline for actors (people come from the "cast" column)
df_actors_prep = preparar_df(df, "cast")
G_actors = criar_grafo(df_actors_prep)                   # full actor-genre graph
actor_country = mapear_paises(df_actors_prep)            # actor -> allowed countries
topA = topN_por_pais(G_actors, actor_country, N=10)      # top 10 actors per country (PageRank)
rankingA = top10_por_genero(G_actors, topA)              # per genre, top 10 of those actors by degree
G_finalA = criar_subgrafo_final(G_actors, rankingA)      # subgraph with only ranked actors and genres
exportar_flourish(G_finalA, topA, actor_country, "output/atores_final_filtrado_generos_especificos")


# 12) Run the pipeline for directors (people come from the "director" column)
df_dir_prep = preparar_df(df, "director")
G_directors = criar_grafo(df_dir_prep)                       # full director-genre graph
director_country = mapear_paises(df_dir_prep)                # director -> allowed countries
topD = topN_por_pais(G_directors, director_country, N=10)    # top 10 directors per country (PageRank)
rankingD = top10_por_genero(G_directors, topD)               # per genre, top 10 of those directors by degree
G_finalD = criar_subgrafo_final(G_directors, rankingD)       # subgraph with only ranked directors and genres
exportar_flourish(G_finalD, topD, director_country, "output/diretores_final_filtrado_generos_especificos")

# Recorded cell output:
# ✅ Exportado para: output/atores_final_filtrado_generos_especificos
# ✅ Exportado para: output/diretores_final_filtrado_generos_especificos
