In [None]:
#@title Connect to gdrive
from google.colab import drive as gdrive

# Attach Google Drive inside the Colab VM; the dataset paths used by later
# cells live under this mount point.
gdrive.mount('/content/drive/')

In [None]:
#@title Data loading utility
import os
import tarfile
import pandas as pd
import json
import networkx as nx

class DatasetLoader:
    """Unpacks a dataset archive and reads the per-locale edge and genre files.

    Every path is resolved relative to ``data_dir``.
    """

    def __init__(self, data_dir):
        """Store the directory that holds the archive and the extracted files."""
        self.data_dir = data_dir

    def extract_dataset(self, file_name):
        """Extract the tar archive ``file_name`` (located in ``data_dir``) into ``data_dir``.

        Args:
            file_name: Archive file name, relative to ``data_dir``.

        Returns:
            The directory the archive was extracted into (``self.data_dir``).
        """
        full_path = os.path.join(self.data_dir, file_name)
        with tarfile.open(full_path) as tar:
            # Prefer the "data" extraction filter where the interpreter offers it:
            # it rejects members that would be written outside the target
            # directory and avoids the no-filter DeprecationWarning on 3.12+.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=self.data_dir, filter="data")
            else:
                tar.extractall(path=self.data_dir)
        print(f"Dataset extracted to {self.data_dir}")
        return self.data_dir

    def load_data(self, locale, subfolder=''):
        """Read ``<locale>_edges.csv`` and ``<locale>_genres.json``.

        Args:
            locale: File-name prefix, e.g. 'HR' for Croatia, 'HU' for Hungary,
                'RO' for Romania.
            subfolder: Optional directory below ``data_dir`` containing the files.

        Returns:
            A tuple ``(edges, genres)``: the edge list as a pandas DataFrame and
            the decoded JSON content of the genres file.
        """
        base_dir = os.path.join(self.data_dir, subfolder)
        edges_file = os.path.join(base_dir, locale + '_edges.csv')
        genres_file = os.path.join(base_dir, locale + '_genres.json')

        edges = pd.read_csv(edges_file)
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(genres_file, 'r', encoding='utf-8') as f:
            genres = json.load(f)
        return edges, genres

# Archive file name and the Drive folder that contains it; the same folder is
# also where the archive gets unpacked.
DATASET_ARCHIVE = "gemsec_deezer_dataset.tar.gz"
EXTRACT_DIR = "/content/drive/MyDrive/network_science/group_project_deezer/"

# One loader instance is shared by every locale cell further down.
dataset_loader = DatasetLoader(data_dir=EXTRACT_DIR)
dataset_loader.extract_dataset(file_name=DATASET_ARCHIVE)

# Preprocess data for RO

In [None]:
#@title load data for RO
LOCALE = "RO"

# Read the RO edge list and genre mapping from the extracted data folder,
# then build a graph from the node_1/node_2 columns.
edges, genres = dataset_loader.load_data(locale=LOCALE, subfolder='deezer_clean_data')
G_RO = nx.from_pandas_edgelist(df=edges, source='node_1', target='node_2')

In [None]:
#@title find communities using louvain
from networkx.algorithms import community

# Louvain partition of the RO graph, run with a fixed seed (42).
louvain_partition_RO = community.louvain_communities(G_RO, seed=42)
communities = list(louvain_partition_RO)

In [None]:
#@title Precompute graph node details and save them to CSV
import csv

# Column order of the exported CSV. This must be an ordered sequence: with a
# set, the column order follows string hashing and can change between kernel
# runs, which makes the output files non-reproducible.
csv_columns = [
    "node_id",
    "community_id",
    "community_size",
    "betweenness_centrality",
    "degree_centrality",
    "closeness_centrality",
    "genres",
]

# node id -> row dict; nodes without a genre entry are reported and skipped.
community_data = {}
# The loop variable is deliberately not called `community`, so the
# `networkx.algorithms.community` module imported earlier is not shadowed.
for community_id, members in enumerate(communities):
    # Centralities are computed on the community's induced subgraph, i.e.
    # relative to the community only, not to the whole RO graph.
    subgraph = G_RO.subgraph(members)
    community_size = len(members)

    degree_centrality = nx.degree_centrality(subgraph)
    closeness_centrality = nx.closeness_centrality(subgraph)
    betweenness_centrality = nx.betweenness_centrality(subgraph)

    for node in members:
        genre_key = str(node)  # the genres JSON is keyed by the node id as a string
        if genre_key not in genres:
            print(f"Genres not found for node {node}")
            continue
        community_data[node] = {
            "node_id": node,
            "community_id": community_id,
            "community_size": community_size,
            "betweenness_centrality": betweenness_centrality.get(node, None),
            "degree_centrality": degree_centrality.get(node, None),
            "closeness_centrality": closeness_centrality.get(node, None),
            "genres": genres[genre_key],
        }

output_csv_file = "/content/drive/MyDrive/network_science/group_project_deezer/node_data_RO_v2.csv"
with open(output_csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(community_data.values())

print(f"Node data saved to {output_csv_file}.")


# Preprocess data for HR

In [None]:
#@title load data for HR
LOCALE = "HR"

# Read the HR edge list and genre mapping from the extracted data folder,
# then build a graph from the node_1/node_2 columns.
edges, genres = dataset_loader.load_data(locale=LOCALE, subfolder='deezer_clean_data')
G_HR = nx.from_pandas_edgelist(df=edges, source='node_1', target='node_2')

In [None]:
#@title find communities using louvain
from networkx.algorithms import community

# Louvain partition of the HR graph, run with a fixed seed (42).
louvain_partition_HR = community.louvain_communities(G_HR, seed=42)
communities = list(louvain_partition_HR)

In [None]:
#@title Precompute graph node details and save them to CSV
import csv

# Column order of the exported CSV. This must be an ordered sequence: with a
# set, the column order follows string hashing and can change between kernel
# runs, which makes the output files non-reproducible.
csv_columns = [
    "node_id",
    "community_id",
    "community_size",
    "betweenness_centrality",
    "degree_centrality",
    "closeness_centrality",
    "genres",
]

# node id -> row dict; nodes without a genre entry are reported and skipped.
community_data = {}
# The loop variable is deliberately not called `community`, so the
# `networkx.algorithms.community` module imported earlier is not shadowed.
for community_id, members in enumerate(communities):
    # Centralities are computed on the community's induced subgraph, i.e.
    # relative to the community only, not to the whole HR graph.
    subgraph = G_HR.subgraph(members)
    community_size = len(members)

    degree_centrality = nx.degree_centrality(subgraph)
    closeness_centrality = nx.closeness_centrality(subgraph)
    betweenness_centrality = nx.betweenness_centrality(subgraph)

    for node in members:
        genre_key = str(node)  # the genres JSON is keyed by the node id as a string
        if genre_key not in genres:
            print(f"Genres not found for node {node}")
            continue
        community_data[node] = {
            "node_id": node,
            "community_id": community_id,
            "community_size": community_size,
            "betweenness_centrality": betweenness_centrality.get(node, None),
            "degree_centrality": degree_centrality.get(node, None),
            "closeness_centrality": closeness_centrality.get(node, None),
            "genres": genres[genre_key],
        }

output_csv_file = "/content/drive/MyDrive/network_science/group_project_deezer/node_data_HR_v2.csv"
with open(output_csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(community_data.values())

print(f"Node data saved to {output_csv_file}.")


# Preprocess data for HU

In [None]:
#@title load data for HU
LOCALE = "HU"

# Read the HU edge list and genre mapping from the extracted data folder,
# then build a graph from the node_1/node_2 columns.
edges, genres = dataset_loader.load_data(locale=LOCALE, subfolder='deezer_clean_data')
G_HU = nx.from_pandas_edgelist(df=edges, source='node_1', target='node_2')

In [None]:
#@title find communities using louvain
from networkx.algorithms import community

# Louvain partition of the HU graph, run with a fixed seed (42).
louvain_partition_HU = community.louvain_communities(G_HU, seed=42)
communities = list(louvain_partition_HU)

In [None]:
#@title Precompute graph node details and save them to CSV
import csv

# Column order of the exported CSV. This must be an ordered sequence: with a
# set, the column order follows string hashing and can change between kernel
# runs, which makes the output files non-reproducible.
csv_columns = [
    "node_id",
    "community_id",
    "community_size",
    "betweenness_centrality",
    "degree_centrality",
    "closeness_centrality",
    "genres",
]

# node id -> row dict; nodes without a genre entry are reported and skipped.
community_data = {}
# The loop variable is deliberately not called `community`, so the
# `networkx.algorithms.community` module imported earlier is not shadowed.
for community_id, members in enumerate(communities):
    # Centralities are computed on the community's induced subgraph, i.e.
    # relative to the community only, not to the whole HU graph.
    subgraph = G_HU.subgraph(members)
    community_size = len(members)

    degree_centrality = nx.degree_centrality(subgraph)
    closeness_centrality = nx.closeness_centrality(subgraph)
    betweenness_centrality = nx.betweenness_centrality(subgraph)

    for node in members:
        genre_key = str(node)  # the genres JSON is keyed by the node id as a string
        if genre_key not in genres:
            print(f"Genres not found for node {node}")
            continue
        community_data[node] = {
            "node_id": node,
            "community_id": community_id,
            "community_size": community_size,
            "betweenness_centrality": betweenness_centrality.get(node, None),
            "degree_centrality": degree_centrality.get(node, None),
            "closeness_centrality": closeness_centrality.get(node, None),
            "genres": genres[genre_key],
        }

output_csv_file = "/content/drive/MyDrive/network_science/group_project_deezer/node_data_HU_v2.csv"
with open(output_csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(community_data.values())

print(f"Node data saved to {output_csv_file}.")
