In [None]:
import tarfile
import os
import pandas as pd
import networkx as nx
from networkx.algorithms import community
from collections import Counter
import matplotlib.pyplot as plt
import requests
import json

In [None]:
class DatasetLoader:
    """Static helpers to download, unpack and load the Deezer dataset files."""

    @staticmethod
    def download_dataset(url, output_path, timeout=60, chunk_size=1024 * 1024):
        """Stream the file at ``url`` to ``output_path``.

        Args:
            url: HTTP(S) address of the archive.
            output_path: Destination file path; overwritten if it exists.
            timeout: Seconds to wait for the server (connect / between reads)
                before giving up, so a stalled connection cannot hang the cell.
            chunk_size: Number of bytes requested per streamed chunk.

        Raises:
            Exception: If the server answers with a status code other than 200.
        """
        # Download into a sibling ".part" file and move it into place only once
        # it is complete, so an interrupted transfer never leaves a truncated
        # archive at ``output_path``.
        partial_path = f"{output_path}.part"
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                raise Exception(f"Failed to download dataset. Status code: {response.status_code}")
            try:
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                os.replace(partial_path, output_path)
            except BaseException:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
        print(f"Dataset downloaded successfully to {output_path}")

    @staticmethod
    def extract_dataset(file_path, extract_dir):
        """Unpack the tar archive ``file_path`` into ``extract_dir``.

        Returns:
            ``extract_dir``, unchanged, for convenient chaining.
        """
        with tarfile.open(file_path) as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter rejects members that would land outside
                # ``extract_dir`` (absolute paths, "..", unsafe links).
                tar.extractall(path=extract_dir, filter="data")
            else:
                # Older Python without extraction filters: legacy behaviour.
                tar.extractall(path=extract_dir)
        print(f"Dataset extracted to {extract_dir}")
        return extract_dir

    @staticmethod
    def load_data(data_dir, locale: str):
        """Load the edge list and genre mapping for one locale.

        Args:
            data_dir: Directory containing ``<locale>_edges.csv`` and
                ``<locale>_genres.json``.
            locale: File-name prefix: ``HR`` for Croatia, ``HU`` for Hungary,
                ``RO`` for Romania.

        Returns:
            ``(edges, genres)``: the CSV as a pandas DataFrame and the parsed
            JSON document.
        """
        edges_file = os.path.join(data_dir, locale + '_edges.csv')
        genres_file = os.path.join(data_dir, locale + '_genres.json')

        edges = pd.read_csv(edges_file)
        # Read the JSON as UTF-8 explicitly rather than relying on the
        # platform's default encoding.
        with open(genres_file, 'r', encoding='utf-8') as f:
            genres = json.load(f)
        return edges, genres

In [None]:
# Mount Google Drive into the runtime at /content/drive. This needs the
# `google.colab` package, so the cell is only expected to run inside Colab.
import google.colab.drive
google.colab.drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Where the dataset archive comes from and where it is stored on Google Drive.
url = "https://snap.stanford.edu/data/gemsec_deezer_dataset.tar.gz"  # Dataset URL
local_tar_path = "/content/drive/MyDrive/deezer_dataset.tar.gz"  # Replace with your preferred Colab path
extract_dir = "/content/drive/MyDrive/deezer_data/deezer_clean_data"

# Reuse an archive that is already on Drive so a Restart & Run All does not
# download the whole dataset again. Delete the file to force a fresh download.
if os.path.isfile(local_tar_path) and tarfile.is_tarfile(local_tar_path):
    print(f"Using existing dataset archive at {local_tar_path}")
else:
    DatasetLoader.download_dataset(url, local_tar_path)
data_dir = DatasetLoader.extract_dataset(local_tar_path, extract_dir)

Dataset downloaded successfully to /content/drive/MyDrive/deezer_dataset.tar.gz
Dataset extracted to /content/drive/MyDrive/deezer_data/deezer_clean_data


In [None]:
# Network building

class NetworkBuilder:
    """Builds an undirected graph from an edge table and attaches genre data."""

    def __init__(self, edges, genres):
        """Store the inputs.

        Args:
            edges: pandas DataFrame whose first two columns hold the two
                endpoints of each edge.
            genres: Mapping from node id to that node's genre data (or a single
                value to assign to every node).
        """
        self.edges = edges
        self.genres = genres

    def _genres_by_node(self, G):
        """Return ``self.genres`` re-keyed so its keys match the nodes of ``G``.

        Keys loaded from JSON are always strings, while node ids read from a
        CSV are typically integers; ``nx.set_node_attributes`` silently skips
        keys that are not nodes, so mismatched types would attach nothing.
        """
        if not hasattr(self.genres, 'items'):
            # Not a mapping: networkx applies a constant value to every node.
            return self.genres

        aligned = {}
        for key, value in self.genres.items():
            candidates = [key]
            try:
                candidates.append(int(key))
            except (TypeError, ValueError):
                pass
            candidates.append(str(key))
            for candidate in candidates:
                if candidate in G:
                    aligned[candidate] = value
                    break
        return aligned

    # Build Network
    def build_network(self):
        """Create the graph from ``self.edges`` and set a 'genre' attribute on nodes.

        If the edge table lacks 'user1'/'user2' columns, its first two columns
        are renamed to those names. The rename is done in place, so the
        DataFrame object passed to the constructor is modified as well.

        Returns:
            The constructed ``nx.Graph``.
        """
        # Check if expected columns exist
        if 'user1' not in self.edges.columns or 'user2' not in self.edges.columns:
            # Rename columns if necessary
            self.edges.rename(columns={self.edges.columns[0]: 'user1', self.edges.columns[1]: 'user2'}, inplace=True)
            print(f"Renamed columns to: {self.edges.columns}")  # Confirm renaming

        G = nx.Graph()

        # Use the instance's own edge table (not a same-named global) and add
        # all edges in one call instead of iterating row by row.
        sources = self.edges['user1'].tolist()
        targets = self.edges['user2'].tolist()
        G.add_edges_from(zip(sources, targets))

        nx.set_node_attributes(G, self._genres_by_node(G), 'genre')

        return G

    # Community Detection
    def detect_communities(self, G, seed=None):
        """Partition ``G`` with ``community.louvain_communities``.

        Args:
            G: Graph to partition.
            seed: Optional random seed forwarded to the algorithm; pass a fixed
                value to get the same partition on every run.

        Returns:
            A list with one entry per detected community.
        """
        return list(community.louvain_communities(G, seed=seed))


In [None]:
# HR
# Load the Croatian (HR) edge table and genre mapping from the extracted
# dataset directory, then build the graph from them.
edges, genres = DatasetLoader.load_data(data_dir,'HR')

builder = NetworkBuilder(edges, genres)
graph = builder.build_network()

Renamed columns to: Index(['user1', 'user2'], dtype='object')
