In [1]:
import tarfile
import os
import pandas as pd
import networkx as nx
from networkx.algorithms import community
from collections import Counter
import matplotlib.pyplot as plt
import requests
import json

In [2]:
class DatasetLoader:
  @staticmethod
  def download_dataset(url, output_path):
      response = requests.get(url, stream=True)
      if response.status_code == 200:
          with open(output_path, 'wb') as f:
              for chunk in response.iter_content(chunk_size=1024):
                  f.write(chunk)
          print(f"Dataset downloaded successfully to {output_path}")
      else:
          raise Exception(f"Failed to download dataset. Status code: {response.status_code}")
  @staticmethod
  def extract_dataset(file_path, extract_dir):
      with tarfile.open(file_path) as tar:
          tar.extractall(path=extract_dir)
      print(f"Dataset extracted to {extract_dir}")
      return extract_dir

  @staticmethod
  def load_data(data_dir, locale:str): # for locale - HR -> for Croatia, HU for Hungary, RO for Romania
      edges_file = os.path.join(data_dir, locale+'_edges.csv')  # Assuming dataset contains edges.csv
      genres_file = os.path.join(data_dir, locale+'_genres.json')  # Assuming dataset contains genres.csv

      edges = pd.read_csv(edges_file)
      with open(genres_file, 'r') as f:
        genres = json.load(f)
      return edges, genres

In [None]:
import google.colab.drive
google.colab.drive.mount('/content/drive')

In [6]:
url = "https://snap.stanford.edu/data/gemsec_deezer_dataset.tar.gz"  # Dataset URL
local_tar_path = "/content/drive/MyDrive/deezer_dataset.tar.gz"
extract_dir = "/content/drive/MyDrive/deezer_data/deezer_clean_data"

DatasetLoader.download_dataset(url, local_tar_path)
data_dir = DatasetLoader.extract_dataset(local_tar_path, extract_dir)

Dataset downloaded successfully to /content/drive/MyDrive/deezer_dataset.tar.gz
Dataset extracted to /content/drive/MyDrive/deezer_data/deezer_clean_data


In [27]:
# Network building

class NetworkBuilder:
  def __init__(self, edges, genres):
    self.edges = edges
    self.genres = genres

  # Build Network
  def build_network(self):
      # Create a Graph
      # Check if expected columns exist
      if 'user1' not in self.edges.columns or 'user2' not in self.edges.columns:
          # Rename columns if necessary
          self.edges.rename(columns={self.edges.columns[0]: 'user1', self.edges.columns[1]: 'user2'}, inplace=True)
          print(f"Renamed columns to: {self.edges.columns}")  # Confirm renaming

      G = nx.Graph()

      for _, row in edges.iterrows():
        G.add_edge(row['user1'], row['user2'])
      nx.set_node_attributes(G, self.genres, 'genre')

      return G

  def analyze_communities(self, partition):
      """
      Analyze the detected communities.

      Args:
          partition (dict): Community assignment for each node.

      Returns:
          pd.DataFrame: Summary of community sizes.
      """
      community_sizes = pd.Series(partition).value_counts()
      community_summary = pd.DataFrame({
          'Community': community_sizes.index,
          'Size': community_sizes.values
      })
      return community_summary

  def visualize_communities(self, G, partition, title="Community Structure"):
      """
      Visualize the detected communities on the network.

      Args:
          G (networkx.Graph): The network graph.
          partition (dict): Community assignment for each node.
          title (str): Title for the plot.
      """
      pos = nx.spring_layout(G)
      cmap = plt.cm.get_cmap('viridis', max(partition.values()) + 1)
      nx.draw_networkx_nodes(G, pos, node_color=[partition[node] for node in G.nodes()], cmap=cmap, node_size=50)
      nx.draw_networkx_edges(G, pos, alpha=0.5)
      plt.title(title)
      plt.show()

  # Community Detection
  def detect_communities(self, G):
      return list(community.louvain_communities(G))


In [28]:
# HR
edges, genres = DatasetLoader.load_data(data_dir,'HR')

builder = NetworkBuilder(edges, genres)
graph = builder.build_network()

Renamed columns to: Index(['user1', 'user2'], dtype='object')


In [29]:
# inspection of the graph
display(graph)

<networkx.classes.graph.Graph at 0x7e75d1e07160>

In [30]:
edges.head()

Unnamed: 0,user1,user2
0,0,4076
1,0,29861
2,0,53717
3,0,23820
4,0,39945


In [31]:
display(genres)

{'13357': ['Pop'],
 '11542': ['Indie Rock',
  'Indie Pop/Folk',
  'International Pop',
  'Rap/Hip Hop',
  'Pop',
  'Rock',
  'Indie Pop',
  'Alternative'],
 '11543': ['Dance', 'Pop', 'Rock'],
 '11540': ['International Pop', 'Jazz', 'Pop'],
 '11541': ['Rap/Hip Hop'],
 '11546': ['Dance',
  'Pop',
  'Techno/House',
  'International Pop',
  'Electro',
  'Singer & Songwriter'],
 '11547': ['Dance', 'Alternative', 'Indie Rock', 'Pop', 'Rock'],
 '11544': ['Films/Games',
  'Dance',
  'Rap/Hip Hop',
  'Contemporary R&B',
  'Pop',
  'International Pop',
  'R&B',
  'Rock',
  'Film Scores'],
 '11545': ['Pop'],
 '11548': ['Reggae', 'Rap/Hip Hop', 'Pop', 'Rock', 'Alternative', 'Folk'],
 '11549': ['Dance', 'Electro', 'Pop'],
 '5988': ['Pop', 'Rock'],
 '5989': ['Dance',
  'Rap/Hip Hop',
  'Pop',
  'Disco',
  'International Pop',
  'Rock',
  'Electro'],
 '19399': ['Folk', 'Pop'],
 '19398': ['Dance', 'Electro', 'Pop', 'Rock'],
 '5982': ['Pop'],
 '5983': ['Indie Pop/Folk', 'Dance', 'Rap/Hip Hop', 'Pop', '

In [32]:
def calculate_centrality_and_influence(G):
    # Degree centrality
    degree_centrality = nx.degree_centrality(G)

    # Betweenness centrality
    betweenness_centrality = nx.betweenness_centrality(G)

    # Collect data for analysis
    centrality_data = []
    for node in G.nodes(data=True):
        genre = node[1].get('genre', 'Unknown')
        centrality_data.append({
            'Node': node[0],
            'Genre': genre,
            'DegreeCentrality': degree_centrality[node[0]],
            'BetweennessCentrality': betweenness_centrality[node[0]]
        })

    # Create a DataFrame
    centrality_df = pd.DataFrame(centrality_data)

    # Group by genre to summarize influence
    genre_influence = centrality_df.groupby('Genre').mean().sort_values(by='DegreeCentrality', ascending=False)
    print("Genre Influence Summary:")
    print(genre_influence)

    return centrality_df, genre_influence

def visualize_centrality(G, centrality_df, centrality_metric='DegreeCentrality', title="Centrality Visualization"):
    pos = nx.spring_layout(G)
    node_sizes = centrality_df.set_index('Node')[centrality_metric] * 1000

    nx.draw(G, pos,
            node_size=[node_sizes.get(node, 10) for node in G.nodes()],
            with_labels=False, alpha=0.7)
    plt.title(title)
    plt.show()

In [35]:
locale = "HR"
partition = builder.detect_communities(graph)
community_summary = builder.analyze_communities(partition)
print(f"Community analysis for {locale}:")
print(community_summary)


Community analysis for HR:
                                            Community  Size
0   {49157, 16390, 32775, 16394, 49163, 32781, 327...     1
1   {16386, 12300, 12302, 32787, 4119, 32792, 4508...     1
2   {40193, 15009, 34407, 52394, 38445, 26641, 188...     1
3   {49155, 16389, 16396, 4117, 53274, 12315, 2870...     1
4   {49152, 32768, 24580, 36870, 32785, 12305, 19,...     1
5   {40960, 16384, 3, 7, 8202, 10, 8204, 40973, 40...     1
6   {18433, 8194, 2049, 36868, 6146, 26630, 12301,...     1
7   {8193, 14347, 36886, 36887, 8214, 28699, 38943...     1
8   {51203, 29700, 30727, 27657, 43024, 46102, 399...     1
9   {32770, 36867, 24582, 24583, 24584, 4105, 3688...     1
10  {16391, 49160, 40971, 49165, 24592, 40978, 491...     1
11                          {38022, 6748, 4429, 5102}     1
12  {5, 49158, 8, 16395, 12, 16397, 24599, 8221, 1...     1
13  {49156, 32777, 8201, 24589, 15, 40977, 40980, ...     1
14  {4097, 12290, 4, 20486, 16393, 36873, 11, 3277...     1
15  {0, 8195,

In [36]:
output_path = f"{locale}_community_analysis.csv"
community_summary.to_csv(output_path, index=False)
print(f"Community analysis saved to {output_path}")

Community analysis saved to HR_community_analysis.csv


In [None]:
# centrality for Hungary
centrality_df, genre_influence = calculate_centrality_and_influence(graph)
centrality_output_path = f"{locale}_centrality_analysis.csv"
centrality_df.to_csv(centrality_output_path, index=False)
print(f"Centrality analysis saved to {centrality_output_path}")

visualize_centrality(graph, centrality_df, title=f"{locale} Centrality Visualization")