In [2]:
from lib import *
from tqdm.autonotebook import tqdm

# reload imported files automatically without restarting the kernel
%load_ext autoreload
%autoreload 2

## 1. get all datasets infos from networks.skewed.d

### 1.1.define base values

In [79]:
BASE_URL = "https://networks.skewed.de/net/"

NetworkProperties = namedtuple(
    "NetworkProperties",
    [
        "group",
        "name",
        "nodes",
        "edges",
        "avg_degree",
        "std_dev_degree",
        "largest_eigenvalue_hashimoto",
        "random_walk_mixing_time",
        "degree_assortativity",
        "global_clustering",
        "diameter",
        "relative_size_of_largest_component",
        "kind",
        "mode",
        "node_properties",
        "edge_properties",
        "gml_file_size",
        
    ],
)  # create tuple to hold network properties

### 1.2.get column-names, define new column names and replace old columns names

In [120]:
#get column names 
df = pd.read_html("https://networks.skewed.de/")[
    0
]  

#define new column names
column_names = [
    "name",
    "title",
    "nodes",
    "edges",
    "avg_degree",
    "std_dev_degree",
    "largest_eigenvalue_hashimoto",
    "random walk mixing time",
    "degree assortativity",
    "global clustering",
    "diameter",
    "relative_size_of_largest_component",
    "kind",
    "mode",
    "number_of_networks",
    "tags"
]

#replace column names
df.columns = column_names  # set column names

### 1.3. Create and use filter for online, offline  networks

#### 1.3.1. create social, online, offline, unweighted, undirected filter 

In [82]:
social_networks = df["tags"].str.contains("Social")
unweighted = df["tags"].str.contains("Unweighted")
undirected = df["kind"] == "Undirected"
online = df["tags"].str.contains("Online")
offline = df["tags"].str.contains("Offline")

online_social_filter = (
    online & social_networks & unweighted & undirected
)  # filter for desired online social networks
offline_social_filter = (
    offline & social_networks & unweighted & undirected
)  # filter for desired offline social networks

#### 1.3.2. use online_social and offline_social filter, define name list for online_social, offline_social

In [83]:
#get online social netwroks, by applying online social networks filter and define name list
online_social = df[online_social_filter]
online_network_group_names = online_social["name"].to_numpy()

#get offline social netwroks, by applying offline social networks filter and define name list
offline_social = df[offline_social_filter]
offline_network_group_names = offline_social["name"].to_numpy()
print("Number of offline social network groups:", len(offline_social))

Number of offline social network groups: 17


### 1.4. download online and offline social network data

##### 1.4.1. define function to get data

In [84]:
def get_network_data(network_group_names: list) -> pd.DataFrame:
    """
    Get info of all networks contained in the network groups.
    """
    result_tuples = []

    for network_group in network_group_names:
        URL = BASE_URL + network_group
        df = pd.read_html(URL)[0]
        for i, row in df.iterrows():
            curr_result = NetworkProperties(
                #replace the current column names with the new ones
                group=network_group,
                name=row["Name"],
                nodes=row["Nodes"],
                edges=row["Edges"],
                avg_degree=row["$\\left<k\\right>$"],
                std_dev_degree=row["$\\sigma_k$"],
                largest_eigenvalue_hashimoto=row["$\\lambda_h$"],
                random_walk_mixing_time=row["$\\tau$"],
                degree_assortativity=row["$r$"],
                global_clustering=row["$c$"],
                diameter=row["$\\oslash$"],
                relative_size_of_largest_component=row["$S$"],
                kind=row["Kind"],
                mode=row["Mode"],
                node_properties=row["NPs"],
                edge_properties=row["EPs"],
                gml_file_size=row["GML"],
                
            )
            result_tuples.append(curr_result)

    return pd.DataFrame(result_tuples)

##### 1.4.1. apply get data function, get all online and all offline netwroks

In [103]:
all_offline_networks = get_network_data(offline_network_group_names)
all_online_networks = get_network_data(online_network_group_names)

### 1.5. Create column for networks size category and create sub datasets, define name list for small,medium,large online/offline networks, get data

#### 1.5.1. create small, medium, large network size category and add collumn 

In [104]:
#function to add column for node size category

def add_q_cut_column(df: pd.DataFrame, column: str, q: int, labels) -> pd.DataFrame:
    """
    Add a column to the dataframe with the quantile cut of the column.
    """
    df[column + "_qcut"] = pd.qcut(df[column], q=q, labels=labels)
    return df

online_social['nodes_qcut']="0"
offline_social['nodes_qcut']="0"

add_q_cut_column(online_social, "nodes", 3, ["small", "medium", "large"])
add_q_cut_column(offline_social, "nodes", 3, ["small", "medium", "large"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  online_social['nodes_qcut']="0"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offline_social['nodes_qcut']="0"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column + "_qcut"] = pd.qcut(df[column], q=q, labels=labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

Unnamed: 0,name,title,nodes,edges,avg_degree,std_dev_degree,largest_eigenvalue_hashimoto,random walk mixing time,degree assortativity,global clustering,diameter,relative_size_of_largest_component,kind,mode,number_of_networks,tags,nodes_qcut
41,contact,Haggle human proximity network (2006),274,28244,206.16,460.93,50.52,2.49,-0.05,0.9,4,1.0,Undirected,Unipartite,1,Social Offline Unweighted Timestamps,medium
43,copenhagen,Copenhagen Networks Study,800,6429,16.07,13.38,30.19,8.18,0.18,0.24,7,1.0,Undirected,Unipartite,4,Social Offline Unweighted Weighted Timesta...,large
46,crime,Rosenfeld crime network (1991),1380,1476,2.14,2.14,3.28,1385.5,-0.17,0.0,32,0.92,Undirected,Bipartite,1,Social Offline Unweighted Metadata,large
75,dutch_criticism,Dutch literacy criticism (1976),35,81,4.63,2.97,5.57,5.29,0.04,0.16,7,1.0,Undirected,Unipartite,1,Social Offline Unweighted,small
146,karate,Zachary Karate Club,34,77,4.53,3.75,5.25,6.98,-0.47,0.26,5,1.0,Undirected,Unipartite,2,Social Offline Unweighted,small
184,november17,November17 members (2009),22,66,6.0,3.58,6.78,2.67,-0.25,0.53,4,1.0,Undirected,Unipartite,1,Social Offline Unweighted Metadata,small
205,reality_mining,Reality mining proximity network (2004),96,1086404,22633.42,21814.46,58.67,19.34,0.47,0.84,3,1.0,Undirected,Unipartite,1,Social Offline Unweighted Timestamps,medium
220,sp_colocation,Social co-locations (2018),403,1417485,7034.67,4468.01,370.8,0.89,0.11,0.97,2,1.0,Undirected,Unipartite,6,Social Offline Unweighted Weighted Tempora...,large
221,sp_high_school,High school temporal contacts (2013),329,188508,1145.95,879.5,40.12,88.29,0.47,0.61,4,0.99,Undirected,Unipartite,4,Social Offline Unweighted Weighted Tempora...,large
222,sp_high_school_new,High school dynamic contacts (2011-2012),180,45047,500.52,491.43,29.01,45.58,0.57,0.59,4,1.0,Undirected,Unipartite,2,Social Offline Unweighted Temporal Metadata,medium


#### 1.5.2. create small, medium, large filter  

In [105]:
small_online2 = online_social["nodes_qcut"].str.contains("small")
medium_online2 = online_social["nodes_qcut"].str.contains("medium")
large_online2= online_social["nodes_qcut"].str.contains("large")


small_offline2 = offline_social["nodes_qcut"].str.contains("small")
medium_offline2 = offline_social["nodes_qcut"].str.contains("medium")
large_offline2 = offline_social["nodes_qcut"].str.contains("large")

online_small_social_filter = (
    small_online2 & online & social_networks & unweighted & undirected
)  # filter for desired small online social networks

offline_small_social_filter = (
    small_offline2 & offline & social_networks & unweighted & undirected
)  # filter for desired small offline social networks

online_medium_social_filter = (
    medium_online2 & online & social_networks & unweighted & undirected
)  # filter for desired medium online social networks

offline_medium_social_filter = (
    medium_offline2 & offline & social_networks & unweighted & undirected
)  # filter for desired medium offline social networks

online_large_social_filter = (
    large_online2 & online & social_networks & unweighted & undirected
)  # filter for desired large online social networks

offline_large_social_filter = (
    large_offline2 & offline & social_networks & unweighted & undirected
)  # filter for desired large offline social networks

#### 1.5.3. apply small, medium large filter

Unnamed: 0,name,title,nodes,edges,avg_degree,std_dev_degree,largest_eigenvalue_hashimoto,random walk mixing time,degree assortativity,global clustering,diameter,relative_size_of_largest_component,kind,mode,number_of_networks,tags,nodes_qcut
96,facebook_friends,Maier Facebook friends (2014),362,1988,10.98,10.83,23.3,78.68,0.1,0.51,9,0.91,Undirected,Unipartite,1,Social Online Unweighted Metadata,0
97,facebook_organizations,Within-organization Facebook friendships (2013),5793,45266,15.63,30.36,56.27,2931.02,0.18,0.31,16,1.0,Undirected,Unipartite,6,Social Online Unweighted Metadata,0
114,foursquare,Foursquare NYC restaurants (2012),6410,10377,3.24,5.67,9.47,92.11,-0.04,0.0,19,0.84,Undirected,Bipartite,2,Social Online Unweighted Metadata,0
157,livemocha,Livemocha friendship network (2010),104103,2193083,42.13,109.68,272.53,9.08,-0.15,0.01,6,1.0,Undirected,Unipartite,1,Social Online Unweighted,0
166,marker_cafe,The Marker Cafe (2011),69413,1644849,47.39,176.63,465.81,30.71,-0.15,0.05,9,1.0,Undirected,Unipartite,1,Social Online Unweighted,0


In [138]:
small_online = df[online_small_social_filter]
small_online_network_group_names = small_online["name"].to_numpy()

small_offline = df[offline_small_social_filter]
small_offline_network_group_names = small_offline["name"].to_numpy()

medium_online = df[online_medium_social_filter]
medium_online_network_group_names = medium_online["name"].to_numpy()

medium_offline = df[offline_medium_social_filter]
medium_offline_network_group_names = medium_offline["name"].to_numpy()

large_online = df[online_large_social_filter]
large_online_network_group_names = large_online["name"].to_numpy()

large_offline = df[offline_large_social_filter]
large_offline_network_group_names = large_offline["name"].to_numpy()

Unnamed: 0,Total,Small,Medium,Large
Online,15,5,5,5
Offline,17,6,5,6


#### 1.5.3. get small, medium large online, offline data

In [109]:
small_online_networks = get_network_data(small_online_network_group_names)
medium_online_networks = get_network_data(medium_online_network_group_names)
large_online_networks = get_network_data(large_online_network_group_names)

small_offline_networks = get_network_data(small_offline_network_group_names)
medium_offline_networks = get_network_data(medium_offline_network_group_names)
large_offline_networks = get_network_data(large_offline_network_group_names)

### 1.6. save to csv

In [110]:
# save to file
all_offline_networks.to_csv(DATA_DIR_PATH / "offline_social_networks.csv", index=False)
all_online_networks.to_csv(DATA_DIR_PATH / "online_social_networks.csv", index=False)

small_online_networks.to_csv(DATA_DIR_PATH / "small_online_social_networks.csv", index=False)
medium_online_networks.to_csv(DATA_DIR_PATH / "medium_online_social_networks.csv", index=False)
large_online_networks.to_csv(DATA_DIR_PATH / "large_online_social_networks.csv", index=False)

small_offline_networks.to_csv(DATA_DIR_PATH / "small_offline_social_networks.csv", index=False)
medium_offline_networks.to_csv(DATA_DIR_PATH / "medium_offline_social_networks.csv", index=False)
large_offline_networks.to_csv(DATA_DIR_PATH / "large_offline_social_networks.csv", index=False)


## 2. download data

## 2.1. download offline data

### 2.1.0 load small,medium,large online, offline dataset names

In [96]:
small_offline_networks = pd.read_csv(DATA_DIR_PATH / "small_offline_social_networks.csv")
small_offline_networks['nodes_qcut']='small'

medium_offline_networks = pd.read_csv(DATA_DIR_PATH / "medium_offline_social_networks.csv")
medium_offline_networks['nodes_qcut']='medium'

#no data here
#large_offline_networks = pd.read_csv(DATA_DIR_PATH / "large_offline_social_networks.csv")
#large_offline_networks['nodes_qcut']='large'

small_online_networks = pd.read_csv(DATA_DIR_PATH / "small_online_social_networks.csv")
small_online_networks['nodes_qcut']='small'

medium_online_networks = pd.read_csv(DATA_DIR_PATH / "medium_online_social_networks.csv")
medium_online_networks['nodes_qcut']='medium'

large_online_networks = pd.read_csv(DATA_DIR_PATH / "large_online_social_networks.csv")
large_online_networks['nodes_qcut']='large'

### 2.1.1 download small offline data

In [145]:
# download all small offline networks
small_offline_networks = pd.read_csv(DATA_DIR_PATH / "small_offline_social_networks.csv")
small_offline_networks['nodes_qcut']='small'

progress = tqdm(unit="networks", total=len(small_offline_networks))
for i, row in small_offline_networks.iterrows():
    size = row["nodes_qcut"]
    url = BASE_URL + row["group"] + "/files/" + row["name"] + ".gml.zst"
    groupname = Path(get_group_name_from_url(url))
    filename = Path(get_filename_from_url(url))
    local_dir = DATA_DIR_PATH / "offline" / size / groupname
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = remove_file_suffix(str(local_dir / filename))
    download_and_extract(url, local_path)
    progress.update(1)

  0%|          | 0/7 [00:00<?, ?networks/s]

### 2.1.2 download medium offline data

In [150]:
# download all medium offline networks
medium_offline_networks = pd.read_csv(DATA_DIR_PATH / "medium_offline_social_networks.csv")
medium_offline_networks['nodes_qcut']='medium'

progress = tqdm(unit="networks", total=len(medium_offline_networks))
for i, row in medium_offline_networks.iterrows():
    size = row["nodes_qcut"]
    url = BASE_URL + row["group"] + "/files/" + row["name"] + ".gml.zst"
    groupname = Path(get_group_name_from_url(url))
    filename = Path(get_filename_from_url(url))
    local_dir = DATA_DIR_PATH / "offline" / size / groupname
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = remove_file_suffix(str(local_dir / filename))
    download_and_extract(url, local_path)
    progress.update(1)

  0%|          | 0/7 [00:00<?, ?networks/s]

### 2.1.3 download large offline data

In [None]:
# download all medium offline networks
large_offline_networks = pd.read_csv(DATA_DIR_PATH / "large_offline_social_networks.csv")
large_offline_networks['nodes_qcut']='large'

progress = tqdm(unit="networks", total=len(large_offline_networks))
for i, row in large_offline_networks.iterrows():
    size = row["nodes_qcut"]
    url = BASE_URL + row["group"] + "/files/" + row["name"] + ".gml.zst"
    groupname = Path(get_group_name_from_url(url))
    filename = Path(get_filename_from_url(url))
    local_dir = DATA_DIR_PATH / "offline" / size / groupname
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = remove_file_suffix(str(local_dir / filename))
    download_and_extract(url, local_path)
    progress.update(1)

## 2.2. download online data

### 2.2.1 download small online data

In [146]:
# download all small online networks
small_online_networks = pd.read_csv(DATA_DIR_PATH / "small_online_social_networks.csv")
small_online_networks['nodes_qcut']='small'

progress = tqdm(unit="networks", total=len(small_online_networks))
for i, row in small_online_networks.iterrows():
    size = row["nodes_qcut"]
    url = BASE_URL + row["group"] + "/files/" + row["name"] + ".gml.zst"
    groupname = Path(get_group_name_from_url(url))
    filename = Path(get_filename_from_url(url))
    local_dir = DATA_DIR_PATH / "online" / size / groupname
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = remove_file_suffix(str(local_dir / filename))
    download_and_extract(url, local_path)
    progress.update(1)

  0%|          | 0/11 [00:00<?, ?networks/s]

### 2.2.2 download medium online data

In [None]:
# download all medium online networks
medium_online_networks = pd.read_csv(DATA_DIR_PATH / "medium_online_social_networks.csv")
medium_online_networks['nodes_qcut']='medium'

progress = tqdm(unit="networks", total=len(medium_online_networks))
for i, row in medium_online_networks.iterrows():
    size = row["nodes_qcut"]
    url = BASE_URL + row["group"] + "/files/" + row["name"] + ".gml.zst"
    groupname = Path(get_group_name_from_url(url))
    filename = Path(get_filename_from_url(url))
    local_dir = DATA_DIR_PATH / "online" / size / groupname
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = remove_file_suffix(str(local_dir / filename))
    download_and_extract(url, local_path)
    progress.update(1)

### 2.2.3 download large online data

In [None]:
# download all large online networks
large_online_networks = pd.read_csv(DATA_DIR_PATH / "large_online_social_networks.csv")
large_online_networks['nodes_qcut']='large'

progress = tqdm(unit="networks", total=len(large_online_networks))
for i, row in large_online_networks.iterrows():
    size = row["nodes_qcut"]
    url = BASE_URL + row["group"] + "/files/" + row["name"] + ".gml.zst"
    groupname = Path(get_group_name_from_url(url))
    filename = Path(get_filename_from_url(url))
    local_dir = DATA_DIR_PATH / "online" / size / groupname
    local_dir.mkdir(parents=True, exist_ok=True)
    local_path = remove_file_suffix(str(local_dir / filename))
    download_and_extract(url, local_path)
    progress.update(1)

## 3. Number of offline small, offline medium, offline large and online small, online medium, online large social networks

In [141]:
number_of_networks_group = [[0,0,0,0],[0,0,0,0]]
classes=["Online","Offline"]
number_of_networks_group[0]=[len(online_social),len(small_online_networks),len(medium_online_networks),len(large_online_networks)]
number_of_networks_group[1]=[len(offline_social),len(small_offline_networks),len(medium_offline_networks),len(large_offline_networks)]
pd.DataFrame(number_of_networks_group, classes,columns=["Total","Small", "Medium", "Large"])

Unnamed: 0,Total,Small,Medium,Large
Online,15,11,148,11
Offline,17,7,7,50
