In [1]:
from networkx.algorithms.components import connected_components
from scipy.stats import normaltest
from pickle import dump

def comp_sizes(graph, max_c_size=20):
    comps = []
    for c in connected_components(graph):
        comps.append(len(c))
    return comps

In [4]:
import sys
sys.path.append('../src')
from utils import parse_twitter, parse_6dfb
from tqdm import tqdm
import networkx

def parse_amazon(path='../datasets/amazon-meta.txt'):
    fhr = open(path, 'r', encoding='utf-8', errors='ignore')
    amazonProducts = {}
    (Id, ASIN, Copurchased) = ("", "", "")
    for line in tqdm(fhr):
        line = line.strip()
        # a product block started
        if(line.startswith("Id")):
            Id = line[3:].strip()
        elif(line.startswith("ASIN")):
            ASIN = line[5:].strip()
        elif(line.startswith("similar")):
            ls = line.split()
            Copurchased = ' '.join([c for c in ls[2:]])
        elif (line==""):
            try:
                MetaData = {}
                if (ASIN != ""):
                    amazonProducts[ASIN]=MetaData
                MetaData['Copurchased'] = Copurchased
            except NameError:
                continue
            (Id, ASIN, Copurchased) = ("", "", "")
    fhr.close()
    for asin, metadata in tqdm(amazonProducts.items()): 
        amazonProducts[asin]['Copurchased'] = \
        ' '.join([cp for cp in metadata['Copurchased'].split() \
            if cp in amazonProducts.keys()])
    copurchaseGraph = networkx.Graph()
    for asin,metadata in tqdm(amazonProducts.items()):
        copurchaseGraph.add_node(asin)
        for a in metadata['Copurchased'].split():
            copurchaseGraph.add_node(a.strip())
            similarity = 0        
            copurchaseGraph.add_edge(asin, a.strip())
    return copurchaseGraph

In [5]:
cs_amazon = comp_sizes(parse_amazon())

15010574it [00:46, 321383.37it/s]
100%|██████████| 548552/548552 [00:04<00:00, 111049.93it/s]
100%|██████████| 548552/548552 [00:16<00:00, 32430.54it/s]


In [6]:
cs_twitter = comp_sizes(parse_twitter())

In [7]:
cs_6dfb = comp_sizes(parse_6dfb())

In [8]:
cs_list = [cs_amazon, cs_twitter, cs_6dfb]
for i in cs_list:
    print(len(i))

187565
22878
2792


In [12]:
with open("../tmp_files/cmp_sizes.bin", "wb") as f:
    dump(cs_amazon, f)
    dump(cs_6dfb, f)
    dump(cs_twitter, f)

In [10]:
for i in cs_list:  
    print(normaltest(i))

NormaltestResult(statistic=1132042.4563803389, pvalue=0.0)
NormaltestResult(statistic=103203.36699831608, pvalue=0.0)
NormaltestResult(statistic=9048.345824934895, pvalue=0.0)


In [11]:
import numpy as np
for i in cs_list:
    i = np.array(i)
    i = (i - i.mean()) / i.std()
    print(normaltest(i))

NormaltestResult(statistic=1132042.4563803384, pvalue=0.0)
NormaltestResult(statistic=103203.36699831608, pvalue=0.0)
NormaltestResult(statistic=9048.345824934897, pvalue=0.0)
