In [1]:
import os
import json
import matplotlib.pyplot as plt
from scipy import stats
import pickle

from matplotlib import rcParams
import numpy as np
import seaborn as sns
import pandas as pd
from tqdm import tqdm

rcParams["font.family"] = "serif"
rcParams["grid.linestyle"] = ':'
rcParams["xtick.direction"] = 'in'
rcParams["ytick.direction"] = 'in'
rcParams["legend.fontsize"] = 9
rcParams["axes.labelsize"] = 20
rcParams["axes.titlesize"] = 20
rcParams["xtick.labelsize"] = 15
rcParams["ytick.labelsize"] = 15

In [2]:
search_counts_dir = '../search_counts'

In [3]:
image_search_counts = [os.path.join(search_counts_dir, x) for x in os.listdir(search_counts_dir) if '0.7_' in x and 'integrated' not in x]

text_search_counts = [os.path.join(search_counts_dir, x) for x in os.listdir(search_counts_dir) if 'lemmatized' in x and 'integrated' not in x]

integrated_search_counts = [os.path.join(search_counts_dir, x) for x in os.listdir(search_counts_dir) if '0.7_' in x and 'lemmatized' in x and 'integrated' in x]

In [42]:
cc3m_relevant_count_files = [x for x in integrated_search_counts if 'cc3m' in x]
data = []
processed_concepts = set()
cc3m_counts = []

for rfile in sorted(cc3m_relevant_count_files):
    dataset = rfile.split('/')[-1].split('_')[0]
    with open(rfile) as f:
        concepts = json.load(f)
        for concept, count in concepts.items():
            if concept not in processed_concepts:
                data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})
                processed_concepts.add(concept)
                cc3m_counts.append(count)

cc3m_df = pd.DataFrame(data)
print(len(cc3m_df))

3775


In [39]:
cc12m_relevant_count_files = [x for x in image_search_counts if 'cc12m' in x]
data = []
processed_concepts = set()
cc12m_counts = []

for rfile in sorted(cc12m_relevant_count_files):
    dataset = rfile.split('/')[-1].split('_')[0]
    with open(rfile) as f:
        concepts = json.load(f)
        for concept, count in concepts.items():
            if concept not in processed_concepts:
                data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})
                processed_concepts.add(concept)
                cc12m_counts.append(count)

cc12m_df = pd.DataFrame(data)
print(len(cc12m_df))

3811


In [40]:
yfcc15m_relevant_count_files = [x for x in image_search_counts if 'yfcc15m' in x]
data = []
processed_concepts = set()
yfcc15m_counts = []

for rfile in sorted(yfcc15m_relevant_count_files):
    dataset = rfile.split('/')[-1].split('_')[0]
    with open(rfile) as f:
        concepts = json.load(f)
        for concept, count in concepts.items():
            if concept not in processed_concepts:
                data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})
                processed_concepts.add(concept)
                yfcc15m_counts.append(count)

yfcc15m_df = pd.DataFrame(data)
print(len(yfcc15m_df))

3811


In [41]:
laion400m_relevant_count_files = [x for x in image_search_counts if 'laion400m' in x]
data = []
processed_concepts = set()
laion400m_counts = []

for rfile in sorted(laion400m_relevant_count_files):
    dataset = rfile.split('/')[-1].split('_')[0]
    with open(rfile) as f:
        concepts = json.load(f)
        for concept, count in concepts.items():
            if concept not in processed_concepts:
                data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})
                processed_concepts.add(concept)
                laion400m_counts.append(count)

laion400m_df = pd.DataFrame(data)
print(len(laion400m_df))

3776


In [30]:
print(len(laion400m_counts))
print(len(yfcc15m_counts))

res = stats.spearmanr(laion400m_counts, yfcc15m_counts)
print(res)

3840
3840
SignificanceResult(statistic=0.7644588019322125, pvalue=0.0)


In [31]:

print(len(cc3m_counts))
print(len(cc12m_counts))

res = stats.spearmanr(cc3m_counts, cc12m_counts)
print(res)

3775
3775
SignificanceResult(statistic=0.7924258926755035, pvalue=0.0)


In [37]:

print(len(yfcc15m_counts))
print(len(cc12m_counts))

res = stats.spearmanr(yfcc15m_counts, cc12m_counts)
print(res)

3811
3811
SignificanceResult(statistic=0.9637908005672169, pvalue=0.0)


In [44]:

print(len(laion400m_counts))
print(len(cc3m_counts))

res = stats.spearmanr(laion400m_counts[:3775], cc3m_counts)
print(res)

3776
3775
SignificanceResult(statistic=0.6282374756265174, pvalue=0.0)


In [48]:

print(len(laion400m_counts))
print(len(cc12m_counts))

res = stats.spearmanr(laion400m_counts, cc12m_counts[:3776])
print(res)

3776
3811
SignificanceResult(statistic=0.7353264196231468, pvalue=0.0)


In [51]:

print(len(yfcc15m_counts))
print(len(cc12m_counts))

res = stats.spearmanr(yfcc15m_counts, cc12m_counts)
print(res)

3811
3811
SignificanceResult(statistic=0.9637908005672169, pvalue=0.0)
