In [15]:
import pandas as pd
import numpy as np
import time
import pickle

In [16]:
# Read the Crunchbase organizations export into a DataFrame.
organizations_csv_path = 'cluster/scratch/bandonov/crunchbase/organizations.csv'
organizations = pd.read_csv(organizations_csv_path)

# List the available columns so later cells can refer to them by name.
print(organizations.columns)

Index(['uuid', 'name', 'type', 'primary_role', 'cb_url', 'domain',
       'homepage_url', 'logo_url', 'facebook_url', 'twitter_url',
       'linkedin_url', 'combined_stock_symbols', 'city', 'region',
       'country_code', 'short_description'],
      dtype='object')


In [3]:
# Collect every domain that is shared by more than one organization.
# Only the 'uuid' column is counted (the original counted every column and
# then sorted the result, neither of which affects the resulting set).
# groupby drops rows whose domain is missing, so NaN never ends up in the set.
uuid_count_by_domain = organizations.groupby('domain')['uuid'].count()
repeating_domains = set(uuid_count_by_domain[uuid_count_by_domain > 1].index)

In [4]:
# Blank out (set to NaN) every domain that more than one organization shares,
# so the remaining non-null domains identify a single row each.
# Vectorized isin/mask replaces a row-wise apply with a Python lambda;
# rows whose domain is already NaN are not in the set and stay NaN.
is_repeating_domain = organizations['domain'].isin(repeating_domains)
organizations['domain'] = organizations['domain'].mask(is_repeating_domain, np.nan)

In [5]:
# Build the two lookup tables from the non-null domains only.
# Dropping NaN up front avoids `del d[np.nan]`, which raises KeyError when no
# domain is missing and only removes a NaN key that is the very same object
# as np.nan.
unique_domains = organizations['domain'].dropna()

# domain -> index label of the organization's row
reverse_domain_reference = dict(zip(unique_domains, unique_domains.index))

# domain -> Common Crawl node id; None until a matching node is found
crunchbase_domain_to_commoncrawl_node = dict.fromkeys(unique_domains)

In [6]:
# Report how many distinct domains are available for matching.
n_reference_domains = len(reverse_domain_reference)
print(n_reference_domains)

3424685


In [7]:
# Scan the Common Crawl domain-vertices file and record, for every Crunchbase
# domain that appears in it, the id of the matching graph node.
nodes_path = 'cluster/scratch/bandonov/common-crawl/cc-main-2024-aug-sep-oct-domain-vertices.txt'

started_at = time.time()
with open(nodes_path) as vertices_file:
    for vertex_line in vertices_file:
        fields = vertex_line.split()

        # Each line is read as: <node id> <reversed domain> <host count>.
        index = int(fields[0])
        reversed_domain = fields[1]
        # Not used below; parsing it still raises on a non-integer third field.
        num_hosts = int(fields[2])

        # The file stores labels in reverse order (e.g. "com.example"),
        # so flip them back before looking the domain up.
        domain = '.'.join(reversed_domain.split('.')[::-1])
        if domain in crunchbase_domain_to_commoncrawl_node:
            crunchbase_domain_to_commoncrawl_node[domain] = index

# Elapsed wall-clock time in whole seconds.
print(int(time.time() - started_at), 's')

118 s


In [34]:
# Spot-check: show the first domain -> node mapping (nothing is printed if
# the mapping is empty).
first_mapping = next(iter(crunchbase_domain_to_commoncrawl_node.items()), None)
if first_mapping is not None:
    print(first_mapping[0], '->', first_mapping[1])

wetpaint.com -> 53837717


In [8]:
# Invert the mapping: Common Crawl node id -> Crunchbase domain.
# Domains with no matching node still hold None; skip them so they do not all
# collapse into a single meaningless `None` key.
commoncrawl_node_to_domain = {
    node: domain
    for domain, node in crunchbase_domain_to_commoncrawl_node.items()
    if node is not None
}

In [71]:
# Count, for every Common Crawl node that corresponds to a Crunchbase domain,
# how many edges in the domain graph point at it.
edges_path = 'cluster/scratch/bandonov/common-crawl/cc-main-2024-aug-sep-oct-domain-edges.txt'
number_of_in_edges = {}

started_at = time.time()
with open(edges_path) as edges_file:
    for edge_line in edges_file:
        fields = edge_line.split()
        # Each line is read as: <source node id> <target node id>.
        from_node = int(fields[0])
        to_node = int(fields[1])

        # Only targets that map back to a Crunchbase domain are tallied.
        if to_node in commoncrawl_node_to_domain:
            number_of_in_edges[to_node] = number_of_in_edges.get(to_node, 0) + 1

# Elapsed wall-clock time in whole seconds.
print(int(time.time() - started_at))

1230


In [77]:
# Cache the in-edge counts on disk so the slow edge scan need not be repeated.
with open('mentions.pkl', 'wb') as mentions_file:
    pickle.dump(number_of_in_edges, mentions_file)

print('Number of in edges saved!')

Number of in edges saved!


In [9]:
# Restore the cached in-edge counts (node id -> number of incoming edges).
# NOTE: pickle.load can execute arbitrary code; only load a file this
# notebook wrote itself.
number_of_in_edges = None
with open('mentions.pkl', 'rb') as mentions_file:
    number_of_in_edges = pickle.load(mentions_file)

print('Loaded in edges')

Loaded in edges


In [10]:
# Attach the in-edge count of each organization's domain as a new column.
# The counts are keyed by domain and joined with Series.map, so the result
# does not depend on the DataFrame's index labels being 0..n-1 positions
# (the original used index labels as positions in a plain list).
in_edges_by_domain = {
    commoncrawl_node_to_domain[node]: n_in_edges
    for node, n_in_edges in number_of_in_edges.items()
}

# Organizations with no domain, or whose domain has no recorded in-edges,
# map to NaN and are reported as 0.
organizations['n_website_references'] = (
    organizations['domain']
    .map(in_edges_by_domain)
    .fillna(0)
    .astype('int64')
)

In [11]:
# Preview the first rows, including the new n_website_references column.
organizations_preview = organizations.head()
print(organizations_preview)

                                   uuid       name          type primary_role  \
0  e1393508-30ea-8a36-3f96-dd3226033abd   Wetpaint  organization      company   
1  bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7       Zoho  organization      company   
2  5f2b40b8-d1b3-d323-d81a-b7a8e89553d0       Digg  organization      company   
3  df662812-7f97-0b43-9d3e-12f64f504fbb       Meta  organization      company   
4  60485007-8856-bbac-aa1b-c535c41f5f47  Omnidrive  organization      company   

                                              cb_url         domain  \
0  https://www.crunchbase.com/organization/wetpai...   wetpaint.com   
1  https://www.crunchbase.com/organization/zoho?u...       zoho.com   
2  https://www.crunchbase.com/organization/digg?u...       digg.com   
3  https://www.crunchbase.com/organization/facebo...       meta.com   
4  https://www.crunchbase.com/organization/omnidr...  omnidrive.com   

               homepage_url  \
0  http://www.wetpaint.com/   
1      https://www.zoho.

In [12]:
# Persist the enriched table next to the original export.
# index=True is the pandas default: the row index is written as an unnamed
# first column.
enriched_csv_path = 'cluster/scratch/bandonov/crunchbase/organizations_with_n_website_references.csv'
organizations.to_csv(enriched_csv_path, index=True)