# Seed List Cleanup

Prepare a clean list of seeds (candidates for pseudo-crawls)
- add columns required to get page locations and metrics from Common Crawl
- remove duplicated seeds
- normalize URLs

In [8]:
import pandas as pd
import json

# Read the raw seed records. The encoding is passed explicitly because JSON
# text is normally UTF-8, while open() would otherwise fall back to the
# platform default encoding.
with open('seeds_batch_2.json', "r", encoding="utf-8") as fi:
    data = json.load(fi)

# Promote the homepage URL of every seed to a top-level "link" field so that
# it becomes a regular column of the data frame.
for seed in data:
    seed["link"] = seed["description"]["homepage"]

# One row per seed record.
df = pd.DataFrame(data)

df.head()

Unnamed: 0,uid,type,description,link
0,bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili
1,bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza
2,bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo
3,bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba
4,global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org


In [10]:
df.shape

(9, 4)

In [12]:
# normalize URLs and look for obsolete path prefixes
from urllib.parse import urlparse

def normalize_url(url):
    """Normalize a seed URL to the form ``scheme://netloc/path``.

    The following steps are applied:
    - one known seed given without scheme is replaced by its full URL,
    - runs of consecutive slashes in the path are collapsed to a single slash,
    - an empty path becomes the root path ``/``,
    - a trailing file name (last path segment containing a dot) is removed
      while the directory containing it is kept.

    Query string and fragment are not part of the returned URL.

    :param url: the seed URL as a string
    :return: the normalized URL string
    """
    # manual fix for the one seed that lacks the scheme
    if url == 'reddit.com/r/singapore':
        url = 'https://www.reddit.com/r/singapore/'
    u = urlparse(url)
    path = u.path
    # collapse repeated slashes; loop because a single replace() pass
    # turns '///' into '//' only
    while '//' in path:
        path = path.replace('//', '/')
    # normalize empty path (root path)
    if path == '':
        path = '/'
    # remove trailing file name, keep the parent directory with its
    # trailing slash ('/a/b/index.html' -> '/a/b/', '/index.html' -> '/')
    if path[-1] != '/' and '.' in path.split('/')[-1]:
        path = path.rsplit('/', 1)[0] + '/'
    return '%s://%s%s' % (u.scheme, u.netloc, path)

def get_path_prefix(url):
    """Return the path component of `url` (without query and fragment)."""
    parsed = urlparse(url)
    return parsed.path

# Apply the URL normalization and derive the path prefix of every seed.
df['link'] = df['link'].apply(normalize_url)
df['url_path_prefix'] = df['link'].apply(get_path_prefix)

# Only the last expression of a cell is rendered automatically, so the prefix
# frequencies are shown explicitly (display is provided by the IPython kernel).
display(df['url_path_prefix'].value_counts().to_frame())
df.head()

Unnamed: 0,uid,type,description,link,url_path_prefix
0,bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili,/swahili
1,bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza,/gahuza
2,bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo,/igbo
3,bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba,/yoruba
4,global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org/,/


Some path prefixes seem to be mandatory
- language selectors: `/es/`, `/spanish/`
- location selectors: `/r/singapore/` (reddit.com)

Others only point to the homepage and would limit the recall to just this page:
- `/search/label/inicio`, `/pagina/bienvenidos-al-comite-de-sanidad-vegetal-cosave`

For now, we keep only path prefixes of up to 16 characters; longer paths are reset to the root path `/`. However, cleanly curated URL prefixes might improve the data set in future runs.

In [14]:
def normalize_path_prefix(url, max_length=16):
    """Reset overly long path prefixes of a URL to the root path.

    A path longer than `max_length` characters is replaced by ``/``;
    shorter paths are kept unchanged. Query string and fragment are not
    part of the returned URL.

    :param url: the URL string to process
    :param max_length: maximum number of characters (including the leading
        slash) a path may have to be kept, defaults to 16
    :return: the URL as ``scheme://netloc/path``
    """
    u = urlparse(url)
    path = u.path
    # long paths are dropped entirely, not truncated
    if len(path) > max_length:
        path = '/'
    return '%s://%s%s' % (u.scheme, u.netloc, path)

# Reset overly long path prefixes and recompute the prefix column.
df['link'] = df['link'].apply(normalize_path_prefix)
df['url_path_prefix'] = df['link'].apply(get_path_prefix)

# Only the last expression of a cell is rendered automatically, so the prefix
# frequencies are shown explicitly (display is provided by the IPython kernel).
display(df['url_path_prefix'].value_counts().to_frame())
df.head()

Unnamed: 0,uid,type,description,link,url_path_prefix
0,bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili,/swahili
1,bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza,/gahuza
2,bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo,/igbo
3,bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba,/yoruba
4,global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org/,/


In [15]:
# add columns required to get the counts from Common Crawl

import surt
import tldextract

def get_host(url):
    """Return the lower-cased network location of `url`, leading dots removed."""
    netloc = urlparse(url).netloc
    lowered = netloc.lower()
    return lowered.lstrip('.')

def get_surtkey(url):
    """Return the key computed by `surt.surt` for `url`."""
    key = surt.surt(url)
    return key

def get_registered_domain(host):
    """Return the `registered_domain` that tldextract reports for `host`."""
    extracted = tldextract.extract(host)
    return extracted.registered_domain


# host name and registered domain are derived from the cleaned link,
# the SURT key is computed from the full link
host_names = df['link'].apply(get_host)
df['url_host_name'] = host_names
df['url_host_registered_domain'] = host_names.apply(get_registered_domain)
df['url_surtkey'] = df['link'].apply(get_surtkey)

df.head()

Unnamed: 0,uid,type,description,link,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey
0,bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili,/swahili,www.bbc.com,bbc.com,"com,bbc)/swahili"
1,bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza,/gahuza,www.bbc.com,bbc.com,"com,bbc)/gahuza"
2,bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo,/igbo,www.bbc.com,bbc.com,"com,bbc)/igbo"
3,bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba,/yoruba,www.bbc.com,bbc.com,"com,bbc)/yoruba"
4,global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org/,/,yo.globalvoices.org,globalvoices.org,"org,globalvoices,yo)/"


In [16]:
# list every row whose SURT key occurs more than once (all occurrences are shown)
duplicate_mask = df.duplicated(subset=['url_surtkey'], keep=False)
df[duplicate_mask]

Unnamed: 0,uid,type,description,link,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey


In [17]:
# deduplicate: keep the first seed per SURT key. The frame is rebound instead
# of using inplace=True, and the index is renumbered so that row labels stay
# contiguous after rows were dropped.
df = df.drop_duplicates(subset=['url_surtkey']).reset_index(drop=True)
df.shape

(9, 8)

In [21]:
# generate consecutive ids, continuing after batch 1 (whose max id was 697)
offset = 698
df["id"] = list(range(offset, offset + len(df)))
df.head()

Unnamed: 0,uid,type,description,link,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey,id
0,bbc_swahili,primary,{'homepage': 'https://www.bbc.com/swahili'},https://www.bbc.com/swahili,/swahili,www.bbc.com,bbc.com,"com,bbc)/swahili",698
1,bbc_gahuza,primary,{'homepage': 'https://www.bbc.com/gahuza'},https://www.bbc.com/gahuza,/gahuza,www.bbc.com,bbc.com,"com,bbc)/gahuza",699
2,bbc_igbo,primary,{'homepage': 'https://www.bbc.com/igbo'},https://www.bbc.com/igbo,/igbo,www.bbc.com,bbc.com,"com,bbc)/igbo",700
3,bbc_yoruba,primary,{'homepage': 'https://www.bbc.com/yoruba'},https://www.bbc.com/yoruba,/yoruba,www.bbc.com,bbc.com,"com,bbc)/yoruba",701
4,global_voices_yoruba,primary,{'homepage': 'https://yo.globalvoices.org'},https://yo.globalvoices.org/,/,yo.globalvoices.org,globalvoices.org,"org,globalvoices,yo)/",702


In [22]:
# write the clean seed list as CSV and as gzip-compressed Parquet (no index column)
csv_path = 'seeds_batch_2.csv'
parquet_path = 'seeds_batch_2.gz.parquet'
df.to_csv(csv_path, index=False)
df.to_parquet(parquet_path, index=False, compression='gzip')