# Seed List Cleanup

Prepare a clean list of seeds (candidates for pseudo-crawls)
- add columns required to get page locations and metrics from Common Crawl
- remove duplicated seeds
- normalize URLs

In [1]:
import pandas as pd

df = pd.read_csv('candidate_websites_for_crawling.csv')

df.head()

Unnamed: 0,#,Dataset title,"Domain Name / link\n(if highlighted in red, it's a duplicate! So don't add it...)",License\n(default is UNKNOWN),Release (Issue date),Glottocode,Language(s) (or family),Dialect/accent (if known),Subject,Format,Collection Style (Manual Curation vs Crowdsourced(web)) ?,What it is / why we want it (5-25 words),Volume (estimates),"Contains Personal Information? (-1=unlikely, 0=neutral, 1=likely)",Owner,Usage and relation to other datasets
0,12,Fundacion Cajasol,https://fundacioncajasol.com/,unknown,multiple releases,stan1288,es,Spain,General News,text (web),manual,,unknown,unknown,Fundacion Cajasol,
1,13,Asamblea Nacional del Ecuador,http://www.confirmado.net/,unknown,multiple releases,,es,Ecuador,General News,text (web),manual,,unknown,unknown,Asamblea Nacional del Ecuador,
2,14,el periodico de Tlaxcala,https://elperiodicodetlaxcala.com/,unknown,multiple releases,,es,Mexico,General News,text (web),manual,,unknown,unknown,el periodico de Tlaxcala,
3,15,mispeces,https://www.mispeces.com/,unknown,multiple releases,,es,Spain,General News,text (web),manual,,unknown,unknown,mispeces,
4,16,DiarioVasco,https://www.diariovasco.com/,unknown,multiple releases,,es,Spain,General News,text (web),manual,,unknown,unknown,DiarioVasco,


In [None]:
import json



In [2]:
# select mandatory columns and assign simple and SQL-compatible column names
df = df.iloc[:, [0,1,2,6]]
df.columns = ['id', 'title', 'link', 'language']
df.head()

Unnamed: 0,id,title,link,language
0,12,Fundacion Cajasol,https://fundacioncajasol.com/,es
1,13,Asamblea Nacional del Ecuador,http://www.confirmado.net/,es
2,14,el periodico de Tlaxcala,https://elperiodicodetlaxcala.com/,es
3,15,mispeces,https://www.mispeces.com/,es
4,16,DiarioVasco,https://www.diariovasco.com/,es


In [3]:
df.shape

(456, 4)

In [4]:
# normalize URLs and look for obsolete path prefixes
from urllib.parse import urlparse

def normalize_url(url):
    if url == 'reddit.com/r/singapore':
        url = 'https://www.reddit.com/r/singapore/'
    u = urlparse(url)
    path = u.path
    path = path.replace('//', '/')
    # normalize empty path (root path)
    if path == '':
        path = '/'
    # remove trailing file name
    if path[-1] != '/' and '.' in path.split('/')[-1]:
        path = '/'.join(path.split('/')[:-2])
    return '%s://%s%s' % (u.scheme, u.netloc, path)

def get_path_prefix(url):
    return urlparse(url).path

df['link'] = df['link'].apply(normalize_url)
df['url_path_prefix'] = df['link'].apply(get_path_prefix)

df['url_path_prefix'].value_counts().to_frame()

Unnamed: 0,url_path_prefix
/,391
,13
/es/,6
/wps/portal/rielcano_es,2
/forums/,2
/web/,1
/es,1
/singapore,1
/informa,1
/forum/,1


Some path prefixes seem to be mandatory
- language selectors: `/es/`, `/spanish/`
- location selectors: `/r/singapore/` (reddit.com)

Others only point to the homepage and would limit the recall to just this page:
- `/search/label/inicio`, `/pagina/bienvenidos-al-comite-de-sanidad-vegetal-cosave`

For now: we keep only prefixes up to 16 characters. However, clean curated URL prefixes might improve the data set in future runs.

In [5]:
def normalize_path_prefix(url):
    u = urlparse(url)
    path = u.path
    if len(path) > 16:
        path = '/'
    return '%s://%s%s' % (u.scheme, u.netloc, path)

df['link'] = df['link'].apply(normalize_path_prefix)
df['url_path_prefix'] = df['link'].apply(get_path_prefix)

df['url_path_prefix'].value_counts().to_frame()

Unnamed: 0,url_path_prefix
/,400
,13
/es/,6
/forums/,2
/web/,1
/presidencia/,1
/es,1
/informa,1
/forum/,1
/mining/,1


In [6]:
# add columns required to get the counts from Common Crawl

import surt
import tldextract

def get_host(url):
    return urlparse(url).netloc.lower().lstrip('.')

def get_surtkey(url):
    return surt.surt(url)

def get_registered_domain(host):
    return tldextract.extract(host).registered_domain


df['url_host_name'] = df['link'].apply(get_host)
df['url_host_registered_domain'] = df['url_host_name'].apply(get_registered_domain)
df['url_surtkey'] = df['link'].apply(get_surtkey)

df.head()

Unnamed: 0,id,title,link,language,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey
0,12,Fundacion Cajasol,https://fundacioncajasol.com/,es,/,fundacioncajasol.com,fundacioncajasol.com,"com,fundacioncajasol)/"
1,13,Asamblea Nacional del Ecuador,http://www.confirmado.net/,es,/,www.confirmado.net,confirmado.net,"net,confirmado)/"
2,14,el periodico de Tlaxcala,https://elperiodicodetlaxcala.com/,es,/,elperiodicodetlaxcala.com,elperiodicodetlaxcala.com,"com,elperiodicodetlaxcala)/"
3,15,mispeces,https://www.mispeces.com/,es,/,www.mispeces.com,mispeces.com,"com,mispeces)/"
4,16,DiarioVasco,https://www.diariovasco.com/,es,/,www.diariovasco.com,diariovasco.com,"com,diariovasco)/"


In [7]:
# look for duplicates
df[df.duplicated(subset=['url_surtkey'], keep=False)]

Unnamed: 0,id,title,link,language,url_path_prefix,url_host_name,url_host_registered_domain,url_surtkey
60,72,elcano royal insitute (real istituto elcano),http://www.realinstitutoelcano.org/,es,/,www.realinstitutoelcano.org,realinstitutoelcano.org,"org,realinstitutoelcano)/"
82,94,real instituto elcano,http://www.realinstitutoelcano.org/,es,/,www.realinstitutoelcano.org,realinstitutoelcano.org,"org,realinstitutoelcano)/"


In [8]:
# deduplicate
df.drop_duplicates(subset=['url_surtkey'], inplace=True)
df.shape

(455, 8)

In [9]:
# export the clean seed list
df.to_csv('seeds.csv', index=False)
df.to_parquet('seeds.gz.parquet', compression='gzip', index=False)