In [1]:
import pandas as pd
import numpy as np

import sqlite3

## Load in ODP dataset

In [2]:
# Location of the ODP content dump that the parsing cell below opens as UTF-8 text
# and scans line by line.
odpcontentpath = "webdata/odp/content.rdf.u8"

In [19]:
# Line-oriented scan of the ODP RDF dump. The file is never parsed as XML: each
# tag of interest is recognised by the exact leading text (indentation included)
# of the line it sits on.
PAGELEAD = "  <ExternalPage"
# Index of the opening quote of the about="..." attribute on a page line.
LINKPOS = len(PAGELEAD) + len(" about=")

TITLELEAD = "    <d:Title>"
DESCLEAD = "    <d:Description>"
TOPICLEAD = "    <topic>Top/"

ENDPAGE = "  </ExternalPage"


# One [ReversedDomain, URL, Title, Description, Topic] row per completed page.
data = []

# Fields of the page currently being read.
link = ""
revdomain = ""
title = ""
description = ""
topic = ""

foundpage = False  # True while between <ExternalPage ...> and </ExternalPage>
n = 0              # number of <ExternalPage> opening lines seen
with open(odpcontentpath, "r", encoding="utf-8") as file:
    for line in file:
        if foundpage:
            if line.startswith(TITLELEAD):
                title = line[len(TITLELEAD):line.find("</")]
            elif line.startswith(DESCLEAD):
                description = line[len(DESCLEAD):line.find("</")]
            elif line.startswith(TOPICLEAD):
                topic = line[len(TOPICLEAD):line.find("</")]
                # Keep only the top-level category (text before the first "/").
                slash = topic.find("/")
                if slash != -1:
                    topic = topic[:slash]
            elif line.startswith(ENDPAGE):
                data.append([revdomain, link, title, description, topic])
                foundpage = False

        if line.startswith(PAGELEAD):
            foundpage = True
            n += 1

            # Start every page from a clean slate. Without this, a page that
            # lacks a Title, Description or topic line would silently inherit
            # the value parsed for the previous page.
            title = ""
            description = ""
            topic = ""

            # Text between the quotes of about="...".
            link = line[LINKPOS+1:line.find(">")-1]

            # Host part: drop the scheme and everything after the first "/".
            domain = link.split("://")[-1].split("/")[0]
            if domain.startswith("www."):
                domain = domain[4:]
            # Reverse the labels (a.b.com -> com.b.a) to match the
            # ReversedDomain format of the Common Crawl vertex list.
            revdomain = ".".join(reversed(domain.split(".")))
            # A trailing dot on the host becomes a leading dot after reversal.
            # startswith() is used instead of indexing so that an empty host
            # (e.g. about="") does not raise IndexError.
            if revdomain.startswith("."):
                revdomain = revdomain[1:]

In [20]:
# Turn the parsed rows into a frame; the column order mirrors the order in which
# the parser appends fields to each row.
odp_df = pd.DataFrame(
    data=data,
    columns=[
        "ReversedDomain",
        "URL",
        "Title",
        "Description",
        "Topic",
    ],
)
odp_df.head(n=20)

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic
0,com.awn,http://www.awn.com/,Animation World Network,Provides information resources to the internat...,Arts
1,com.about.animation,http://animation.about.com/,About.com: Animation Guide,Keep up with developments in online animation ...,Arts
2,com.toonhound,http://www.toonhound.com/,Toonhound,"British cartoon, animation and comic strip cre...",Arts
3,com.digitalmediafx,http://www.digitalmediafx.com/Features/animati...,Digital Media FX: The History of Animation,Michael Crandol takes an exhaustive look at th...,Arts
4,net.animated-divots,http://www.animated-divots.net/,Richard's Animated Divots,"Chronology of animated movies, television prog...",Arts
5,com.angelfire,http://www.angelfire.com/anime2/ninisbishonen/,Nini's Bishonen Dungeon,"Shrines to Vega, Taiki, Dilandau, and Tiger Ey...",Arts
6,com.angelfire,http://www.angelfire.com/anime2/bestanimechara...,Site for Liz's Anime Favorites,"Shrines to Duo, Ryoko, Shampoo, Katy the Kitty...",Arts
7,com.tripod.valleyofazure,http://valleyofazure.tripod.com/,Azure Valley,Dedicated to anthropomorphic characters. Fan a...,Arts
8,com.angelfire,http://www.angelfire.com/nv/neko/,Neko Central,"Image galleries, descriptions, information, an...",Arts
9,com.angelfire,http://www.angelfire.com/grrl/magicshoppe2/,Chibi Hime's Magic Shoppe 2,"Shrines to Escaflowne's Dilandau, Final Fantas...",Arts


In [21]:
# Cache the parsed ODP frame on disk (without the pandas index column).
odp_df.to_csv("outputs/odp_df.csv", index=False)

In [3]:
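# Optional shortcut: reload the frame cached by the previous cell instead of re-parsing the dump.
# odp_df = pd.read_csv("outputs/odp_df.csv")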

In [4]:
# Number of ODP rows (one per page appended by the parser).
len(odp_df)

3573026

## Load in Common Crawl web graph

In [5]:
# Location of the Common Crawl web-graph vertex file; it is read below as
# tab-separated (ID, ReversedDomain) rows.
verticiesdomainpath = "webdata/commoncrawl/cc-main-2017-may-jun-jul/vertices.txt"

In [6]:
# Load the Common Crawl vertex list as a two-column frame (ID, ReversedDomain).
ccdomains_df = pd.read_csv(
    verticiesdomainpath,
    sep="\t",
    names=["ID", "ReversedDomain"],  # column names are supplied here, not read from the file
    dtype={"ID": "int32", "ReversedDomain": "string"},  # compact dtypes for a very large file
    # Treat every field as literal text. With the default NA filtering, host
    # strings such as "NA", "null" or "nan" would be converted to missing values
    # and could then never match in the join.
    na_filter=False,
    engine="c",  # C parser for speed
)

In [7]:
# Number of rows read from the Common Crawl vertex file.
len(ccdomains_df)

91034128

In [8]:
# Peek at the first rows to check the ID / ReversedDomain layout.
ccdomains_df.head(20)

Unnamed: 0,ID,ReversedDomain
0,0,aaa.a
1,1,aaa.aa
2,2,aaa.aaa
3,3,aaa.aaaa
4,4,aaa.aaaaaa
5,5,aaa.aaaaaaaa
6,6,aaa.aaaaaaaaa
7,7,aaa.aaaaaaaaaaa
8,8,aaa.asd
9,9,aaa.bzzzz


## Match the dataframes

In [9]:
# The join is done in an on-disk SQLite database rather than in pandas.
conn = sqlite3.connect("outputs/matching.db")

# if_exists="replace" drops and recreates each table, so re-running this cell
# starts from fresh tables; index=False keeps the pandas index out of the schema.
odp_df.to_sql("odp", conn, if_exists="replace", index=False)
ccdomains_df.to_sql("ccdomains", conn, if_exists="replace", index=False)

91034128

In [10]:
# Index the join key on the ODP side. IF NOT EXISTS makes the cell safe to
# re-run: a plain CREATE INDEX raises "index idx_odp already exists" the second time.
conn.execute("CREATE INDEX IF NOT EXISTS idx_odp ON odp (ReversedDomain);")

<sqlite3.Cursor at 0x4287409c0>

In [11]:
# Index the join key on the Common Crawl side. IF NOT EXISTS makes the cell safe
# to re-run: a plain CREATE INDEX raises "index idx_ccdomains already exists" the second time.
conn.execute("CREATE INDEX IF NOT EXISTS idx_ccdomains ON ccdomains (ReversedDomain);")

<sqlite3.Cursor at 0x428740dc0>

In [12]:
# Sanity check: read the ReversedDomain column back from the odp table.
# Note: this pulls every row of the column into memory just to display it.
pd.read_sql_query("SELECT ReversedDomain FROM odp;", conn)

Unnamed: 0,ReversedDomain
0,%20de.%20xn--wirtshaus-mittelmhle-5ec
1,1.104.23.163
2,1.105.23.163
3,1.108.23.163
4,1.111.23.163
...,...
3573021,zw.org.zispa
3573022,zw.org.zispa
3573023,zw.org.zlhr
3573024,zw.org.zlhr


In [13]:
# Remove any "matched" table left by an earlier run. CREATE TABLE fails if the
# table already exists, which made this cell impossible to re-run.
conn.execute("DROP TABLE IF EXISTS matched;")

# Keep only ODP rows whose reversed domain also appears in the Common Crawl
# vertex list. SELECT * carries both copies of the join column; a later cell
# drops the duplicate, which pandas reads back as "ReversedDomain:1".
query = """
    CREATE TABLE matched AS
    SELECT *
    FROM odp
    INNER JOIN ccdomains
    ON odp.ReversedDomain = ccdomains.ReversedDomain;
"""

conn.execute(query)

<sqlite3.Cursor at 0x428740fc0>

In [14]:
# Pull the joined table back into pandas.
matched_df = pd.read_sql_query("SELECT * FROM matched", conn)

In [15]:
# The join's SELECT * brings the key column in twice; discard the second copy.
matched_df = matched_df.drop(columns=['ReversedDomain:1'])

In [16]:
# Display the matched frame (ODP columns plus the Common Crawl vertex ID).
matched_df

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383
3,ac.acs,http://www.acs.ac/,Anderson County Schools,"(Clinton) Information, departments, and rules ...",Regional,383
4,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390
...,...,...,...,...,...,...
3265652,zw.org.nascoh,http://www.nascoh.org.zw/,National Association of Societies for the Care...,The umbrella body for organisations of and for...,Society,91033952
3265653,zw.org.zispa,http://www.zispa.org.zw/,ZW Domain - Zimbabwe,NIC for .zw CCTLD.,Computers,91034085
3265654,zw.org.zispa,http://www.zispa.org.zw/,Zimbabwe Internet Service Providers Association,"A non-profit organisation which controls, allo...",Regional,91034085
3265655,zw.org.zlhr,http://www.zlhr.org.zw/,Zimbabwe Lawyers for Human Rights (ZLHR),Non-profit organisation focusing on promoting ...,Regional,91034088


In [17]:
# Persist the matched rows (without the pandas index column).
matched_df.to_csv("outputs/matched_df.csv", index=False)