In [8]:
import pandas as pd
import numpy as np
import re
import sqlite3

In [16]:
# Load the matched records from disk. The preview below shows the columns
# ReversedDomain, URL, Title, Description, Topic and ID.
matched_df = pd.read_csv("outputs/matched_df.csv")
matched_df

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383
3,ac.acs,http://www.acs.ac/,Anderson County Schools,"(Clinton) Information, departments, and rules ...",Regional,383
4,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390
...,...,...,...,...,...,...
3265652,zw.org.nascoh,http://www.nascoh.org.zw/,National Association of Societies for the Care...,The umbrella body for organisations of and for...,Society,91033952
3265653,zw.org.zispa,http://www.zispa.org.zw/,ZW Domain - Zimbabwe,NIC for .zw CCTLD.,Computers,91034085
3265654,zw.org.zispa,http://www.zispa.org.zw/,Zimbabwe Internet Service Providers Association,"A non-profit organisation which controls, allo...",Regional,91034085
3265655,zw.org.zlhr,http://www.zlhr.org.zw/,Zimbabwe Lawyers for Human Rights (ZLHR),Non-profit organisation focusing on promoting ...,Regional,91034088


In [17]:
# Swap every NaN for an empty string so that concatenating the Title and
# Description text columns never yields NaN.
matched_df = matched_df.replace(to_replace=np.nan, value="")

In [18]:
# Combine title and description into one text column, separated by a space.
title_text = matched_df["Title"]
description_text = matched_df["Description"]
matched_df["AllContent"] = title_text + " " + description_text

In [19]:
# Derive "CleanedContent": a list of lower-cased tokens for every row of "AllContent".

double_dash_regex = re.compile(r"--+")  # Runs of two or more dashes
invalid_char_regex = re.compile(r"[^\w\s-]+")  # Anything that is not a word character, whitespace or a dash
underscore_regex = re.compile(r"_")  # Underscores (they count as \w, so the pattern above keeps them)
numbers_regex = re.compile(r"\b[\w-]*\d[\w-]*\b")  # Whole tokens that contain at least one digit
leading_trailing_dash_regex = re.compile(r"(?<!\w)-+|-+(?!\w)")  # Dash runs not preceded, or not followed, by a word character

# Substitutions are applied strictly in this order.
# NOTE: \w is Unicode-aware for str patterns, so non-ASCII letters survive
# these steps; they are not removed here.
cleaning_steps = [
    (double_dash_regex, " "),            # dash runs become a separator
    (invalid_char_regex, " "),           # punctuation and symbols become a separator
    (underscore_regex, " "),             # underscores become a separator
    (numbers_regex, ""),                 # drop tokens containing digits
    (leading_trailing_dash_regex, ""),   # trim dashes at the edges of each token
]

cleaned_text = matched_df["AllContent"]
for pattern, replacement in cleaning_steps:
    cleaned_text = cleaned_text.str.replace(pattern, replacement, regex=True)

# Lower-case, then split on whitespace into a list of tokens.
matched_df["CleanedContent"] = cleaned_text.str.lower().str.split()

matched_df.head(20)

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID,AllContent,CleanedContent
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362,Accent Services UK based full service commerci...,"[accent, services, uk, based, full, service, c..."
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362,Accent Services A full service commercial and ...,"[accent, services, a, full, service, commercia..."
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383,Anderson County Schools K-12 public schools in...,"[anderson, county, schools, public, schools, i..."
3,ac.acs,http://www.acs.ac/,Anderson County Schools,"(Clinton) Information, departments, and rules ...",Regional,383,"Anderson County Schools (Clinton) Information,...","[anderson, county, schools, clinton, informati..."
4,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390,9:05 Allows the game to be played on-line via ...,"[allows, the, game, to, be, played, on-line, v..."
5,ac.adamcadre,http://adamcadre.ac/,Adam Cadre,"Author's site with interactive fiction, inform...",Games,390,Adam Cadre Author's site with interactive fict...,"[adam, cadre, author, s, site, with, interacti..."
6,ac.adamcadre,http://adamcadre.ac/gull/gull.html,Gull,A package designed to enable people to familia...,Games,390,Gull A package designed to enable people to fa...,"[gull, a, package, designed, to, enable, peopl..."
7,ac.adamcadre,http://www.adamcadre.ac/inform.html,"Inform Library Patch Site, The",Catalog of bugs and patches for the Inform lib...,Games,390,"Inform Library Patch Site, The Catalog of bugs...","[inform, library, patch, site, the, catalog, o..."
8,ac.adamcadre,http://adamcadre.ac/if.html#I-0,AdamCadre.ac: Games,Author's website. Download I-0 or play it onli...,Games,390,AdamCadre.ac: Games Author's website. Download...,"[adamcadre, ac, games, author, s, website, dow..."
9,ac.adamcadre,http://adamcadre.ac/if.html#Photopia,AdamCadre.ac: Games,Author's website. Download Photopia in Z-machi...,Games,390,AdamCadre.ac: Games Author's website. Download...,"[adamcadre, ac, games, author, s, website, dow..."


In [20]:
# Discard every token that contains a non-ASCII character.
def _keep_ascii(words):
    """Return the tokens from `words` made up solely of ASCII characters."""
    return [token for token in words if token.isascii()]

matched_df["CleanedContent"] = matched_df["CleanedContent"].apply(_keep_ascii)

# Rows left without a single token are removed and the index is renumbered.
non_empty_rows = matched_df["CleanedContent"].str.len() > 0
matched_df = matched_df[non_empty_rows].reset_index(drop=True)

matched_df.head(20)

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID,AllContent,CleanedContent
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362,Accent Services UK based full service commerci...,"[accent, services, uk, based, full, service, c..."
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362,Accent Services A full service commercial and ...,"[accent, services, a, full, service, commercia..."
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383,Anderson County Schools K-12 public schools in...,"[anderson, county, schools, public, schools, i..."
3,ac.acs,http://www.acs.ac/,Anderson County Schools,"(Clinton) Information, departments, and rules ...",Regional,383,"Anderson County Schools (Clinton) Information,...","[anderson, county, schools, clinton, informati..."
4,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390,9:05 Allows the game to be played on-line via ...,"[allows, the, game, to, be, played, on-line, v..."
5,ac.adamcadre,http://adamcadre.ac/,Adam Cadre,"Author's site with interactive fiction, inform...",Games,390,Adam Cadre Author's site with interactive fict...,"[adam, cadre, author, s, site, with, interacti..."
6,ac.adamcadre,http://adamcadre.ac/gull/gull.html,Gull,A package designed to enable people to familia...,Games,390,Gull A package designed to enable people to fa...,"[gull, a, package, designed, to, enable, peopl..."
7,ac.adamcadre,http://www.adamcadre.ac/inform.html,"Inform Library Patch Site, The",Catalog of bugs and patches for the Inform lib...,Games,390,"Inform Library Patch Site, The Catalog of bugs...","[inform, library, patch, site, the, catalog, o..."
8,ac.adamcadre,http://adamcadre.ac/if.html#I-0,AdamCadre.ac: Games,Author's website. Download I-0 or play it onli...,Games,390,AdamCadre.ac: Games Author's website. Download...,"[adamcadre, ac, games, author, s, website, dow..."
9,ac.adamcadre,http://adamcadre.ac/if.html#Photopia,AdamCadre.ac: Games,Author's website. Download Photopia in Z-machi...,Games,390,AdamCadre.ac: Games Author's website. Download...,"[adamcadre, ac, games, author, s, website, dow..."


In [21]:
# Project the four columns of interest and expose the token lists as "Content".
cleaned_df = (
    matched_df[["ReversedDomain", "CleanedContent", "Topic", "ID"]]
    .rename(columns={"CleanedContent": "Content"})
)
cleaned_df.head(20)

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,"[accent, services, uk, based, full, service, c...",Business,362
1,ac.accent,"[accent, services, a, full, service, commercia...",Regional,362
2,ac.acs,"[anderson, county, schools, public, schools, i...",Regional,383
3,ac.acs,"[anderson, county, schools, clinton, informati...",Regional,383
4,ac.adamcadre,"[allows, the, game, to, be, played, on-line, v...",Games,390
5,ac.adamcadre,"[adam, cadre, author, s, site, with, interacti...",Games,390
6,ac.adamcadre,"[gull, a, package, designed, to, enable, peopl...",Games,390
7,ac.adamcadre,"[inform, library, patch, site, the, catalog, o...",Games,390
8,ac.adamcadre,"[adamcadre, ac, games, author, s, website, dow...",Games,390
9,ac.adamcadre,"[adamcadre, ac, games, author, s, website, dow...",Games,390


In [76]:
# Number of rows in cleaned_df at this point.
cleaned_df.shape[0]

3024464

In [22]:
# Collapse all rows that share an (ID, Topic) pair into a single row.
cleaned_df = cleaned_df.groupby(['ID', 'Topic'], as_index=False).agg({
        'ReversedDomain': 'first',  # Keep the first ReversedDomain entry
        # Concatenate the token lists in row order, keeping duplicates.
        # A flat comprehension is linear in the number of tokens, whereas
        # sum(lists, []) re-copies the accumulated list for every row.
        'Content': lambda lists: [token for tokens in lists for token in tokens]
    })

In [23]:
# Preview the first rows of cleaned_df.
cleaned_df.head(20)

Unnamed: 0,ID,Topic,ReversedDomain,Content
0,362,Business,ac.accent,"[accent, services, uk, based, full, service, c..."
1,362,Regional,ac.accent,"[accent, services, a, full, service, commercia..."
2,383,Regional,ac.acs,"[anderson, county, schools, public, schools, i..."
3,390,Games,ac.adamcadre,"[allows, the, game, to, be, played, on-line, v..."
4,424,World,ac.aikido,"[langnau, aikidogruppe, in, zusammenarbeit, mi..."
5,442,Computers,ac.alastairc,"[alastairc, the, four, levels, of, pdf, access..."
6,536,Business,ac.apec,"[apec, birmingham, practice, provides, informa..."
7,548,Health,ac.apt,"[the, association, for, psychological, therapi..."
8,567,World,ac.arctic,"[arctic, herstellerseite, mit, vorstellung, se..."
9,650,World,ac.badminton,"[badminton, academy]"


In [24]:
# Number of rows in cleaned_df at this point.
cleaned_df.shape[0]

2289397

In [25]:
# Persist cleaned_df without the index. The "Content" column holds Python
# lists, which to_csv writes as their string representation, so a reader has
# to parse them back into lists.
cleaned_df.to_csv('outputs/cleaned_matched_df.csv', index=False)

## Create list of all terms

In [78]:
# Flatten every token list into one Series and keep the first occurrence of
# each distinct token. The index still carries the label of the source row.
# NOTE(review): the saved output shows index labels up to 3024462, which is
# larger than the 2289397 rows reported after grouping, so this cell was last
# run against a different state of cleaned_df. Restart and run all cells to
# refresh it.
terms = cleaned_df["Content"].explode().drop_duplicates()
terms

0               accent
0             services
0                   uk
0                based
0                 full
              ...     
3024432    sterkinekor
3024436       tanganda
3024443         utande
3024459         nascoh
3024462           zlhr
Name: Content, Length: 1428965, dtype: object

In [80]:
# Number of distinct terms.
terms.shape[0]

1428965

In [81]:
# Wrap the distinct terms in a single-column frame named "Terms".
terms_df = pd.DataFrame({'Terms': terms})

In [83]:
# The index inherited from explode() is non-contiguous and repeats labels;
# replace it with a fresh 0..n-1 range and discard the old labels.
terms_df = terms_df.reset_index(drop=True)
terms_df

Unnamed: 0,Terms
0,accent
1,services
2,uk
3,based
4,full
...,...
1428960,sterkinekor
1428961,tanganda
1428962,utande
1428963,nascoh


## Create term-vectors

In [84]:
# Give every token in "Content" its own row; the other columns are repeated
# for each token of the originating row.
exploded_df = cleaned_df.explode("Content")

In [85]:
# Count how often each token occurs within each topic (long format:
# one row per Topic/Content pair with its "Occurrences").
occurrences = exploded_df.groupby("Topic")["Content"].value_counts()
term_counts = occurrences.reset_index(name="Occurrences")
term_counts

Unnamed: 0,Topic,Content,Occurrences
0,Arts,and,185075
1,Arts,the,101480
2,Arts,of,80814
3,Arts,a,42547
4,Arts,in,33689
...,...,...,...
2029887,World,zzuchthunde,1
2029888,World,zzum,1
2029889,World,zzurro,1
2029890,World,zzwei,1


In [86]:
# Reshape to wide format: one row per term, one integer column per topic.
# Term/topic pairs that never occur are filled with 0. reset_index() is
# chained instead of applied in place, so "Content" comes back as a regular
# column within the same expression.
# NOTE: this cell overwrites the long-format term_counts, so it cannot be
# re-run on its own without first re-running the cell that builds it.
term_counts = (
    term_counts
    .pivot(index="Content", columns="Topic", values="Occurrences")
    .fillna(0)
    .astype(int)
    .reset_index()
)
term_counts

Topic,Content,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,a,42547,29110,20246,9309,11929,5302,1021,18110,8420,173199,15888,12602,44369,13808,106514
1,a-a,0,1,0,0,0,0,0,0,0,4,0,0,0,0,0
2,a-a-p,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
3,a-a-reisen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,a-aaa,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428960,zzyzx,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1428961,zzz,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3
1428962,zzzebra,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6
1428963,zzzing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [93]:
# Name the term column "Terms" so it can serve as the join key with terms_df.
term_counts = term_counts.rename({"Content": "Terms"}, axis="columns")

In [98]:
# Attach the per-topic counts to every term, keeping the row order of terms_df.
term_vectors_df = terms_df.merge(
    term_counts,  # Per-topic occurrence counts, one row per term
    on="Terms",  # Join key: the 'Terms' column present in both frames
    how="left",  # Keep every row of terms_df
    validate="one_to_one",  # Raise if a term is duplicated on either side
)
term_vectors_df

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,accent,19,76,6,0,10,0,0,1,2,137,3,41,5,2,110
1,services,779,30568,8031,97,7371,400,134,1478,1857,156151,3359,878,12404,820,16133
2,uk,1876,5894,1014,164,877,214,123,1274,246,8841,1187,1648,1636,1434,168
3,based,5339,12949,4371,552,1385,245,308,2478,442,14008,1493,1599,4172,2415,149
4,full,1577,3723,832,162,1253,136,44,806,222,11928,672,575,919,475,377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428960,sterkinekor,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428961,tanganda,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428962,utande,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428963,nascoh,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [96]:
# Persist the term vectors without the index.
# NOTE(review): this cell's execution count (96) is lower than that of the
# merge cell that builds term_vectors_df (98), so the file on disk may predate
# the frame shown in the notebook. Re-run this cell after the merge.
term_vectors_df.to_csv('outputs/term_vectors.csv', index=False)

# Results

In [26]:
# Result 1: token lists per (ID, Topic) pair (pandas truncates the display).
cleaned_df

Unnamed: 0,ID,Topic,ReversedDomain,Content
0,362,Business,ac.accent,"[accent, services, uk, based, full, service, c..."
1,362,Regional,ac.accent,"[accent, services, a, full, service, commercia..."
2,383,Regional,ac.acs,"[anderson, county, schools, public, schools, i..."
3,390,Games,ac.adamcadre,"[allows, the, game, to, be, played, on-line, v..."
4,424,World,ac.aikido,"[langnau, aikidogruppe, in, zusammenarbeit, mi..."
...,...,...,...,...
2289392,91033952,Society,zw.org.nascoh,"[national, association, of, societies, for, th..."
2289393,91034085,Computers,zw.org.zispa,"[zw, domain, zimbabwe, nic, for, zw, cctld]"
2289394,91034085,Regional,zw.org.zispa,"[zimbabwe, internet, service, providers, assoc..."
2289395,91034088,Regional,zw.org.zlhr,"[zimbabwe, lawyers, for, human, rights, zlhr, ..."


In [99]:
# Result 2: per-topic occurrence counts for every term (pandas truncates the display).
term_vectors_df

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,accent,19,76,6,0,10,0,0,1,2,137,3,41,5,2,110
1,services,779,30568,8031,97,7371,400,134,1478,1857,156151,3359,878,12404,820,16133
2,uk,1876,5894,1014,164,877,214,123,1274,246,8841,1187,1648,1636,1434,168
3,based,5339,12949,4371,552,1385,245,308,2478,442,14008,1493,1599,4172,2415,149
4,full,1577,3723,832,162,1253,136,44,806,222,11928,672,575,919,475,377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428960,sterkinekor,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428961,tanganda,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428962,utande,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1428963,nascoh,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
