In [1]:
import pandas as pd
import numpy as np
import re

import sqlite3

# Clean matched dataframe (vertices)

In [2]:
# Load the matched pages table from CSV and display it.
matched_df = pd.read_csv("outputs/matched_df.csv")
matched_df

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383
3,ac.acs,http://www.acs.ac/,Anderson County Schools,"(Clinton) Information, departments, and rules ...",Regional,383
4,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390
...,...,...,...,...,...,...
3265652,zw.org.nascoh,http://www.nascoh.org.zw/,National Association of Societies for the Care...,The umbrella body for organisations of and for...,Society,91033952
3265653,zw.org.zispa,http://www.zispa.org.zw/,ZW Domain - Zimbabwe,NIC for .zw CCTLD.,Computers,91034085
3265654,zw.org.zispa,http://www.zispa.org.zw/,Zimbabwe Internet Service Providers Association,"A non-profit organisation which controls, allo...",Regional,91034085
3265655,zw.org.zlhr,http://www.zlhr.org.zw/,Zimbabwe Lawyers for Human Rights (ZLHR),Non-profit organisation focusing on promoting ...,Regional,91034088


In [3]:
# Swap every missing value for an empty string so the Title/Description
# concatenation further down cannot produce NaN.
matched_df = matched_df.replace(to_replace=np.nan, value="")

In [4]:
# Remove rows that are identical across every column (first occurrence kept),
# then show how many rows are left.
matched_df = matched_df.drop_duplicates(keep="first")
len(matched_df)

3234591

In [5]:
# Combine title and description into one free-text field, separated by a space.
title_text = matched_df["Title"]
description_text = matched_df["Description"]
matched_df["AllContent"] = title_text + " " + description_text

In [6]:
# Derive a normalised, lower-cased "CleanedContent" column from "AllContent".

# Patterns used by the cleaning pipeline (applied in the order of `cleaning_steps`).
double_dash_regex = re.compile(r"--+")  # Runs of two or more dashes
invalid_char_regex = re.compile(r"[^\w\s-]+")  # Anything that is not a word character, whitespace or a dash
underscore_regex = re.compile(r"_")  # Underscores (they count as \w, so the previous pattern keeps them)
numbers_regex = re.compile(r"\b[\w-]*\d[\w-]*\b")  # Whole tokens containing at least one digit
leading_trailing_dash_regex = re.compile(r"(?<!\w)-+|-+(?!\w)")  # Dashes with no word character before / after them

# (pattern, replacement) pairs. Note that \w is Unicode-aware for str patterns,
# so non-ASCII letters are NOT stripped by this pipeline.
cleaning_steps = [
    (double_dash_regex, " "),            # dash runs become a space
    (invalid_char_regex, " "),           # punctuation and symbols become a space
    (underscore_regex, " "),             # underscores become a space
    (numbers_regex, ""),                 # tokens with digits are removed
    (leading_trailing_dash_regex, ""),   # dangling dashes at word edges are removed
]

cleaned_content = matched_df["AllContent"]
for pattern, replacement in cleaning_steps:
    cleaned_content = cleaned_content.str.replace(pattern, replacement, regex=True)

matched_df["CleanedContent"] = cleaned_content.str.lower()

matched_df.head(20)

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID,AllContent,CleanedContent
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362,Accent Services UK based full service commerci...,accent services uk based full service commerci...
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362,Accent Services A full service commercial and ...,accent services a full service commercial and ...
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383,Anderson County Schools K-12 public schools in...,anderson county schools public schools in the...
3,ac.acs,http://www.acs.ac/,Anderson County Schools,"(Clinton) Information, departments, and rules ...",Regional,383,"Anderson County Schools (Clinton) Information,...",anderson county schools clinton information ...
4,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390,9:05 Allows the game to be played on-line via ...,allows the game to be played on-line via a j...
5,ac.adamcadre,http://adamcadre.ac/,Adam Cadre,"Author's site with interactive fiction, inform...",Games,390,Adam Cadre Author's site with interactive fict...,adam cadre author s site with interactive fict...
6,ac.adamcadre,http://adamcadre.ac/gull/gull.html,Gull,A package designed to enable people to familia...,Games,390,Gull A package designed to enable people to fa...,gull a package designed to enable people to fa...
7,ac.adamcadre,http://www.adamcadre.ac/inform.html,"Inform Library Patch Site, The",Catalog of bugs and patches for the Inform lib...,Games,390,"Inform Library Patch Site, The Catalog of bugs...",inform library patch site the catalog of bugs...
8,ac.adamcadre,http://adamcadre.ac/if.html#I-0,AdamCadre.ac: Games,Author's website. Download I-0 or play it onli...,Games,390,AdamCadre.ac: Games Author's website. Download...,adamcadre ac games author s website download...
9,ac.adamcadre,http://adamcadre.ac/if.html#Photopia,AdamCadre.ac: Games,Author's website. Download Photopia in Z-machi...,Games,390,AdamCadre.ac: Games Author's website. Download...,adamcadre ac games author s website download...


In [7]:
len(matched_df)

3234591

In [21]:
def _ascii_words_only(text):
    """Return `text` with every word containing a non-ASCII character removed.

    Words are split on whitespace and re-joined with single spaces.
    """
    return " ".join(word for word in text.split() if word.isascii())


# Strip non-ASCII words from the cleaned text
matched_df["CleanedContent"] = matched_df["CleanedContent"].map(_ascii_words_only)

# Discard rows whose cleaned text ended up empty
has_content = matched_df["CleanedContent"].str.len() > 0
matched_df = matched_df[has_content].reset_index(drop=True)

matched_df.head(20)

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID,AllContent,CleanedContent
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362,Accent Services UK based full service commerci...,accent services uk based full service commerci...
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362,Accent Services A full service commercial and ...,accent services a full service commercial and ...
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383,Anderson County Schools K-12 public schools in...,anderson county schools public schools in the ...
3,ac.adamcadre,http://adamcadre.ac/905.html,9:05,Allows the game to be played on-line via a Jav...,Games,390,9:05 Allows the game to be played on-line via ...,allows the game to be played on-line via a jav...
4,ac.adamcadre,http://adamcadre.ac/,Adam Cadre,"Author's site with interactive fiction, inform...",Games,390,Adam Cadre Author's site with interactive fict...,adam cadre author s site with interactive fict...
5,ac.adamcadre,http://adamcadre.ac/gull/gull.html,Gull,A package designed to enable people to familia...,Games,390,Gull A package designed to enable people to fa...,gull a package designed to enable people to fa...
6,ac.adamcadre,http://www.adamcadre.ac/inform.html,"Inform Library Patch Site, The",Catalog of bugs and patches for the Inform lib...,Games,390,"Inform Library Patch Site, The Catalog of bugs...",inform library patch site the catalog of bugs ...
7,ac.adamcadre,http://adamcadre.ac/if.html#I-0,AdamCadre.ac: Games,Author's website. Download I-0 or play it onli...,Games,390,AdamCadre.ac: Games Author's website. Download...,adamcadre ac games author s website download o...
8,ac.adamcadre,http://adamcadre.ac/if.html#Photopia,AdamCadre.ac: Games,Author's website. Download Photopia in Z-machi...,Games,390,AdamCadre.ac: Games Author's website. Download...,adamcadre ac games author s website download p...
9,ac.adamcadre,http://adamcadre.ac/if.html#Varicella,AdamCadre.ac: Games,Author's website. Download free in Z-machine o...,Games,390,AdamCadre.ac: Games Author's website. Download...,adamcadre ac games author s website download f...


In [9]:
len(matched_df)

2995341

In [22]:
# Collapse rows that agree on URL, Title, Topic and ID (they differ in their
# description), keeping the first one encountered.
dedup_keys = ['URL', 'Title', 'Topic', 'ID']
matched_df = matched_df.drop_duplicates(subset=dedup_keys, keep="first")
len(matched_df)

2920868

In [32]:
# If a ReversedDomain appears more than once and its rows differ in anything
# other than topic or description, drop every row of that domain
# (i.e. a domain survives only when its rows share one ID, one Title and one URL).
# We do this to prevent pages with little relevance from being prioritised
# merely because they belong to a domain with a high number of inbound links.

# The identity columns are listed explicitly rather than derived from
# matched_df.columns: any incidental extra column (e.g. a per-row index column
# picked up from a CSV round-trip) would otherwise be included in the check and
# cause every multi-row domain to be removed.
columns_to_check = pd.Index(['ID', 'Title', 'URL'])

# For each domain: the largest number of distinct values seen in any identity column.
non_topic_variation = matched_df.groupby('ReversedDomain')[columns_to_check].nunique().max(axis=1)

# More than one distinct value in any identity column marks the domain for removal.
domains_to_remove = non_topic_variation[non_topic_variation > 1].index
domains_to_remove

ReversedDomain
ac.accent        1
ac.acs           1
ac.adamcadre     7
ac.aikido        1
ac.alastairc     2
                ..
zw.org.csz       1
zw.org.lind      2
zw.org.nascoh    1
zw.org.zispa     2
zw.org.zlhr      1
Length: 2022708, dtype: int64

In [35]:
# Keep only the rows whose domain was not flagged for removal.
keep_mask = ~matched_df['ReversedDomain'].isin(domains_to_remove)
matched_df = matched_df.loc[keep_mask].reset_index(drop=True)
matched_df

Unnamed: 0,ReversedDomain,URL,Title,Description,Topic,ID,AllContent,CleanedContent
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,Business,362,Accent Services UK based full service commerci...,accent services uk based full service commerci...
1,ac.accent,http://www.accent.ac/,Accent Services,A full service commercial and industrial HVAC ...,Regional,362,Accent Services A full service commercial and ...,accent services a full service commercial and ...
2,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,Regional,383,Anderson County Schools K-12 public schools in...,anderson county schools public schools in the ...
10,ac.aikido,http://aikido.ac/langnau/index.php,Langnau - Aikidogruppe,Einführungskurse in Zusammenarbeit mit der Vol...,World,424,Langnau - Aikidogruppe Einführungskurse in Zus...,langnau aikidogruppe in zusammenarbeit mit der...
13,ac.apec,http://www.apec.ac/,APEC,Birmingham practice provides information on co...,Business,536,APEC Birmingham practice provides information ...,apec birmingham practice provides information ...
...,...,...,...,...,...,...,...,...
2920857,zw.gov.parlzim,http://www.parlzim.gov.zw/,Zimbabwe Parliament,"Provides current Bills, order paper, Hansard t...",Regional,91033765,"Zimbabwe Parliament Provides current Bills, or...",zimbabwe parliament provides current bills ord...
2920860,zw.org.csz,http://www.csz.org.zw/,Computer Society of Zimbabwe,Aims to encourage research and development in ...,Regional,91033860,Computer Society of Zimbabwe Aims to encourage...,computer society of zimbabwe aims to encourage...
2920863,zw.org.nascoh,http://www.nascoh.org.zw/,National Association of Societies for the Care...,The umbrella body for organisations of and for...,Society,91033952,National Association of Societies for the Care...,national association of societies for the care...
2920866,zw.org.zlhr,http://www.zlhr.org.zw/,Zimbabwe Lawyers for Human Rights (ZLHR),Non-profit organisation focusing on promoting ...,Regional,91034088,Zimbabwe Lawyers for Human Rights (ZLHR) Non-p...,zimbabwe lawyers for human rights zlhr non-pro...


# Process and filter edges

In [110]:
# One row per distinct page ID still present in matched_df; these are the
# node IDs that edges are allowed to reference.
valid_nodes = (
    matched_df[["ID"]]
    .rename(columns={"ID": "node_id"})
    .drop_duplicates()
    .reset_index(drop=True)
)
valid_nodes

Unnamed: 0,node_id
0,362
1,383
2,424
3,536
4,548
...,...
1808224,91033670
1808225,91033765
1808226,91033860
1808227,91033952


## Save edges to database

In [111]:
# Path of the raw edge list; it is read below as a headerless, tab-separated
# file with two columns (from_node, to_node).
edgesdomainpath = "webdata/commoncrawl/cc-main-2017-may-jun-jul/edges.txt"

In [None]:
# Open the SQLite database that will hold the edge tables and grab a cursor
# that the following cells use for all SQL statements.
conn = sqlite3.connect("outputs/edges.db")
cursor = conn.cursor()

In [None]:
# Define the raw edge table: one row per directed link between two node IDs.
# Note: this statement raises if the table already exists.
create_edges_sql = """
    CREATE TABLE edges (
        from_node INTEGER,
        to_node INTEGER
    )
"""
cursor.execute(create_edges_sql)
conn.commit()

In [None]:
# Stream the edge list into SQLite a chunk at a time so the whole file never
# has to fit in memory.
chunk_size = 10**6  # Rows per chunk; tune to the available memory
edge_columns = ["from_node", "to_node"]
with pd.read_csv(
    edgesdomainpath,
    sep="\t",
    header=None,
    names=edge_columns,
    chunksize=chunk_size,
) as reader:
    for edge_chunk in reader:
        # Append each chunk to the existing `edges` table.
        edge_chunk.to_sql("edges", conn, if_exists="append", index=False)

In [None]:
# Write the valid node IDs to the database (replacing any existing table of
# that name) so edges can be filtered with SQL joins.
valid_nodes.to_sql("valid_nodes", conn, if_exists="replace", index=False)

In [None]:
# Index both endpoint columns of the edges table.
edge_index_statements = (
    "CREATE INDEX idx_edges_from_node ON edges (from_node);",
    "CREATE INDEX idx_edges_to_node ON edges (to_node);",
)
for statement in edge_index_statements:
    cursor.execute(statement)
conn.commit()

In [None]:
# Index the node_id column of valid_nodes; the filtering joins below match on it.
valid_nodes_index_sql = "CREATE INDEX idx_valid_nodes_node_id ON valid_nodes (node_id);"
cursor.execute(valid_nodes_index_sql)
conn.commit()

## Create filtered database

### Remove nodes out of range

In [None]:
# (Re-)open the edges database and create a fresh cursor, presumably so this
# section can be run without re-executing the loading cells above.
# NOTE(review): when the notebook is run top to bottom, the connection opened
# earlier is rebound here without being closed first.
conn = sqlite3.connect("outputs/edges.db")
cursor = conn.cursor()

In [None]:
# Step 1: look up the smallest and largest node IDs present in valid_nodes
min_valid_id, max_valid_id = cursor.execute(
    "SELECT MIN(node_id), MAX(node_id) FROM valid_nodes;"
).fetchone()
print(f"Valid node ID range: {min_valid_id} - {max_valid_id}")

# Step 2: cheap pre-filter - any edge with an endpoint outside that range
# cannot connect two valid nodes, so delete it before the join-based filtering.
id_bounds = (min_valid_id, max_valid_id)

delete_bad_from_sql = """
DELETE FROM edges
WHERE from_node < ? OR from_node > ?;
"""
cursor.execute(delete_bad_from_sql, id_bounds)

delete_bad_to_sql = """
DELETE FROM edges
WHERE to_node < ? OR to_node > ?;
"""
cursor.execute(delete_bad_to_sql, id_bounds)

conn.commit()

In [None]:
# Count the edges that remain after the range-based deletion
(total_rows,) = cursor.execute("SELECT COUNT(*) FROM edges;").fetchone()
print(total_rows)

# Value recorded from a previous run: 1071170354

### Filter based on the from_nodes (do in chunks due to size)

In [None]:
# First chunk: copy the first `chunk_size` edges into a scratch table.
# ORDER BY rowid makes the LIMIT/OFFSET window deterministic; without an
# ORDER BY, SQLite leaves the row order undefined, so two separate
# LIMIT/OFFSET statements are not guaranteed to partition the table.
# Ordering by rowid follows the table's native scan order, so it adds no sort.
chunk_size = 5*10**8
offset = 0
query = f"""
    CREATE TABLE temp_edges AS
    SELECT *
    FROM edges
    ORDER BY rowid
    LIMIT {chunk_size} OFFSET {offset};
"""
cursor.execute(query)
conn.commit()
print(f"Processed rows: {min(offset + chunk_size, total_rows)}")

In [None]:
# Keep only the chunk-1 edges whose source node is a valid node.
query = """
    CREATE TABLE filtered_from_p1 AS
    SELECT temp_edges.*
    FROM temp_edges
    INNER JOIN valid_nodes 
    ON temp_edges.from_node = valid_nodes.node_id;
"""

cursor.execute(query)
conn.commit()

# Progress report: number of raw edges covered so far (capped at the table size).
rows_covered = min(offset + chunk_size, total_rows)
print(f"Processed rows: {rows_covered}")

In [None]:
# Number of chunk-1 edges that survived the from_node filter
(part1_from_rows,) = cursor.execute("SELECT COUNT(*) FROM filtered_from_p1;").fetchone()
print(part1_from_rows)

In [None]:
# Spot-check: print up to 20 rows from deep inside filtered_from_p1.
# The rows go into their own variable so the integer row count held in
# part1_from_rows is not overwritten by a list of tuples.
# NOTE(review): the offset 81509909 is hard-coded; presumably chosen to land
# near the end of the table on the original run - confirm before reuse.
cursor.execute("SELECT * FROM filtered_from_p1 LIMIT 20 OFFSET 81509909;")
part1_sample_rows = cursor.fetchall()
for row in part1_sample_rows:
    print(row)

In [None]:
# Discard the scratch table so the next chunk can recreate it under the same name.
cursor.execute("DROP TABLE IF EXISTS temp_edges;")
conn.commit()

In [None]:
# Second chunk: copy every edge after the first `chunk_size` rows into the
# scratch table. ORDER BY rowid makes the OFFSET window deterministic; without
# an ORDER BY, SQLite leaves the row order undefined, so the split could
# overlap or skip rows relative to the first chunk.
query = f"""
    CREATE TABLE temp_edges AS
    SELECT *
    FROM edges
    ORDER BY rowid
    LIMIT {total_rows - chunk_size} OFFSET {chunk_size};
"""
cursor.execute(query)
conn.commit()
# After this chunk the whole table has been covered, so report the total
# (the previous message reused the first chunk's figure).
print(f"Processed rows: {total_rows}")

In [None]:
# Keep only the chunk-2 edges whose source node is a valid node.
query = """
    CREATE TABLE filtered_from_p2 AS
    SELECT temp_edges.*
    FROM temp_edges
    INNER JOIN valid_nodes 
    ON temp_edges.from_node = valid_nodes.node_id;
"""

cursor.execute(query)
conn.commit()

# Progress report: size of the second chunk (capped at the table size).
rows_in_second_chunk = min(total_rows - chunk_size, total_rows)
print(f"Processed rows: {rows_in_second_chunk}")

In [None]:
# Number of chunk-2 edges that survived the from_node filter
(part2_from_rows,) = cursor.execute("SELECT COUNT(*) FROM filtered_from_p2;").fetchone()
print(part2_from_rows)

In [None]:
# Spot-check: print the first 10 rows of filtered_from_p2.
# The rows go into their own variable so the integer row count held in
# part2_from_rows is not overwritten by a list of tuples.
cursor.execute("SELECT * FROM filtered_from_p2 LIMIT 10;")
part2_sample_rows = cursor.fetchall()
for row in part2_sample_rows:
    print(row)

In [None]:
# Stitch the two per-chunk results back into a single table. UNION ALL keeps
# every row from both parts (no de-duplication).
merge_parts_sql = """
    CREATE TABLE merged_filtered_from AS
    SELECT * FROM filtered_from_p1
    UNION ALL
    SELECT * FROM filtered_from_p2;
"""
query = merge_parts_sql
cursor.execute(query)
conn.commit()

In [None]:
# Total number of edges whose source node is valid
(merged_from_rows,) = cursor.execute("SELECT COUNT(*) FROM merged_filtered_from;").fetchone()
print(merged_from_rows)

### Now filter based on the to_nodes (can do all at once due to reduced size)

In [None]:
# Second pass: of the edges with a valid source, keep those whose target node
# is valid as well.
query = """
    CREATE TABLE filtered_edges AS
    SELECT merged_filtered_from.*
    FROM merged_filtered_from
    INNER JOIN valid_nodes 
    ON merged_filtered_from.to_node = valid_nodes.node_id;
"""

cursor.execute(query)
conn.commit()

In [None]:
# Number of edges with both endpoints valid
(filtered_rows,) = cursor.execute("SELECT COUNT(*) FROM filtered_edges;").fetchone()
print(filtered_rows)

## Save filtered_edges table

In [None]:
# Pull the fully filtered edge table out of SQLite into a DataFrame.
filtered_edges_df = pd.read_sql_query("SELECT * FROM filtered_edges", conn)

In [None]:
filtered_edges_df

## Prune pages whose number of inbound edges is an outlier, to prevent recurring dominant pages
We do this in order to prevent pages with little relevance from being prioritized due to them belonging to a
domain with a high number of inbound links.

In [None]:
# In-degree per node: how many edges point at each to_node.
indegree_counts = (
    filtered_edges_df
    .groupby('to_node')
    .size()
    .rename('in_degree')
    .reset_index()
)
indegree_counts

In [None]:
# Outlier cut-off: mean in-degree plus `num_deviations` standard deviations.
num_deviations = 2

in_degree = indegree_counts['in_degree']
upper_threshold = in_degree.mean() + num_deviations * in_degree.std()
upper_threshold

In [None]:
# Nodes whose in-degree exceeds the cut-off, displayed from most- to least-linked.
is_outlier = indegree_counts['in_degree'] > upper_threshold
outlier_vertices = indegree_counts[is_outlier]
outlier_vertices.sort_values(by="in_degree", ascending=False)

## Adjust matched_df and filtered_edges_df to not include the outlier vertices

In [None]:
# Remove every page whose ID is one of the outlier nodes.
is_outlier_page = matched_df['ID'].isin(outlier_vertices["to_node"])
matched_df = matched_df[~is_outlier_page].reset_index(drop=True)
matched_df

In [None]:
# Remove every edge that starts or ends at an outlier node.
outlier_ids = outlier_vertices["to_node"]
touches_outlier = (
    filtered_edges_df['from_node'].isin(outlier_ids)
    | filtered_edges_df['to_node'].isin(outlier_ids)
)
filtered_edges_df = filtered_edges_df[~touches_outlier].reset_index(drop=True)
filtered_edges_df

In [None]:
# Persist the pruned edge list (without the DataFrame index).
filtered_edges_df.to_csv("outputs/filtered_edges_df.csv", index=False)

# Produce cleaned version of the matched dataframe

In [53]:
# Build a "phonebook" with one row per domain: the first row seen for each
# ReversedDomain is the one that is kept.
lookup_columns = ["ReversedDomain", "URL", "Title", "Description", "ID"]
pagelookup_df = (
    matched_df[lookup_columns]
    .drop_duplicates(subset=['ReversedDomain'], keep='first')
    .reset_index(drop=True)
)
pagelookup_df

Unnamed: 0,ReversedDomain,URL,Title,Description,ID
0,ac.accent,http://www.accent.ac/,Accent Services,UK based full service commercial / industrial ...,362
1,ac.acs,http://www.acs.ac/,Anderson County Schools,K-12 public schools in the county (not includi...,383
2,ac.aikido,http://aikido.ac/langnau/index.php,Langnau - Aikidogruppe,Einführungskurse in Zusammenarbeit mit der Vol...,424
3,ac.apec,http://www.apec.ac/,APEC,Birmingham practice provides information on co...,536
4,ac.apt,http://www.apt.ac/,The Association for Psychological Therapies,An independent organization offering accredite...,548
...,...,...,...,...,...
1808224,zw.co.zol,http://www.zol.co.zw/,Zimbabwe Online,Dial-up ISP.,91033670
1808225,zw.gov.parlzim,http://www.parlzim.gov.zw/,Zimbabwe Parliament,"Provides current Bills, order paper, Hansard t...",91033765
1808226,zw.org.csz,http://www.csz.org.zw/,Computer Society of Zimbabwe,Aims to encourage research and development in ...,91033860
1808227,zw.org.nascoh,http://www.nascoh.org.zw/,National Association of Societies for the Care...,The umbrella body for organisations of and for...,91033952


In [54]:
# Persist the per-domain lookup table (without the DataFrame index).
pagelookup_df.to_csv('outputs/pagelookup_df.csv', index=False)

In [75]:
# Slim the table down to the columns used for the term vectors and expose the
# cleaned text under the shorter name "Content".
cleaned_df = (
    matched_df[["ReversedDomain", "CleanedContent", "Topic", "ID"]]
    .rename(columns={"CleanedContent": "Content"})
)
cleaned_df.head(20)

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,accent services uk based full service commerci...,Business,362
1,ac.accent,accent services a full service commercial and ...,Regional,362
2,ac.acs,anderson county schools public schools in the ...,Regional,383
3,ac.aikido,langnau aikidogruppe in zusammenarbeit mit der...,World,424
4,ac.apec,apec birmingham practice provides information ...,Business,536
5,ac.apt,the association for psychological therapies an...,Health,548
6,ac.arctic,arctic herstellerseite mit vorstellung seiner ...,World,567
7,ac.badminton,badminton academy,World,650
8,ac.baseball-tv,npb,World,676
9,ac.britneyspears,britney s guide to semiconductor physics semic...,Science,792


In [92]:
len(cleaned_df)

1888981

In [59]:
# Persist the cleaned table (without the DataFrame index).
cleaned_df.to_csv('outputs/cleaned_matched_df.csv', index=False)

In [94]:
# cleaned_df = pd.read_csv("outputs/cleaned_matched_df.csv")
# cleaned_df

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,accent services uk based full service commerci...,Business,362
1,ac.accent,accent services a full service commercial and ...,Regional,362
2,ac.acs,anderson county schools public schools in the ...,Regional,383
3,ac.aikido,langnau aikidogruppe in zusammenarbeit mit der...,World,424
4,ac.apec,apec birmingham practice provides information ...,Business,536
...,...,...,...,...
1887205,zw.gov.parlzim,zimbabwe parliament provides current bills ord...,Regional,91033765
1887206,zw.org.csz,computer society of zimbabwe aims to encourage...,Regional,91033860
1887207,zw.org.nascoh,national association of societies for the care...,Society,91033952
1887208,zw.org.zlhr,zimbabwe lawyers for human rights zlhr non-pro...,Regional,91034088


## Create list of all terms

In [96]:
# Every distinct whitespace-separated token across all Content strings,
# in order of first appearance.
all_tokens = cleaned_df["Content"].str.split().explode()
terms = all_tokens.drop_duplicates()
terms

0                         accent
0                       services
0                             uk
0                          based
0                           full
                   ...          
1887188    government-controlled
1887189                 tanganda
1887196                   utande
1887207                   nascoh
1887208                     zlhr
Name: Content, Length: 1198246, dtype: object

In [98]:
len(terms)

1198246

In [99]:
# Wrap the unique terms in a single-column DataFrame (the index still carries
# the source-row labels from the exploded series at this point).
terms_df = pd.DataFrame({'Terms': terms})

In [100]:
# Replace the inherited row labels with a clean 0..n-1 index.
terms_df = terms_df.reset_index(drop=True, inplace=False)
terms_df

Unnamed: 0,Terms
0,accent
1,services
2,uk
3,based
4,full
...,...
1198241,government-controlled
1198242,tanganda
1198243,utande
1198244,nascoh


# Create term-vectors

In [101]:
# One row per (page, token): split each Content string on whitespace and fan
# the resulting lists out into separate rows. cleaned_df itself is not modified.
exploded_df = (
    cleaned_df
    .assign(Content=cleaned_df["Content"].str.split())
    .explode("Content")
)

In [102]:
# Count how often each token occurs within each topic (long format:
# one row per Topic/Content pair).
per_topic_counts = exploded_df.groupby("Topic")["Content"].value_counts()
term_counts = per_topic_counts.reset_index(name="Occurrences")
term_counts

Unnamed: 0,Topic,Content,Occurrences
0,Arts,and,75153
1,Arts,the,28869
2,Arts,of,28636
3,Arts,in,17659
4,Arts,a,14683
...,...,...,...
1645500,World,zztop,1
1645501,World,zzuchthunde,1
1645502,World,zzum,1
1645503,World,zzwei,1


In [103]:
# Reshape to wide format: one row per token, one integer column per topic,
# with 0 where a token never occurs in that topic.
term_counts = (
    term_counts
    .pivot(index="Content", columns="Topic", values="Occurrences")
    .fillna(0)
    .astype(int)
    .reset_index()
)
term_counts

Topic,Content,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,a,14683,22173,8891,2576,6775,2077,591,11862,3461,121440,5853,10493,18492,7469,68376
1,a-a,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0
2,a-a-p,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
3,a-a-reisen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,a-aaa,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198241,zzum,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1198242,zzwei,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1198243,zzz,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3
1198244,zzzing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [104]:
# Rename the token column to "Terms" so it matches the key column of terms_df
# used in the merge below.
term_counts = term_counts.rename(columns={"Content": "Terms"})

In [105]:
# Attach the per-topic counts to the term list. A left join on "Terms" keeps
# every row of terms_df, in terms_df order.
term_vectors_df = pd.merge(
    terms_df,      # all unique terms
    term_counts,   # per-topic occurrence counts
    on="Terms",    # join key shared by both frames
    how="left",    # keep every term from terms_df
)
term_vectors_df

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,accent,18,69,6,0,4,0,0,1,2,113,2,33,3,2,70
1,services,556,24679,5781,80,5844,293,75,1119,784,126500,2527,759,8882,653,8709
2,uk,1352,4713,706,114,484,143,56,922,126,5777,594,1360,784,534,91
3,based,3804,10845,2955,268,1071,153,207,1910,263,10025,950,1384,3021,1780,124
4,full,264,3094,405,42,636,30,12,644,90,9463,218,475,401,371,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198241,government-controlled,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198242,tanganda,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198243,utande,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198244,nascoh,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [106]:
# Persist the term vectors (without the DataFrame index).
term_vectors_df.to_csv('outputs/term_vectors.csv', index=False)

# Results

In [107]:
cleaned_df

Unnamed: 0,ReversedDomain,Content,Topic,ID
0,ac.accent,accent services uk based full service commerci...,Business,362
1,ac.accent,accent services a full service commercial and ...,Regional,362
2,ac.acs,anderson county schools public schools in the ...,Regional,383
3,ac.aikido,langnau aikidogruppe in zusammenarbeit mit der...,World,424
4,ac.apec,apec birmingham practice provides information ...,Business,536
...,...,...,...,...
1887205,zw.gov.parlzim,zimbabwe parliament provides current bills ord...,Regional,91033765
1887206,zw.org.csz,computer society of zimbabwe aims to encourage...,Regional,91033860
1887207,zw.org.nascoh,national association of societies for the care...,Society,91033952
1887208,zw.org.zlhr,zimbabwe lawyers for human rights zlhr non-pro...,Regional,91034088


In [115]:
filtered_edges_df

Unnamed: 0,from_node,to_node
0,362,13833969
1,362,38847411
2,362,88039175
3,362,88492518
4,383,20737647
...,...,...
10710334,91033547,91031677
10710335,91033668,78311432
10710336,91033670,30651887
10710337,91033670,78099047


In [108]:
term_vectors_df

Unnamed: 0,Terms,Arts,Business,Computers,Games,Health,Home,News,Recreation,Reference,Regional,Science,Shopping,Society,Sports,World
0,accent,18,69,6,0,4,0,0,1,2,113,2,33,3,2,70
1,services,556,24679,5781,80,5844,293,75,1119,784,126500,2527,759,8882,653,8709
2,uk,1352,4713,706,114,484,143,56,922,126,5777,594,1360,784,534,91
3,based,3804,10845,2955,268,1071,153,207,1910,263,10025,950,1384,3021,1780,124
4,full,264,3094,405,42,636,30,12,644,90,9463,218,475,401,371,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198241,government-controlled,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198242,tanganda,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198243,utande,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1198244,nascoh,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
