In [1]:
import pandas as pd
import numpy as np
from kmapper import KeplerMapper
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from kmapper.cover import Cover
from sklearn.cluster import DBSCAN



In [2]:
# Load the "big names" feature table; the first CSV column becomes the row index.
# The columns listed below are removed before analysis (judging by their names
# they are text identifiers, a chunk index and tagger/token counters).
columns_to_drop = [
    'text_key',
    'html_name',
    'chunk_index',
    '!UNRECOGNIZED',
    '!UNTAGGED',
    '!BLACKLISTED',
    '<# Word Tokens>',
    '<# Punctuation Tokens>',
    '<# Tokens>',
]
df = pd.read_csv(
    "vep_big_names_of_science_v2_ubiq321_ds.csv",
    index_col=0,
).drop(columns=columns_to_drop)

# Preview the first ten rows.
df.head(10)

Unnamed: 0_level_0,AbstractConcepts,Acknowledge,Anger,Apology,Aside,Attack_Citation,Authoritative_Citation,Autobio,Biographical_Time,Cause,...,SubjectivePercept,SubjectiveTime,Substitution,Support,TimeDate,TimeDuration,TimeShift,Transformation,Uncertainty,Updates
text_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00429.headed.txt,5.880564,0.001389,0.002779,0.000463,0.460985,0.0,0.001698,0.007565,0.042918,0.076728,...,1.213905,0.035199,0.009109,0.008028,0.001235,0.164571,0.056041,0.246857,0.360173,0.098187
A01014.headed.txt,4.1383,0.028133,0.109717,0.00422,1.09717,0.0,0.011253,0.073145,0.12941,0.315085,...,2.599449,0.123783,0.022506,0.087211,0.0,0.392449,0.191301,0.519046,0.445901,0.351657
A01089.headed.txt,7.444412,0.0,0.0,0.0,0.372221,0.0,0.0,0.019591,0.019591,0.225291,...,0.607307,0.107748,0.0,0.039181,0.039181,0.959937,0.274268,0.391811,0.215496,0.176315
A01185.headed.txt,3.868852,0.025501,0.211293,0.0,0.834244,0.0,0.010929,0.018215,0.378871,0.167577,...,2.105647,0.56102,0.010929,0.029144,0.0,0.586521,0.327869,0.699454,0.947177,0.349727
A01410.headed.txt,3.915228,0.004244,0.098454,0.0,1.133914,0.0,0.001697,0.057714,0.201151,0.310638,...,1.236611,0.249529,0.005941,0.190117,0.0,0.807999,0.263109,0.528764,1.078746,0.328462
A01446.headed.txt,3.611563,0.005006,0.057565,0.0,0.230259,0.0,0.0,0.055062,0.322863,0.182706,...,2.490302,0.412965,0.052559,0.107621,0.015017,2.139907,0.165186,1.278939,0.49806,0.185208
A01454.headed.txt,3.014281,0.003195,0.052714,0.0,1.017539,0.0,0.0,0.035143,0.273154,0.14696,...,2.194818,0.41692,0.025558,0.067091,0.015974,1.448836,0.289128,0.771541,0.988786,0.309894
A01516.headed.txt,4.838999,0.011602,0.068556,0.0,1.097951,0.0,0.008438,0.036915,0.244692,0.229926,...,1.826754,0.255239,0.011602,0.032696,0.004219,0.491494,0.28688,0.491494,0.786812,0.309029
A01552.headed.txt,3.717834,0.012528,0.069471,0.0,0.91565,0.000569,0.001139,0.039291,0.146345,0.607586,...,1.465153,0.273328,0.018222,0.055805,0.013666,0.83479,0.404298,0.857567,0.874081,0.284717
A01586.headed.txt,9.058267,0.0,0.016321,0.0,0.391709,0.0,0.0,0.0,0.0,0.179533,...,0.767096,0.40803,0.0,0.032642,0.261139,1.077199,0.261139,0.440672,0.603884,0.326424


In [3]:
# Standardise the numeric features. StandardScaler centres each column on
# mean 0 and rescales it to unit variance (it does not map values into [0, 1]).
X = df.select_dtypes(include="number")
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [4]:
# The scaler returns a bare NumPy array; wrap it in a DataFrame again so the
# row names and feature names stay attached to the standardised values.
#
# The labels are taken from X (the numeric subset that was actually scaled),
# not from df: if df ever kept a non-numeric column, df.columns would have
# more entries than the scaled array has columns and construction would fail.
X_scaled = pd.DataFrame(
    X_scaled,
    index=X.index,
    columns=X.columns,
)

# Quick visual check of the first rows.
X_scaled.head(3)


Unnamed: 0_level_0,AbstractConcepts,Acknowledge,Anger,Apology,Aside,Attack_Citation,Authoritative_Citation,Autobio,Biographical_Time,Cause,...,SubjectivePercept,SubjectiveTime,Substitution,Support,TimeDate,TimeDuration,TimeShift,Transformation,Uncertainty,Updates
text_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00429.headed.txt,0.374638,-0.656612,-1.13028,-0.213132,-0.799031,-0.377559,-0.583792,-0.661981,-0.944424,-1.195027,...,-0.515698,-2.05738,-0.827043,-0.719027,-0.29458,-1.191723,-1.842535,-1.158829,-1.201787,-1.702813
A01014.headed.txt,-0.702615,0.684729,0.454034,0.500186,1.579108,-0.377559,0.151457,-0.248211,-0.253559,0.348138,...,2.097888,-1.177041,-0.305092,0.245885,-0.301549,-0.839915,-0.909765,-0.384046,-0.956273,0.181335
A01089.headed.txt,1.341576,-0.726302,-1.171449,-0.301072,-1.130842,-0.377559,-0.71447,-0.586105,-1.130757,-0.2332,...,-1.65994,-1.336397,-1.181901,-0.339398,-0.080469,0.036198,-0.337622,-0.746218,-1.616124,-1.122054


In [5]:
# Fit a two-component PCA on the standardised features and report how much of
# the total variance each component (and both together) accounts for.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print(
    "Variance explained by each principle comp:",
    pca.explained_variance_ratio_,
)
print(
    "Cumulative variance:",
    np.cumsum(pca.explained_variance_ratio_),
)

Variance explained by each principle comp: [0.15526954 0.06648878]
Cumulative variance: [0.15526954 0.22175832]


In [6]:
# Dimensionality reduction: the Mapper lens ("ruler") is the projection of the
# data onto its two leading principal components. That projection (X_pca) was
# already computed by the PCA fit in the preceding cell, so it is reused here
# instead of fitting an identical PCA a second time.
lens = X_pca

In [7]:
# Create the KeplerMapper object that builds and renders the graph
# (verbosity level 1).
mapper = KeplerMapper(
    verbose=1,
)

KeplerMapper(verbose=1)


In [8]:
# Cover of the lens space: 15 intervals per lens dimension with 10% overlap
# between neighbouring intervals (15 x 15 = 225 hypercubes on a 2-D lens).
cover = Cover(
    n_cubes=15,
    perc_overlap=0.1,
)

In [9]:
# Build the Mapper graph: the lens values are binned by the cover, and the
# standardised rows that fall into each bin are clustered with DBSCAN.
graph = mapper.map(
    lens,
    X_scaled,
    cover=cover,
    clusterer=DBSCAN(min_samples=3, eps=10),
)

Mapping on data shaped (329, 115) using lens shaped (329, 2)

Creating 225 hypercubes.

Created 35 edges and 37 nodes in 0:00:00.122050.


In [18]:
# One tooltip label per data row: the row's index value as a string.
tooltips = df.index.astype(str).to_numpy()

# visualize() writes the interactive page to disk and also returns the complete
# HTML source as a string. The trailing semicolon stops the notebook from
# echoing that (very large) string as the cell output.
mapper.visualize(
    graph,
    path_html="BigNamesNetwork.html",
    title="Big Names Network!",
    custom_tooltips=tooltips,
    include_searchbar=True,
);

Wrote visualization to: BigNamesNetwork.html


'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta name="generator" content="KeplerMapper">\n  <title>Big Names Network! | KeplerMapper</title>\n\n  <link rel="icon" type="image/png" href="http://i.imgur.com/axOG6GJ.jpg" />\n\n  <link href=\'https://fonts.googleapis.com/css?family=Roboto+Mono:700,300\' rel=\'stylesheet\' type=\'text/css\'>\n  <style>* {\n  margin: 0;\n  padding: 0;\n}\n\nhtml, body {\n  height: 100%;\n}\n\nbody {\n  font-family: "Roboto Mono", "Helvetica", sans-serif;\n  font-size: 14px;\n}\n\n#logo {\n  width:  85px;\n  height: 85px;\n}\n\n#display {\n  color: #95A5A6;\n  background: #212121;\n}\n\n#header {\n  background: #111111;\n}\n\n#print {\n  color: #000;\n  background: #FFF;\n}\n\nh1 {\n  font-size: 21px;\n  font-weight: 300;\n  font-weight: 300;\n}\n\nh2 {\n  font-size: 18px;\n  padding-bottom: 20px;\n  font-weight: 300;\n}\n\nh3 {\n  font-size: 14px;\n  font-weight: 700;\n  text-transform: uppercase;\n}\n\nh4 {\n  font-size: 13px;\n  font-

In [None]:
# Average 'Date' of the texts in each Mapper node, keyed by node id.
avgs = {}

meta_df = pd.read_csv("Metadata-BigNames.csv")

# Build the text_name -> Date lookup once instead of scanning the whole
# metadata frame for every node member. When a text_name occurs more than
# once, the first row wins (same choice as taking the first matching row).
date_by_text = meta_df.drop_duplicates("text_name").set_index("text_name")["Date"]

# graph['nodes'] maps node id -> list of integer row positions; tooltips turns
# those positions into text names. A name missing from the metadata raises
# KeyError here.
for key, members in graph['nodes'].items():
    member_dates = date_by_text.loc[tooltips[members]]
    # skipna=False: a missing date makes the node average NaN rather than
    # being silently ignored.
    avgs[key] = member_dates.sum(skipna=False) / len(members)

{'cube5_cluster0': np.float64(1658.8),
 'cube11_cluster0': np.float64(1669.25),
 'cube19_cluster0': np.float64(1643.6666666666667),
 'cube20_cluster0': np.float64(1589.0),
 'cube21_cluster0': np.float64(1650.0),
 'cube30_cluster0': np.float64(1665.25),
 'cube37_cluster0': np.float64(1670.0),
 'cube47_cluster0': np.float64(1663.6666666666667),
 'cube48_cluster0': np.float64(1638.0),
 'cube49_cluster0': np.float64(1657.1666666666667),
 'cube50_cluster0': np.float64(1664.75),
 'cube54_cluster0': np.float64(1673.0),
 'cube55_cluster0': np.float64(1673.5),
 'cube57_cluster0': np.float64(1672.4),
 'cube58_cluster0': np.float64(1656.6),
 'cube59_cluster0': np.float64(1661.2),
 'cube60_cluster0': np.float64(1670.5),
 'cube61_cluster0': np.float64(1671.5),
 'cube66_cluster0': np.float64(1651.2),
 'cube67_cluster0': np.float64(1677.0),
 'cube68_cluster0': np.float64(1658.0),
 'cube69_cluster0': np.float64(1658.75),
 'cube70_cluster0': np.float64(1653.6666666666667),
 'cube71_cluster0': np.float6

In [27]:
import json

# Translate each node's member list from integer row positions to text names.
named_nodes = {
    node_id: [tooltips[i] for i in members]
    for node_id, members in graph['nodes'].items()
}

# Export a shallow copy with the named member lists swapped in. The original
# graph is left untouched: overwriting graph['nodes'] in place would make this
# cell (and any cell that indexes tooltips with the node members) fail when
# re-run, because the members would then be names instead of positions.
# Replacing the key inside the dict literal keeps the original key order.
graph_named = {**graph, 'nodes': named_nodes}

with open("BigNamesGraph.json", "w") as file:
    json.dump(graph_named, file)

# Per-node average dates.
with open("BigNamesYears.json", "w") as file:
    json.dump(avgs, file)

In [28]:
# Human-readable label for every text: "<Date>-<author text before the first
# comma>", or "<Date>-Unknown" when the author is not a string.
full_names = {}

for text_name in tooltips:
    # First metadata row for this text; raises IndexError if there is none.
    row = meta_df[meta_df['text_name'] == text_name].iloc[0]
    author = row['Author']
    # An explicit type check replaces the former bare `except:`, which also
    # swallowed unrelated errors (including KeyboardInterrupt). A missing
    # author is read by pandas as NaN, a float with no .split().
    if isinstance(author, str):
        author_label = author.split(',')[0]
    else:
        author_label = "Unknown"
    full_names[text_name] = f"{row['Date']}-{author_label}"

with open("FullNames.json", "w") as file:
    json.dump(full_names, file)