In [1]:
import csv
import plotly.express as px
import pandas as pd
from dash import Dash, html, dcc, callback, Output, Input

In [2]:
df = pd.read_csv('./BigNames/vep_big_names_of_science_v2_ubiq321_ds.csv')
ubiq_categories = [i for i in df.select_dtypes(include='float64').columns]
print(ubiq_categories)
df

['!UNRECOGNIZED', '!UNTAGGED', 'AbstractConcepts', 'Acknowledge', 'Anger', 'Apology', 'Aside', 'Attack_Citation', 'Authoritative_Citation', 'Autobio', 'Biographical_Time', 'Cause', 'Citations', 'CommonAuthorities', 'CommunicatorRole', 'Comparison', 'Concessive', 'Confidence', 'ConfirmExperience', 'ConfirmedThght', 'Confront', 'Consequence', 'Contested_Citation', 'Contingency', 'Curiosity', 'Definition', 'DenyDisclaim', 'DialogCues', 'DirectAddress', 'DirectReasoning', 'Disclosure', 'ErrorRecovery', 'Example', 'Exceptions', 'Fear', 'Feedback', 'FirstPer', 'FollowUp', 'Future_Question', 'Future_in_Past', 'Generalization', 'GenericEvents', 'Immediacy', 'Imperative', 'In_Media', 'Inclusive', 'Innovations', 'Insist', 'Intensity', 'LangRef', 'MatureProcess', 'Metadiscourse', 'Motions', 'MoveBody', 'Narrative_Verbs', 'NegFeedback', 'Neg_Citation', 'Negative_Attribution', 'Negative_Relation', 'Negativity', 'Neutral_Attribution', 'Numbers', 'OpenQuery', 'OralCues', 'PersonPronoun', 'PersonPrope

Unnamed: 0,text_name,text_key,html_name,chunk_index,!UNRECOGNIZED,!UNTAGGED,!BLACKLISTED,AbstractConcepts,Acknowledge,Anger,...,Support,TimeDate,TimeDuration,TimeShift,Transformation,Uncertainty,Updates,<# Word Tokens>,<# Punctuation Tokens>,<# Tokens>
0,A00429.headed.txt,a00429headed,0,0,5.969797,32.177064,0,5.880564,0.001389,0.002779,...,0.008028,0.001235,0.164571,0.056041,0.246857,0.360173,0.098187,556010,91734,647744
1,A01014.headed.txt,a01014headed,0,0,4.647499,29.816294,0,4.138300,0.028133,0.109717,...,0.087211,0.000000,0.392449,0.191301,0.519046,0.445901,0.351657,60376,10716,71092
2,A01089.headed.txt,a01089headed,0,0,6.317955,28.171221,0,7.444412,0.000000,0.000000,...,0.039181,0.039181,0.959937,0.274268,0.391811,0.215496,0.176315,8896,1313,10209
3,A01185.headed.txt,a01185headed,0,0,3.697632,29.384335,0,3.868852,0.025501,0.211293,...,0.029144,0.000000,0.586521,0.327869,0.699454,0.947177,0.349727,23921,3529,27450
4,A01410.headed.txt,a01410headed,0,0,3.817623,28.201015,0,3.915228,0.004244,0.098454,...,0.190117,0.000000,0.807999,0.263109,0.528764,1.078746,0.328462,101816,16006,117822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,A97051.headed.txt,a97051headed,0,0,4.217838,29.073952,0,5.413732,0.026607,0.093823,...,0.043411,0.002801,0.436907,0.399098,0.373892,0.669365,0.424304,57733,13678,71411
325,B00108.headed.txt,b00108headed,0,0,2.592925,28.554366,0,11.155254,0.000000,0.000000,...,0.008471,0.072849,1.667062,0.207536,0.278691,0.280385,0.049131,104195,13857,118052
326,B03739.headed.txt,b03739headed,0,0,1.481313,27.620784,0,5.241568,0.045579,0.000000,...,0.068368,0.182315,1.344576,0.501367,1.002735,0.683683,0.569736,3953,435,4388
327,B08000.headed.txt,b08000headed,0,0,11.067863,28.329667,0,5.246235,0.000000,0.033847,...,0.033847,0.033847,1.066170,0.152310,0.220003,0.626163,0.440007,5142,767,5909


In [3]:
fig = px.scatter(df, x="AbstractConcepts", y="Acknowledge", hover_data="text_name")
fig.show()

In [4]:
app = Dash()
app.layout = [
    html.H1(children="Ubiq Category Comparison", style={
        'textAlign': 'center',
        'color': 'white',
        'fontFamily': 'monospace'
    }),
    html.Div([
        dcc.Dropdown(ubiq_categories, 'AbstractConcepts', id='xaxis'),
    ], style={'width': '48%', 'display': 'inline-block', 'fontFamily': 'monospace'}),
    html.Div([
        dcc.Dropdown(ubiq_categories, 'Acknowledge', id='yaxis'),
    ], style={'width': '48%', 'display': 'inline-block', 'fontFamily': 'monospace'}),
    dcc.Graph(id='comparison-figure')
]

@callback(
    Output('comparison-figure', 'figure'),
    Input('xaxis', 'value'),
    Input('yaxis', 'value'))
def update_graph(xaxis, yaxis):
    fig = px.scatter(df, x=xaxis, y=yaxis, hover_data="text_name", trendline="ols", trendline_color_override="red")
    return fig

In [5]:
app.run(debug=True, use_reloader=False)

In [6]:
df_float = pd.DataFrame(df, columns=ubiq_categories[2:])
df_float.describe()

Unnamed: 0,AbstractConcepts,Acknowledge,Anger,Apology,Aside,Attack_Citation,Authoritative_Citation,Autobio,Biographical_Time,Cause,...,SubjectivePercept,SubjectiveTime,Substitution,Support,TimeDate,TimeDuration,TimeShift,Transformation,Uncertainty,Updates
count,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,...,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0
mean,5.274654,0.014481,0.079071,0.001586,0.674736,0.001536,0.009285,0.112485,0.161154,0.261311,...,1.487293,0.242223,0.030337,0.067033,0.053442,0.93649,0.323226,0.653965,0.77981,0.327262
std,1.619784,0.019968,0.067601,0.005275,0.267921,0.004073,0.013015,0.158735,0.125384,0.154695,...,0.530939,0.100778,0.025707,0.082187,0.177496,0.648721,0.145231,0.351845,0.349709,0.134733
min,2.032061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1068,0.0,0.0,0.0,0.0,0.086065,0.05169,0.0,0.0712,0.020336
25%,4.334067,0.001531,0.027352,0.0,0.491771,0.0,0.0,0.016875,0.077963,0.151505,...,1.150787,0.176009,0.012484,0.024607,0.003257,0.567693,0.223425,0.406298,0.543123,0.238879
50%,4.992977,0.007695,0.064644,0.0,0.648904,0.0,0.004614,0.05169,0.139752,0.24541,...,1.474043,0.244235,0.024178,0.045897,0.015304,0.768199,0.297118,0.608617,0.738544,0.326309
75%,5.92324,0.017984,0.118738,0.000772,0.829571,0.00098,0.013518,0.122683,0.221129,0.344178,...,1.827067,0.303001,0.040899,0.08794,0.042781,1.08852,0.397248,0.846124,0.993456,0.403192
max,12.410057,0.124329,0.438328,0.071117,2.171536,0.039262,0.087196,1.208032,0.887184,1.114409,...,3.254026,0.56102,0.175747,1.060271,2.806619,5.339979,1.317195,2.917724,2.035401,0.983655


In [7]:
from sklearn.decomposition import PCA

n_components = 20

pca = PCA(n_components=n_components)
components_fit = pca.fit_transform(df_float)
components = pd.DataFrame(components_fit, columns=['PCA%i' % i for i in range(n_components)], index=df_float.index)
print(pca.explained_variance_ratio_)
components

[0.4393983  0.1665714  0.14232369 0.0408047  0.03048014 0.0242759
 0.02216046 0.01523143 0.01174064 0.01009373 0.00950434 0.00833502
 0.00800164 0.00702344 0.00596199 0.00489889 0.00468453 0.00436891
 0.00391638 0.00328592]


Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,PCA16,PCA17,PCA18,PCA19
0,1.734988,0.972927,6.060562,-1.150079,0.953826,3.021775,1.249701,0.335153,0.957292,1.701903,-1.006404,-0.455545,-0.175341,-0.582243,-0.278895,0.710547,0.300161,-0.283411,0.452696,0.537878
1,-1.856439,-2.166702,-0.568462,-0.829233,-1.156278,0.387880,-0.278784,1.166331,-0.124766,-0.030779,-0.426650,0.428856,-0.013839,-0.361707,-0.377954,-0.011576,-0.345666,-0.392857,-0.191493,0.244147
2,4.554795,3.669956,5.578188,-2.127975,-0.625651,0.040669,0.215857,-0.432710,0.100934,-0.467235,-0.758888,0.862532,-0.484455,0.515630,-0.455579,0.235249,0.280194,-0.282514,-0.488758,0.135578
3,-2.249715,-2.980236,-0.514483,-1.042668,-0.535523,0.590750,-1.161384,0.202213,0.050804,0.087332,-0.564582,0.206190,0.336514,0.142069,0.457505,-0.414659,-0.586953,0.300948,0.250093,0.038310
4,-2.251481,-2.619134,-0.287261,-0.116946,0.737131,1.221590,0.637951,-0.220849,-0.181956,0.259305,-0.361175,-0.665775,0.469859,0.376591,0.372287,0.196046,-0.365222,-0.362129,-0.060637,0.223726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,0.074750,-3.395046,2.252479,-0.742335,0.279540,-1.529564,4.532345,-0.310645,0.885118,0.292855,0.567755,-0.148394,-0.360327,-0.541959,0.391604,0.449507,-0.248425,0.175022,0.661640,-0.314448
325,11.743975,1.920875,5.820251,0.957052,-0.649333,-0.559398,-1.383673,-0.699151,-0.192676,-1.382196,-0.148610,0.607876,-0.250957,0.247858,-0.143023,0.410027,0.195000,0.072980,-0.160600,-0.046447
326,-0.195223,4.248129,2.707379,-1.750842,1.521633,0.806307,-0.703674,1.158651,0.315700,-1.083428,0.394395,-0.104891,-1.309179,0.214511,0.384908,0.207558,0.058997,0.203813,0.803844,-0.449226
327,-0.213788,0.333495,3.564756,-1.588752,0.202462,1.211447,1.003930,0.555818,0.386615,-0.690687,0.037533,-1.121600,-1.099730,-1.137509,0.229377,0.494277,-0.699578,-0.049152,0.387536,0.117255


In [8]:
fig = px.line(x=[i for i in range(n_components)], y=pca.explained_variance_ratio_, labels={'x': 'component', 'y': 'explained variance ratio'}, title="Explained Variance Ratio per Component")
fig.show()

In [84]:
import json

super_network = {}
bignames_network = {}
bignames = {}

with open("BigNames/FullNames.json", "r") as file:
    bignames = json.load(file)
    for key, val in bignames.items():
        bignames[key] = {
            'name': val,
            'bignames_cluster': [], 
            'superscience_cluster': []
        }
bigname_texts = set(bignames.keys())

with open("BigNames/BigNamesGraph.json", "r") as file:
    bignames_network = json.load(file)
with open("SuperScience/SuperScienceGraph.json", "r") as file:
    super_network = json.load(file)

for key, val in bignames_network['nodes'].items():
    for i in val:
        if i == "A00429.headed.txt":
            print(key)
        bignames[i]['bignames_cluster'].append(key)

for key, val in super_network['nodes'].items():
    for i in val:
        if i in bigname_texts:
            bignames[i]['superscience_cluster'].append(key)

bignames_cluster_connections = {}
super_cluster_connections = {}

for key, val in bignames_network['links'].items():
    if key in bignames_cluster_connections:
        bignames_cluster_connections[key] += val
    else:
        bignames_cluster_connections[key] = val
    
    for i in val:
        if i in bignames_cluster_connections:
            bignames_cluster_connections[i] += [key]
        else:
            bignames_cluster_connections[i] = [key]

for key, val in super_network['links'].items():
    if key in super_cluster_connections:
        super_cluster_connections[key] += val
    else:
        super_cluster_connections[key] = val
    
    for i in val:
        if i in super_cluster_connections:
            super_cluster_connections[i] += [key]
        else:
            super_cluster_connections[i] = [key]

for key, val in bignames.items():
    bignames[key]['bignames_connectivity'] = [len(bignames_cluster_connections[i]) if i in bignames_cluster_connections else 0 for i in val['bignames_cluster']]
    bignames[key]['super_connectivity'] = [len(super_cluster_connections[i]) if i in super_cluster_connections else 0 for i in val['superscience_cluster']]

meta_df = pd.read_csv("BigNames/Metadata-BigNames.csv")
for key, val in bignames.items():
    try: 
        yr = int(meta_df[meta_df['text_name'] == key].iloc[0]['Date'])
    except ValueError:
        yrs = meta_df[meta_df['text_name'] == key].iloc[0]['Date'].split('-')
        yrs = list(map(int, yrs))
        yr = sum(yrs) / len(yrs)
    bignames[key]['year'] = yr

with open("degree_connectivity_bignames.json", "w") as file:
    json.dump(bignames, file)

In [97]:
b_degrees = []
s_degrees = []
names = []
years = []
for key, val in bignames.items():
    b_conn = val['bignames_connectivity']
    s_conn = val['super_connectivity']
    if len(b_conn) == 0:
        b_conn = [0]
    if len(s_conn) == 0:
        s_conn = [0]
    for i in b_conn:
        for j in s_conn:
            b_degrees.append(i)
            s_degrees.append(j)
            names.append(key)
            years.append(val['year'])

conn = pd.DataFrame({"BigNames": b_degrees, "SuperScience": s_degrees, "id": names, "year": years})
conn_long = conn.melt(
    id_vars = ["id", "year"],
    value_vars = ["BigNames", "SuperScience"],
    var_name = "Dataset",
    value_name = "Degree"
)
fig = px.line(conn_long, x = "Dataset", y = "Degree", color = "year", color_discrete_sequence=px.colors.sequential.Viridis, markers=True)
fig.show()

In [85]:
import pprint
with open("SuperScience/FullNames.json", "r") as file:
    super = json.load(file)
    difference = {}
    for key, val in super.items():
        if key not in bigname_texts:
            difference[key] = {
                'name': val,
                'difference_cluster': [], 
                'superscience_cluster': []
            }
difference_texts = set(difference.keys())

with open("DifferenceGraph.json", "r") as file:
    difference_network = json.load(file)
for key, val in difference_network['nodes'].items():
    for i in val:
        difference[i]['difference_cluster'].append(key)

for key, val in super_network['nodes'].items():
    for i in val:
        if i in difference:
            difference[i]['superscience_cluster'].append(key)

difference_cluster_connections = {}

for key, val in difference_network['links'].items():
    if key in difference_cluster_connections:
        difference_cluster_connections[key] += val
    else:
        difference_cluster_connections[key] = val
    
    for i in val:
        if i in difference_cluster_connections:
            difference_cluster_connections[i] += [key]
        else:
            difference_cluster_connections[i] = [key]

for key, val in difference.items():
    difference[key]['difference_connectivity'] = [len(difference_cluster_connections[i]) if i in difference_cluster_connections else 0 for i in val['difference_cluster']]
    difference[key]['super_connectivity'] = [len(super_cluster_connections[i]) if i in super_cluster_connections else 0 for i in val['superscience_cluster']]

meta_df = pd.read_csv("SuperScience/Metadata-SuperScience.csv")
for key, val in difference.items():
    try: 
        yr = int(meta_df[meta_df['text_name'] == key].iloc[0]['Date'])
    except ValueError:
        yrs = meta_df[meta_df['text_name'] == key].iloc[0]['Date'].split('-')
        yrs = list(map(int, yrs))
        yr = sum(yrs) / len(yrs)
    difference[key]['year'] = yr

with open("degree_connectivity_difference.json", "w") as file:
    json.dump(difference, file)

In [86]:
d_degrees = []
s_degrees = []
names = []
years = []
for key, val in difference.items():
    d_conn = val['difference_connectivity']
    s_conn = val['super_connectivity']
    if len(d_conn) == 0:
        d_conn = [0]
    if len(s_conn) == 0:
        s_conn = [0]
    for i in d_conn:
        for j in s_conn:
            d_degrees.append(i)
            s_degrees.append(j)
            names.append(key)
            years.append(val['year'])

conn = pd.DataFrame({"Difference": d_degrees, "SuperScience": s_degrees, "id": names, "year": years})
conn_long = conn.melt(
    id_vars = "id",
    value_vars = ["Difference", "SuperScience"],
    var_name = "Dataset",
    value_name = "Degree"
)
fig = px.line(conn_long, x = "Dataset", y = "Degree", color = "year", markers=True, color_discrete_sequence="viridis")
fig.show()

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['id', 'Dataset', 'Degree'] but received: year