In [23]:
import csv
import plotly.express as px
import pandas as pd
from dash import Dash, html, dcc, callback, Output, Input

In [29]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv('vep_big_names_of_science_v2_ubiq321_ds.csv')

# Names of all float64 columns, in their original column order.
ubiq_categories = df.select_dtypes(include='float64').columns.tolist()
print(ubiq_categories)
df

['!UNRECOGNIZED', '!UNTAGGED', 'AbstractConcepts', 'Acknowledge', 'Anger', 'Apology', 'Aside', 'Attack_Citation', 'Authoritative_Citation', 'Autobio', 'Biographical_Time', 'Cause', 'Citations', 'CommonAuthorities', 'CommunicatorRole', 'Comparison', 'Concessive', 'Confidence', 'ConfirmExperience', 'ConfirmedThght', 'Confront', 'Consequence', 'Contested_Citation', 'Contingency', 'Curiosity', 'Definition', 'DenyDisclaim', 'DialogCues', 'DirectAddress', 'DirectReasoning', 'Disclosure', 'ErrorRecovery', 'Example', 'Exceptions', 'Fear', 'Feedback', 'FirstPer', 'FollowUp', 'Future_Question', 'Future_in_Past', 'Generalization', 'GenericEvents', 'Immediacy', 'Imperative', 'In_Media', 'Inclusive', 'Innovations', 'Insist', 'Intensity', 'LangRef', 'MatureProcess', 'Metadiscourse', 'Motions', 'MoveBody', 'Narrative_Verbs', 'NegFeedback', 'Neg_Citation', 'Negative_Attribution', 'Negative_Relation', 'Negativity', 'Neutral_Attribution', 'Numbers', 'OpenQuery', 'OralCues', 'PersonPronoun', 'PersonPrope

Unnamed: 0,text_name,text_key,html_name,chunk_index,!UNRECOGNIZED,!UNTAGGED,!BLACKLISTED,AbstractConcepts,Acknowledge,Anger,...,Support,TimeDate,TimeDuration,TimeShift,Transformation,Uncertainty,Updates,<# Word Tokens>,<# Punctuation Tokens>,<# Tokens>
0,A00429.headed.txt,a00429headed,0,0,5.969797,32.177064,0,5.880564,0.001389,0.002779,...,0.008028,0.001235,0.164571,0.056041,0.246857,0.360173,0.098187,556010,91734,647744
1,A01014.headed.txt,a01014headed,0,0,4.647499,29.816294,0,4.138300,0.028133,0.109717,...,0.087211,0.000000,0.392449,0.191301,0.519046,0.445901,0.351657,60376,10716,71092
2,A01089.headed.txt,a01089headed,0,0,6.317955,28.171221,0,7.444412,0.000000,0.000000,...,0.039181,0.039181,0.959937,0.274268,0.391811,0.215496,0.176315,8896,1313,10209
3,A01185.headed.txt,a01185headed,0,0,3.697632,29.384335,0,3.868852,0.025501,0.211293,...,0.029144,0.000000,0.586521,0.327869,0.699454,0.947177,0.349727,23921,3529,27450
4,A01410.headed.txt,a01410headed,0,0,3.817623,28.201015,0,3.915228,0.004244,0.098454,...,0.190117,0.000000,0.807999,0.263109,0.528764,1.078746,0.328462,101816,16006,117822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,A97051.headed.txt,a97051headed,0,0,4.217838,29.073952,0,5.413732,0.026607,0.093823,...,0.043411,0.002801,0.436907,0.399098,0.373892,0.669365,0.424304,57733,13678,71411
325,B00108.headed.txt,b00108headed,0,0,2.592925,28.554366,0,11.155254,0.000000,0.000000,...,0.008471,0.072849,1.667062,0.207536,0.278691,0.280385,0.049131,104195,13857,118052
326,B03739.headed.txt,b03739headed,0,0,1.481313,27.620784,0,5.241568,0.045579,0.000000,...,0.068368,0.182315,1.344576,0.501367,1.002735,0.683683,0.569736,3953,435,4388
327,B08000.headed.txt,b08000headed,0,0,11.067863,28.329667,0,5.246235,0.000000,0.033847,...,0.033847,0.033847,1.066170,0.152310,0.220003,0.626163,0.440007,5142,767,5909


In [17]:
# Static scatter of two category columns, one point per row of df.
# hover_data is passed as a list: that is the form accepted by every
# Plotly Express release, whereas older releases may reject a bare string.
fig = px.scatter(
    df,
    x="AbstractConcepts",
    y="Acknowledge",
    hover_data=["text_name"],
    title="AbstractConcepts vs. Acknowledge",
)
fig.show()

In [56]:
# Dash app: a heading, two side-by-side column selectors, and the graph they drive.
app = Dash()
app.layout = [
    html.H1(
        children="Ubiq Category Comparison",
        style={
            'textAlign': 'center',
            'color': 'white',
            'fontFamily': 'monospace',
        },
    ),
    # clearable=False on both selectors: an emptied dropdown reports value=None,
    # which is not a column name the figure callback can plot.
    html.Div(
        [dcc.Dropdown(ubiq_categories, 'AbstractConcepts', id='xaxis', clearable=False)],
        style={'width': '48%', 'display': 'inline-block', 'fontFamily': 'monospace'},
    ),
    html.Div(
        [dcc.Dropdown(ubiq_categories, 'Acknowledge', id='yaxis', clearable=False)],
        style={'width': '48%', 'display': 'inline-block', 'fontFamily': 'monospace'},
    ),
    dcc.Graph(id='comparison-figure'),
]

@callback(
    Output('comparison-figure', 'figure'),
    Input('xaxis', 'value'),
    Input('yaxis', 'value'))
def update_graph(xaxis, yaxis):
    """Redraw the comparison scatter for the currently selected pair of columns.

    Args:
        xaxis: Value of the 'xaxis' dropdown (a column name of ``df``), or
            None when nothing is selected.
        yaxis: Value of the 'yaxis' dropdown (a column name of ``df``), or
            None when nothing is selected.

    Returns:
        A scatter figure of ``yaxis`` against ``xaxis`` with a red OLS
        trendline, or ``no_update`` (the displayed figure is left as it is)
        when either selection is missing.
    """
    # Imported here so the guard works without touching the notebook's import cell.
    from dash import no_update

    # Without both columns there is no pairwise comparison to draw; passing
    # None through would make Plotly Express fall back to other inputs or fail.
    if xaxis is None or yaxis is None:
        return no_update

    return px.scatter(
        df,
        x=xaxis,
        y=yaxis,
        hover_data=["text_name"],  # list form is accepted by all px releases
        trendline="ols",
        trendline_color_override="red",
    )

In [57]:
# Start serving the Dash app defined above.
# NOTE(review): debug=True with use_reloader=False looks like a development
# setup for running inside a notebook kernel — confirm before sharing or deploying.
app.run(debug=True, use_reloader=False)

In [62]:
# Keep only the float-valued columns listed in ubiq_categories, then summarise each one.
df_float = df.reindex(columns=ubiq_categories)
df_float.describe()

Unnamed: 0,!UNRECOGNIZED,!UNTAGGED,AbstractConcepts,Acknowledge,Anger,Apology,Aside,Attack_Citation,Authoritative_Citation,Autobio,...,SubjectivePercept,SubjectiveTime,Substitution,Support,TimeDate,TimeDuration,TimeShift,Transformation,Uncertainty,Updates
count,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,...,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0,329.0
mean,3.93598,29.268545,5.274654,0.014481,0.079071,0.001586,0.674736,0.001536,0.009285,0.112485,...,1.487293,0.242223,0.030337,0.067033,0.053442,0.93649,0.323226,0.653965,0.77981,0.327262
std,2.949137,2.389749,1.619784,0.019968,0.067601,0.005275,0.267921,0.004073,0.013015,0.158735,...,0.530939,0.100778,0.025707,0.082187,0.177496,0.648721,0.145231,0.351845,0.349709,0.134733
min,0.513111,15.038299,2.032061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1068,0.0,0.0,0.0,0.0,0.086065,0.05169,0.0,0.0712,0.020336
25%,2.022472,27.867905,4.334067,0.001531,0.027352,0.0,0.491771,0.0,0.0,0.016875,...,1.150787,0.176009,0.012484,0.024607,0.003257,0.567693,0.223425,0.406298,0.543123,0.238879
50%,3.0885,29.2142,4.992977,0.007695,0.064644,0.0,0.648904,0.0,0.004614,0.05169,...,1.474043,0.244235,0.024178,0.045897,0.015304,0.768199,0.297118,0.608617,0.738544,0.326309
75%,4.799349,30.569708,5.92324,0.017984,0.118738,0.000772,0.829571,0.00098,0.013518,0.122683,...,1.827067,0.303001,0.040899,0.08794,0.042781,1.08852,0.397248,0.846124,0.993456,0.403192
max,21.845241,40.505758,12.410057,0.124329,0.438328,0.071117,2.171536,0.039262,0.087196,1.208032,...,3.254026,0.56102,0.175747,1.060271,2.806619,5.339979,1.317195,2.917724,2.035401,0.983655


In [75]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
n_components = 20

# Fit the decomposition on the float-only frame and project every row in one step.
# NOTE(review): df_float is passed in without any rescaling, so columns with a
# larger spread presumably dominate the leading components — confirm this is intended.
pca = PCA(n_components=n_components)
components_fit = pca.fit_transform(df_float)

# Wrap the projection in a DataFrame that shares df_float's index,
# with columns labelled PCA0, PCA1, ...
components = pd.DataFrame(
    components_fit,
    index=df_float.index,
    columns=[f'PCA{i}' for i in range(n_components)],
)
print(pca.explained_variance_ratio_)
components

[0.35525834 0.18226199 0.13665229 0.10277107 0.05500549 0.0284618
 0.01764498 0.01642549 0.01333723 0.01068055 0.00805132 0.00698735
 0.006781   0.00607287 0.00502664 0.00458742 0.0042543  0.00346136
 0.00330564 0.00299851]


Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15,PCA16,PCA17,PCA18,PCA19
0,1.534061,2.428633,1.676813,-6.284484,1.587008,-1.842339,3.291417,0.853761,0.879147,-0.379401,0.531104,0.194759,-1.381671,0.380351,0.237725,-0.512155,-0.151794,0.424378,0.241127,0.921391
1,-1.543088,0.408766,-2.589662,0.081609,0.826761,-1.034588,-0.517589,0.176675,-0.155766,1.219391,-0.120317,-0.124324,-0.583392,-0.302234,-0.135798,-0.403862,-0.212720,-0.392992,-0.186775,0.240762
2,4.981658,1.357249,3.205672,-5.481750,-2.715634,-1.544210,-0.184764,0.371508,-0.541839,-0.209870,-0.429133,-0.541184,-0.587313,-0.947563,0.389399,0.273632,-0.925626,0.172633,-0.159487,-0.106277
3,-2.074450,-0.904051,-2.809141,0.214837,1.123194,-1.391261,-0.302395,-0.714140,-0.661987,0.323980,-0.200300,-0.005385,-0.691216,0.111721,0.109401,0.228401,0.499502,-0.583169,0.154530,-0.086597
4,-1.851201,-1.594893,-2.654641,0.136560,0.135569,-0.207073,1.470848,0.482063,-0.224238,-0.107523,0.326392,-0.187473,-0.290323,0.834181,0.295762,0.605145,-0.083621,-0.396731,-0.056844,0.367927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,0.339208,-1.707047,-2.553227,-2.447731,0.406041,-0.296664,0.028063,3.629971,3.042038,-1.607154,0.004222,0.234741,0.791822,-0.221312,-0.835931,-0.053340,0.662806,-0.019875,0.288297,0.325461
325,10.676352,-2.233994,5.684402,-4.789275,1.908825,0.476514,-1.464555,-0.836085,-0.990935,-0.389951,-0.732047,-0.758433,0.507367,-0.741547,-0.453185,0.262833,-0.316685,0.264773,0.137172,-0.083834
326,-0.759386,-1.525302,4.736058,-1.767465,-2.639399,-1.512905,1.133136,-0.898486,0.035738,1.299929,-0.857991,-0.518319,0.861128,-0.736346,1.002713,-0.082543,0.476127,0.189967,0.527423,0.882877
327,1.929115,4.019810,-3.190207,-4.820190,-3.390076,-0.724794,2.255260,0.353222,0.453198,0.634501,-0.163705,-1.238511,0.625224,0.127438,0.336325,-0.564171,0.452977,-0.567778,0.552087,0.812832


In [79]:
# Explained-variance share of each fitted component, in component order.
# The x positions come from the fitted model (pca.n_components_) rather than the
# n_components variable, so x and y keep the same length even if n_components
# was edited after `pca` was last fitted.
fig = px.line(
    x=list(range(pca.n_components_)),
    y=pca.explained_variance_ratio_,
    labels={'x': 'component', 'y': 'explained variance ratio'},
    title="Explained Variance Ratio per Component",
)
fig.show()