<a href="https://colab.research.google.com/github/azagsam/xl-user-comments/blob/master/Interactive_visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
!pip install sentence_transformers

# download example files
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Iug8Bf_imuesls0wIdzEg-gz_X_URJDV' -O 'example.txt'
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Y234fq5o-4XTBVbyBBQDvzMjO2bjd1KU' -O '24sata-3topics.txt'

--2021-01-29 10:06:27--  https://docs.google.com/uc?export=download&id=1Iug8Bf_imuesls0wIdzEg-gz_X_URJDV
Resolving docs.google.com (docs.google.com)... 74.125.142.139, 74.125.142.100, 74.125.142.138, ...
Connecting to docs.google.com (docs.google.com)|74.125.142.139|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0k-60-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/iidqs2rltv6br2h49kebmnio1ro3h8hp/1611914775000/16509431880972231375/*/1Iug8Bf_imuesls0wIdzEg-gz_X_URJDV?e=download [following]
--2021-01-29 10:06:28--  https://doc-0k-60-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/iidqs2rltv6br2h49kebmnio1ro3h8hp/1611914775000/16509431880972231375/*/1Iug8Bf_imuesls0wIdzEg-gz_X_URJDV?e=download
Resolving doc-0k-60-docs.googleusercontent.com (doc-0k-60-docs.googleusercontent.com)... 74.125.142.132, 2607:f8b0:400e:c08::84
Connecting to doc-0k-60-docs.googleusercontent.com (doc-0k-60-d

In [None]:
from sentence_transformers import SentenceTransformer
import networkx as nx
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer, util
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [None]:
# Identifier of the pretrained cross-lingual sentence encoder
MODEL_NAME = 'xlm-r-100langs-bert-base-nli-stsb-mean-tokens'

# Download the model and load it
model = SentenceTransformer(MODEL_NAME)

# Clustering

In [None]:
# Load the clustering example: one sentence per line. Blank lines are skipped
# so they do not end up as empty points in the plot, and the encoding is
# explicit so the result does not depend on the platform default.
with open('example.txt', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]

# Compute sentence embeddings as a NumPy array. scikit-learn cannot consume a
# torch tensor that lives on the GPU (which is what convert_to_tensor=True
# returns on a GPU runtime), so NumPy output is requested directly.
embeddings = model.encode(sentences, convert_to_numpy=True)

# Reduce dimensionality: project the embeddings onto two principal components
pca = PCA(n_components=2)
reduced_dim_embeddings = pca.fit_transform(embeddings)

# Calculate clusters in the 2-D space. The seed is fixed because the mixture
# is initialised randomly and labels would otherwise change between runs.
gm = GaussianMixture(
    n_components=3,
    covariance_type='full',
    random_state=42,
).fit(reduced_dim_embeddings)
clustering = gm.predict(reduced_dim_embeddings)

# Construct plot: one row per sentence with its 2-D position and cluster label
xs = reduced_dim_embeddings[:, 0]
ys = reduced_dim_embeddings[:, 1]
labels = [f'Cluster {c}' for c in clustering]

df = pd.DataFrame(
    {
        'x': xs,
        'y': ys,
        'cluster': labels,
        'sentences': sentences,
    }
)

fig = px.scatter(
    df,
    x='x',
    y='y',
    hover_name='sentences',
    color='cluster',
    title='<b>Dimensionality reduced by PCA, colored with GaussianMixture</b>',
)

# Enlarge the sentence markers and give them an outline
fig.update_traces(
    marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')),
    selector=dict(mode='markers'),
)

# Overlay the fitted component means as black crosses
fig.add_trace(
    go.Scatter(
        x=gm.means_[:, 0],
        y=gm.means_[:, 1],
        showlegend=False,
        hovertext=list(range(len(gm.means_))),
        mode='markers',
        marker=dict(
            color='Black',
            size=16,
            symbol='x',
        ),
    )
)
fig.show()

# TextRank: Bar plot

In [None]:
# Load the 24sata example: one sentence per line. Blank lines are skipped and
# the encoding is explicit because the text contains non-ASCII characters.
with open('24sata-3topics.txt', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]
print(sentences)

# Calculate embeddings as a NumPy array. scikit-learn cannot consume a torch
# tensor that lives on the GPU (which is what convert_to_tensor=True returns
# on a GPU runtime), so NumPy output is requested directly.
embeddings = model.encode(sentences, convert_to_numpy=True)

# Similarity matrix: pairwise cosine similarity between all sentences
sim_mat = cosine_similarity(embeddings)

# Rescale to [0, 1] over the whole matrix: it is flattened to a single column
# so the scaler uses one global min/max instead of per-column statistics.
scaler = MinMaxScaler(feature_range=(0, 1))
sim_mat = scaler.fit_transform(sim_mat.reshape(-1, 1)).reshape(sim_mat.shape)

# Calculate PageRank on the weighted similarity graph
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph, max_iter=500)  # max number of cycles to converge
score_list = [scores[sent_idx] for sent_idx in range(len(sentences))]

# Plot data: one horizontal bar per sentence, in document order
df = pd.DataFrame(
    {
        'scores': score_list,
        'sentences': sentences,
    }
)

fig = px.bar(
    df,
    y=list(range(len(sentences))),
    x='scores',
    text='sentences',
    orientation='h',
    color='scores',
    color_continuous_scale='OrRd',
    title='<b>TextRank - Bar plot</b>',
)
fig.update_traces(textposition='inside')
# Reverse the y axis so the first sentence is drawn at the top
fig.update_yaxes(autorange="reversed")
fig.show()

['Dinamovci su u utorak ujutro imali novo testiranje na korona virus, a u srijedu ujutro došli su rezultati koji su obradovali sve u klubu.', 'Naime, svi igrači i članovi stručnog stožera negativni su na koronu.', 'Zoran Mamić i Joško Gvardiol najavit će u podne utakmicu protiv CSKA.', 'Treći put zaredom su dinamovci negativni, pozitivnih je najviše bilo uoči utakmice 3. kola protiv Wolfsbergera.', 'Iz Tesle su odbili komentirati situaciju.', 'Njemački sud naredio je Elonu Musku da zaustavi gradnju megatvornice Teslinih automobila kraj Berlina zbog opasnosti da će tolika sječa šume ugroziti zmije koje sad hiberniraju.', 'Ekološki aktivisti iz udruge NABU upozorili su da bi sječa šume koja je potrebna da bi se tvornica izgradila, mogla trajno oštetiti populaciju dviju vrsta zmija i ugroziti preživljavanje jedne vrste guštera.', 'S druge strane, lokalno stanovništvo brine hoće li tvornica, pogotovo kad krene s proizvodnjom baterija, iscrpiti lokalnu pitku vodu zbog čega su zahtijevali od

# TextRank: nodes + edges
**Note:** Larger, darker, and more densely connected nodes represent the most important sentences.


In [None]:
def scale_centrality_scores(centrality_scores, q: float = 0.1) -> np.ndarray:
    """Rescale centrality scores to plot-friendly marker sizes.

    The min-max scaler is fitted only on the scores at or above the ``q``
    quantile, so that a few very low scores do not compress the range of the
    rest. Those fitted scores map to [1, 10]. Scores below the fitted minimum
    would map below 1 (possibly to zero or a negative number); every scaled
    value below 0.5 is therefore raised to 0.5. This keeps the mapping
    monotonic and guarantees a visible, strictly positive marker size.

    Args:
        centrality_scores: 1-D array-like of raw scores.
        q: Quantile in [0, 1]; scores below it are excluded from fitting.

    Returns:
        1-D float array of the same length with values in [0.5, 10].
    """
    floor = 0.5  # smallest value returned for low-scoring outliers

    scores = np.asarray(centrality_scores, dtype=float).ravel()
    if scores.size == 0:
        # Nothing to scale; avoid fitting the scaler on an empty array.
        return scores

    scaler = MinMaxScaler(feature_range=(1, 10))
    scaler.fit(scores[scores >= np.quantile(scores, q=q)].reshape(-1, 1))
    scaled = scaler.transform(scores.reshape(-1, 1)).ravel()

    # Replacing only negative values (and leaving [0, floor) untouched) would
    # make a slightly-low score smaller than an extremely low one.
    return np.where(scaled < floor, floor, scaled)

# NOTE: this cell reuses `sentences` loaded in the TextRank bar-plot cell.

# Embed the sentences as a NumPy array. scikit-learn cannot consume a torch
# tensor that lives on the GPU (which is what convert_to_tensor=True returns
# on a GPU runtime), so NumPy output is requested directly.
embeddings = model.encode(sentences, convert_to_numpy=True)

# Similarity matrix: pairwise cosine similarity between all sentences
sim_mat = cosine_similarity(embeddings)

# Rescale to [0, 1] over the whole matrix: it is flattened to a single column
# so the scaler uses one global min/max instead of per-column statistics.
scaler = MinMaxScaler(feature_range=(0, 1))
sim_mat = scaler.fit_transform(sim_mat.reshape(-1, 1)).reshape(sim_mat.shape)

# Calculate PageRank on the weighted similarity graph
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph, max_iter=500)  # number of cycles to converge
score_list = [scores[sent_idx] for sent_idx in range(len(sentences))]

# Reduce dimensionality: the 2-D PCA projection gives each node its position
pca = PCA(n_components=2)
pos = pca.fit_transform(embeddings)

# Edge weights: work on a copy so that zeroing the diagonal (removing
# self-loops) does not silently modify sim_mat as well.
weights = sim_mat.copy()
np.fill_diagonal(weights, 0)

# Node sizes/colours derived from the PageRank scores
centrality_scores = scale_centrality_scores(np.array(score_list), q=0.1)

G = nx.from_numpy_array(weights)

# Only draw edges whose rescaled similarity exceeds this threshold
EDGE_THRESHOLD = 0.5

# Each edge is encoded as (start, end, None); the None breaks the line so
# that all edges can be drawn with a single Scatter trace.
edge_x = []
edge_y = []
for src, dst in G.edges():
    if weights[src, dst] > EDGE_THRESHOLD:
        x0, y0 = pos[src]
        x1, y1 = pos[dst]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.75,
              color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=list(sentences),  # hover text: the sentence behind each node
    marker=dict(
        showscale=True,
        # colorscale options
        # 'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        # 'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        # 'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='Reds',
        color=list(centrality_scores),
        size=[s * 10 for s in centrality_scores],
        colorbar=dict(
            thickness=15,
            # `title.side` replaces the deprecated `titleside` attribute
            title=dict(text='Centrality Score', side='right'),
            xanchor='left',
        ),
        line_width=1))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='<b>TextRank Summarization</b>',
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                ))
fig.show()