# Imports

In [None]:
# %load_ext nb_black
import pandas as pd
import numpy as np
from plotly import graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import networkx as nx

In [None]:
# Qualitative palette; later cells index into it to give each publication source type a fixed color.
colors = px.colors.qualitative.Plotly
# colors = px.colors.qualitative.Plasma

In [None]:
# Base layout shared by the standalone figures: fixed canvas size, white template, black Arial text.
layout = go.Layout(
    width=1000,
    height=500,
    template="plotly_white",
    font={"color": "black", "size": 20, "family": "Arial"},
)

# Read files

In [None]:
# Load the keywording output workbook and display it.
# NOTE(review): presumably one row per selected publication — confirm against the pipeline.
df = pd.read_excel("../data/06_model_output/output_keywording.xlsx")
df

In [None]:
# List the available columns; the sections below read the RQ0_* and RQ1_* fields.
df.columns

# RQ1 - Publications over time and locations

## RQ1 - Caracterização das publicações - Anual

In [None]:
# Count publications per publication year and show them as labelled bars.
pub_per_year = df.groupby(["RQ1_PubYear"])["title"].count()

yearly_bars = go.Bar(
    x=pub_per_year.index,
    y=pub_per_year,
    text=pub_per_year,
    textposition="auto",
)
fig_pub_per_year = go.Figure(data=[yearly_bars], layout=layout)
fig_pub_per_year.show()

## RQ1 - Caracterização das publicações - Anual por Fonte

In [None]:
# Publications per (year, source type), drawn as one stacked bar trace per source type.
pub_per_type_year = (
    df.groupby(["RQ1_PubYear", "RQ1_PubSourceType"])["title"]
    .count()
    .reset_index()
)
fig_pub_per_type_year = go.Figure(layout=layout)

# sort=False keeps the source types in order of first appearance, like .unique() would.
for pub_type, type_rows in pub_per_type_year.groupby("RQ1_PubSourceType", sort=False):
    years = list(type_rows["RQ1_PubYear"])
    counts = list(type_rows["title"])
    fig_pub_per_type_year.add_trace(
        go.Bar(x=years, y=counts, name=pub_type, text=counts, textposition="auto")
    )

fig_pub_per_type_year.update_layout(barmode="stack")
fig_pub_per_type_year.show()

## RQ1 - Caracterização das publicações - Veículo

In [None]:
# Publications per venue, one horizontal bar trace per source type.
pub_per_location = (
    df.groupby(["RQ1_PubSourceType", "RQ1_ShortPubSourceName"])["title"]
    .count()
    .reset_index()
    .sort_values(by="title")
)

# Fixed color per source type: the first three palette entries, in this order.
colors_dict = dict(zip(("Conference", "Journal", "Workshop"), colors))

fig_pub_per_location = go.Figure(layout=layout)

# Traces are added in reverse alphabetical order of source type.
source_types_desc = sorted(pub_per_location["RQ1_PubSourceType"].unique(), reverse=True)
for pub_type in source_types_desc:
    type_rows = pub_per_location.loc[pub_per_location["RQ1_PubSourceType"] == pub_type]
    venues = list(type_rows["RQ1_ShortPubSourceName"])
    counts = list(type_rows["title"])

    fig_pub_per_location.add_trace(
        go.Bar(
            x=counts,
            y=venues,
            name=pub_type,
            text=counts,
            marker_color=colors_dict[pub_type],
            textposition="auto",
            orientation="h",
        )
    )

fig_pub_per_location.update_layout(font={"size": 16})
fig_pub_per_location.show()

## RQ1 - Caracterização das publicações - Multiplot

In [None]:
# Combined RQ1 figure: yearly counts per source type (left) and counts per venue (right).
fig_rq1 = make_subplots(
    rows=1,
    cols=2,
    horizontal_spacing=0.25,
    subplot_titles=['Publicações por Ano e Fonte', 'Publicações por Fonte e Veículo'],
)

# Fixed color per source type. Only the right-hand traces appear in the legend,
# so the left-hand traces must use this same mapping; otherwise they fall back to
# the default color cycle and may not match the legend.
colors_dict = {
    'Conference': colors[0],
    'Journal': colors[1],
    'Workshop': colors[2],
}

# Left panel: stacked yearly bars, one trace per source type (hidden from the legend).
pub_per_type_year = df.groupby(["RQ1_PubYear", "RQ1_PubSourceType"])["title"].count().reset_index()

for pub_type in pub_per_type_year["RQ1_PubSourceType"].unique():

    df_filt = pub_per_type_year[pub_per_type_year["RQ1_PubSourceType"] == pub_type]

    x = list(df_filt["RQ1_PubYear"])
    y = list(df_filt["title"])

    fig_rq1.add_trace(
        go.Bar(
            x=x,
            y=y,
            name=pub_type,
            marker_color=colors_dict[pub_type],
            showlegend=False,
        ),
        row=1,
        col=1,
    )

# Right panel: horizontal bars per venue, one trace per source type (these feed the legend).
pub_per_location = df.groupby(["RQ1_PubSourceType", "RQ1_ShortPubSourceName"])["title"].count().reset_index()
pub_per_location = pub_per_location.sort_values(by="title")

for pub_type in sorted(pub_per_location["RQ1_PubSourceType"].unique(), reverse=True):

    df_filt = pub_per_location[pub_per_location["RQ1_PubSourceType"] == pub_type]

    x = list(df_filt["RQ1_ShortPubSourceName"])
    y = list(df_filt["title"])

    # Legend label in Portuguese; only 'Conference' has a different spelling.
    pub_name = pub_type.replace('Conference', 'Conferência')

    fig_rq1.add_trace(
        go.Bar(
            x=y,
            y=x,
            name=pub_name,
            marker_color=colors_dict[pub_type],
            orientation='h',
        ),
        row=1,
        col=2,
    )

fig_rq1.update_layout(
    width=1000,
    height=500,
    template="plotly_white",
    font=dict(
        color="black",
        size=14,  # can change the size of font here
        family="Arial",
    ),
    # 'xaxis' is the left panel's x axis: force one tick per year.
    xaxis=dict(
        tickmode='linear'
    ),
    barmode='stack',
    title="RQ1 - Caracterização das publicações",
)
fig_rq1.show()

# RQ0 - Bibliometria

## RQ0 - Evolução temporal de publicações e citações

In [None]:
# Scatter of Scopus citations against estimated publication date, one trace per source type.
df_sorted = df.sort_values(by="RQ1_EstimatedPubDate")
# Per-row marker color looked up from the source-type palette defined in the RQ1 cells.
df_sorted["colors"] = df_sorted["RQ1_PubSourceType"].map(colors_dict)

fig_rq0 = go.Figure(layout=layout)

for pub_type in df_sorted["RQ1_PubSourceType"].unique():

    df_filt = df_sorted[df_sorted["RQ1_PubSourceType"] == pub_type]

    # Legend label in Portuguese; only 'Conference' has a different spelling.
    pub_name = pub_type.replace('Conference', 'Conferência')

    fig_rq0.add_trace(
        go.Scatter(
            x=list(df_filt["RQ1_EstimatedPubDate"]),
            y=list(df_filt["RQ0_ScopusCitations"]),
            marker=dict(color=list(df_filt["colors"])),
            mode='markers',
            name=pub_name,
        )
    )

fig_rq0.update_layout(
    title="RQ0 - Evolução temporal de Publicações e Citações",
    # Axis label previously read "Scoups" (typo).
    yaxis_title="Citações Scopus",
)

fig_rq0.show()

In [None]:
# Scatter of FWCI against estimated publication date; marker size grows with Scopus citations.
df_sorted = df.sort_values(by="RQ1_EstimatedPubDate")
df_sorted["colors"] = df_sorted["RQ1_PubSourceType"].map(colors_dict)

fig_rq0 = go.Figure(layout=layout)

for pub_type in df_sorted["RQ1_PubSourceType"].unique():
    type_rows = df_sorted.loc[df_sorted["RQ1_PubSourceType"] == pub_type]

    # Legend label in Portuguese; only 'Conference' has a different spelling.
    pub_name = pub_type.replace('Conference', 'Conferência')

    # Size is 5 for an uncited article and grows by 5 per citation.
    marker_style = dict(
        size=5 * (1 + type_rows["RQ0_ScopusCitations"]),
        color=list(type_rows["colors"]),
    )
    points = go.Scatter(
        x=list(type_rows["RQ1_EstimatedPubDate"]),
        y=list(type_rows["RQ0_FWCI"]),
        marker=marker_style,
        mode='markers',
        name=pub_name,
    )
    fig_rq0.add_trace(points)

fig_rq0.update_layout(
    title="RQ0 - Evolução temporal de Publicações, Citações e Impacto",
    yaxis_title="Impacto Scopus",
)

# Free-text note placed to the right of the plot area, explaining the size encoding.
fig_rq0.add_annotation(
    text="Tamanho:<br>Citações Scopus",
    xref="paper",
    yref="paper",
    x=1.24,
    y=0.7,
    showarrow=False,
    font={'family': "Arial", 'size': 20},
    align="left",
)

fig_rq0.show()

In [None]:
# Citations (bars, left axis) and FWCI (line, right axis) per article, most-cited first.
df_citations = df.sort_values(by="RQ0_ScopusCitations", ascending=False)

# Entry identifiers as strings so the x axis is treated as categorical.
article_ids = df_citations["entry"].astype(str)

citation_bars = go.Bar(
    x=article_ids,
    y=df_citations["RQ0_ScopusCitations"],
    name="Citações Scopus",
)
impact_line = go.Scatter(
    x=article_ids,
    y=df_citations["RQ0_FWCI"],
    name="Impacto Scopus",
)

citations_per_article = make_subplots(specs=[[{"secondary_y": True}]])
citations_per_article.add_trace(citation_bars)
citations_per_article.add_trace(impact_line, secondary_y=True)

citations_per_article.update_layout(
    title="RQ0 - Distribuição das Citações e Impacto",
    width=1000,
    height=500,
    template="plotly_white",
    font={"color": "black", "size": 20, "family": "Arial"},
)

# Individual article ids are hidden; a single caption below the axis stands in for them.
citations_per_article.update_xaxes(showticklabels=False)
citations_per_article.update_yaxes(
    showgrid=False,
    secondary_y=False,
    title="Citações Scopus",
)
citations_per_article.update_yaxes(
    showgrid=False,
    secondary_y=True,
    range=[0, 3.5],
    title="Impacto Scopus",
)

citations_per_article.add_annotation(
    text="Publicações",
    xref="paper",
    yref="paper",
    x=0.45,
    y=-0.13,
    showarrow=False,
)

citations_per_article.show()

## RQ0 - Mapa autores

In [None]:
# Fractional authorship per country: each article contributes a total weight of 1,
# split evenly across the ';'-separated countries listed in RQ0_Countries.
#
# The country list is exploded to one row per (article, country) instead of indexing
# fixed columns 0..2, so any number of countries per article is supported, and tokens
# are stripped so "A; B" and "A;B" give the same country names.
df_countries = (
    df[["entry", "RQ0_Countries"]]
    .assign(country=lambda d: d["RQ0_Countries"].str.split(";"))
    .explode("country")
    .assign(country=lambda d: d["country"].str.strip())
    # Drop articles without countries and empty tokens (e.g. from a trailing ';').
    .loc[lambda d: d["country"].notna() & d["country"].ne("")]
    # explode() repeats the original row label, so level 0 identifies the article.
    .assign(count_vals=lambda d: d.groupby(level=0)["country"].transform("size"))
    .loc[:, ["entry", "country", "count_vals"]]
    .dropna()
    .reset_index(drop=True)
    .assign(weight=lambda d: 1 / d["count_vals"])
)
articles_per_country = df_countries.groupby(["country"])["weight"].sum()

fig_rq0_countries = px.choropleth(
    locations=articles_per_country.index,
    color=articles_per_country,
    locationmode='country names',
    scope='world',
    color_continuous_scale="Burg",
)

fig_rq0_countries.layout.coloraxis.colorbar.title = 'Autorias de <br>publicações <br>por país'

fig_rq0_countries.update_layout(
    width=1000,
    height=500,
    title="RQ0 - Distribuição Geográfica de Publicações",
)
fig_rq0_countries.show()

## RQ0 - Rede autores

### Create nodes and links tables

In [None]:
# Create nodes and links tables

# Wide table (one column per author position), kept for inspection.
df_authors = df[["entry", "RQ0_Authors"]]
explode_cols = df_authors["RQ0_Authors"].str.split("; ", expand=True)
df_authors = pd.concat([df_authors, explode_cols], axis=1)

# One list of author names per publication. Working on lists rather than on fixed
# columns 0..5 supports any number of authors per publication.
author_lists = df["RQ0_Authors"].dropna().str.split("; ")

# Every unordered co-author pair of every publication, stored alphabetically so that
# (A, B) and (B, A) count as the same link. Built in a single pass rather than by
# growing a DataFrame with pd.concat inside nested loops.
link_records = [
    sorted((first, second))
    for authors in author_lists
    for position, first in enumerate(authors)
    for second in authors[position + 1:]
]
links = (
    pd.DataFrame(link_records, columns=["author1", "author2"])
    .value_counts()
    .reset_index()
)
# weight = number of publications the pair co-authored.
links.columns = ["author1", "author2", "weight"]

# Publications per author, most prolific first. The Series keeps the name "entry"
# and the index name "authors", matching the previous groupby-based result.
nodes = (
    author_lists.explode()
    .value_counts()
    .rename_axis("authors")
    .rename("entry")
)

# Create graph
G = nx.from_pandas_edgelist(links, source="author1", target='author2', edge_attr="weight")
# Fixed seed keeps the layout stable; the author labels below use hand-tuned positions.
pos = nx.spring_layout(G, k=0.3, seed=42)

# Plot graph
# All edges go into one line trace; a None entry breaks the line between segments.
edge_x = []
edge_y = []
for source_node, target_node in G.edges():
    x0, y0 = pos[source_node]
    x1, y1 = pos[target_node]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Marker size by publication count. The anchor points reproduce the previous fixed
# sizes (1 -> 10, 2 -> 15, 8 -> 40); other counts are interpolated (and clamped at
# the ends) instead of reusing a stale size or raising NameError.
size_counts = [1, 2, 8]
size_values = [10, 15, 40]

node_x = []
node_y = []
node_name = []
node_size = []
node_adjacencies = []
for node in G.nodes():
    x, y = pos[node]
    node_name.append(node)
    node_x.append(x)
    node_y.append(y)
    node_size.append(float(np.interp(nodes.loc[node], size_counts, size_values)))
    # Number of distinct co-authors; drives the marker color.
    node_adjacencies.append(len(G[node]))

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    text=node_name,
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='Burg_r',
        reversescale=True,
        size=node_size,
        color=node_adjacencies,
        colorbar=dict(
            thickness=15,
            # Nested title form; the flat 'titleside' attribute is deprecated.
            title=dict(text='Conexões dos Nós', side='right'),
            xanchor='left',
        ),
    )
)

authors_network = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title="<br>RQ0 - Conexões entre autores",
        width=1000,
        height=500,
        template="plotly_white",
        font=dict(
            color="black",
            size=12,
            family="Arial",
        ),
        showlegend=False,
        margin=dict(b=20, l=50, r=5, t=60),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, showline=True, mirror=True, linecolor="black", linewidth=1),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, showline=True, mirror=True, linecolor="black", linewidth=1),
    ),
)

# Hand-placed labels (paper coordinates) for selected authors; the positions are
# tied to the seeded layout above and need re-tuning if the data or seed changes.
author_labels = [
    ("J. Saltz", 0.80, 0.45),
    ("I. Shamshurin", 0.72, 0.37),
    ("C. Naik", 0.18, 0.9),
    ("K. Singla", 0.15, 0.77),
    ("J. Bose", 0.23, 0.77),
]
for label_text, label_x, label_y in author_labels:
    authors_network.add_annotation(
        text=label_text,
        xref="paper", yref="paper",
        x=label_x, y=label_y,
        showarrow=False,
        font={'family': "Arial", 'size': 12},
    )

# Note above the colorbar explaining the size encoding.
authors_network.add_annotation(
    text="Tamanho:<br>Publicações",
    xref="paper", yref="paper",
    x=1.1, y=1.09,
    showarrow=False,
    font={'family': "Arial", 'size': 11},
    align="left"
)
authors_network.update_traces(textposition='top center')
authors_network.show()

## RQ0 - Rede citações

In [None]:
# Citation graph: selected articles ("article") linked to the titles they cite ("Title").
df_citacoes = pd.read_excel("../data/06_model_output/citacoes_compilado.xlsx")
df_citacoes = df_citacoes[["article", "Title"]]

# Selected articles absent from the citation sheet are appended with a NaN target so
# they still become nodes; the NaN placeholder nodes are filtered out when drawing.
selected_entries = set(df["entry"])
articles_with_citations = set(df_citacoes["article"].unique())
missing_articles = pd.DataFrame(
    [entry for entry in df["entry"] if entry not in articles_with_citations],
    columns=["article"],
)
missing_articles["Title"] = np.nan

df_citacoes = pd.concat([df_citacoes, missing_articles], ignore_index=True)

G = nx.from_pandas_edgelist(df_citacoes, source="article", target='Title')
pos = nx.spring_layout(G, k=0.3, seed=44)

# Edges: skip any edge touching a NaN placeholder. pd.isna is used because
# `np.nan in edge` only matches the very same NaN object, which is not guaranteed
# for values that went through pandas, so such edges used to be drawn.
edge_x = []
edge_y = []
for source_node, target_node in G.edges():
    if pd.isna(source_node) or pd.isna(target_node):
        continue
    x0, y0 = pos[source_node]
    x1, y1 = pos[target_node]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Nodes: positions, hover names and colors are collected in the same pass so the
# color list stays aligned with the drawn nodes (NaN placeholders are skipped).
node_x = []
node_y = []
node_name = []
node_adjacencies = []
article_node_x = []
article_node_y = []
article_node_name = []
for node in G.nodes():
    if pd.isna(node):
        continue
    x, y = pos[node]
    node_name.append(node)
    node_x.append(x)
    node_y.append(y)
    if node in selected_entries:
        # Selected articles are redrawn on top in a fixed color; use a neutral value here.
        article_node_name.append(node)
        article_node_x.append(x)
        article_node_y.append(y)
        node_adjacencies.append(1)
    else:
        # Cited title: color by its number of neighbours in the graph.
        node_adjacencies.append(len(G[node]))

articles_node_trace = go.Scatter(
    x=article_node_x, y=article_node_y,
    mode='markers',
    text=article_node_name,
    hoverinfo='text',
    marker=dict(
        size=10,
        color=colors[3],
    )
)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    text=node_name,
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='Burg_r',
        reversescale=True,
        size=10,
        color=node_adjacencies,
        colorbar=dict(
            thickness=15,
            # Nested title form; the flat 'titleside' attribute is deprecated.
            title=dict(text='Publicações citadas dentre as selecionadas', side='right'),
            xanchor='left',
        ),
    )
)

# NOTE(review): this reuses the name `authors_network` from the co-author cell, although
# the figure shows citations; kept unchanged in case other cells refer to it.
authors_network = go.Figure(
    data=[edge_trace, node_trace, articles_node_trace],
    layout=go.Layout(
        title="<br>RQ0 - Conexões entre citações das publicações",
        width=1000,
        height=500,
        template="plotly_white",
        font=dict(
            color="black",
            size=12,
            family="Arial",
        ),
        showlegend=False,
        margin=dict(b=20, l=50, r=5, t=60),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, showline=True, mirror=True, linecolor="black", linewidth=1),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, showline=True, mirror=True, linecolor="black", linewidth=1),
    ),
)
authors_network.show()