In [152]:
%load_ext lab_black

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.palettes import Spectral3

sns.set()
output_notebook()

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [327]:
dfs = [
    pd.read_csv("data/health/ng_medi_2.csv", delimiter=";"),
    pd.read_csv("data/ecology/cng_ecology.csv"),
    pd.read_csv("data/energy/cng_energy.csv"),
]

In [328]:
dfs[0]["theme"] = "Здоровье"
dfs[1]["theme"] = "Экология"
dfs[2]["theme"] = "Экономика"

In [329]:
data = pd.concat(dfs)

In [73]:
keywords_files = [
    "data/health/keywords_ng_medi.csv",
    "data/ecology/keywords.csv",
    "data/energy/keywords.csv",
]
keywords = []
for filename in keywords_files:
    keywords.append(
        pd.read_csv(filename, names=("keywords", "x", "y", "label"), skiprows=1)
    )

keywords[0]["theme"] = "Здоровье"
keywords[1]["theme"] = "Экология"
keywords[2]["theme"] = "Экономика"


keywords[0] = keywords[0][keywords[0]["label"] == 53]
keywords[1] = keywords[1][keywords[1]["label"] == 6]
keywords[2] = keywords[2][keywords[2]["label"] == 46]

# keywords = pd.concat(keywords)
# keywords["label"] = keywords["label"].astype("category")
# keywords.head()

In [74]:
def check_containts(source: str, target: list, return_bool=False):
    if type(source) != str:
        if return_bool:
            return False

        return set()

    source = source.lower().split("; ")
    contains = set()
    for t in target:
        if t in source:
            if return_bool:
                return True

            contains.add(t)

    if return_bool:
        return False
    return contains

In [75]:
data_filtered = []
for i in range(3):
    data_filtered.append(
        dfs[i][
            dfs[i]["Ключевые слова указателя"].apply(
                lambda x: check_containts(x, keywords[i]["keywords"], return_bool=True)
            )
        ].copy()
    )

data_filtered = pd.concat(data_filtered)
keywords = pd.concat(keywords)

## По темам

In [153]:
counts = data_filtered["theme"].value_counts()
counts_theme = counts.index.tolist()
counts_vals = counts.values.tolist()

source_counts = ColumnDataSource(
    data=dict(themes=counts_theme, counts=counts_vals, color=Spectral3)
)

In [155]:
p = figure(
    x_range=counts_theme,
    plot_height=300,
    title="Количество статей по темам (отфильтровано по кластерам)",
    toolbar_location=None,
    tools="",
)
p.vbar(x="themes", top="counts", color="color", source=source_counts, width=0.9)
p.xgrid.grid_line_color = None
p.y_range.start = 0
show(p)

## По годам

In [205]:
data_filtered["Год"] = data_filtered["Год"].astype(int)

In [206]:
data_filtered_year = data_filtered[data_filtered["Год"] > 2010].copy()

In [244]:
counts_year = data_filtered_year.groupby(["theme", "Год"]).size()
themes = []
years = sorted(data_filtered_year["Год"].unique())
for t in data_filtered_year["theme"].unique():
    data_source[t] = counts_year[t].values.tolist()
    themes.append(t)
data_source["years"] = [str(i) for i in years]

# counts_year_theme = np.unique(counts_year.index.get_level_values(0)).tolist()
# counts_year_year = counts_year.index.get_level_values(1).unique().tolist()

In [252]:
colors = ["#c9d9d3", "#718dbf", "#e84d60"]

p = figure(
    x_range=data_source["years"],
    plot_height=300,
    title="Распределение тематик по годам",
    toolbar_location=None,
    tools="",
)

p.vbar_stack(
    themes, x="years", width=0.9, color=colors, source=data_source, legend_label=themes
)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

show(p)

## По годам отдельно по темам

In [297]:
def plot_count_theme(theme):
    data_filtered_theme = data_filtered[
        (data_filtered["theme"] == theme) & (data_filtered["Год"] > 2010)
    ].copy()
    counts_year = data_filtered_theme["Год"].value_counts().sort_index()
    years = counts_year.index.tolist()
    years = [str(i) for i in years]

    counts = counts_year.values.tolist()
    p = figure(
        x_range=years,
        plot_height=300,
        title=f"Распределение статей по теме '{theme}' по годам",
        toolbar_location=None,
        tools="",
    )

    p.vbar(x=years, top=counts, width=0.9, color="#e84d60")

    p.y_range.start = 0
    p.x_range.range_padding = 0.1
    p.xgrid.grid_line_color = None
    p.axis.minor_tick_line_color = None
    p.outline_line_color = None

    show(p)


# counts_year_theme = np.unique(counts_year.index.get_level_values(0)).tolist()
# counts_year_year = counts_year.index.get_level_values(1).unique().tolist()

In [300]:
plot_count_theme("Экономика")

## Цитирования

In [302]:
data_filtered["Цитирования"] = pd.to_numeric(data_filtered["Цитирования"])

In [305]:
data_filtered.groupby(["theme", "Год"])["Цитирования"].mean()

theme      Год 
Здоровье   1956    23.000000
           1967     2.000000
           1970     1.000000
           1971     9.000000
           1972    17.000000
                     ...    
Экономика  2017    20.707317
           2018    14.241935
           2019    12.177215
           2020     5.613636
           2021     2.466667
Name: Цитирования, Length: 125, dtype: float64

In [333]:
data_filtered.columns

Index(['Авторы', 'Идентификатор автора(ов)', 'Название', 'Год',
       'Название источника', 'Том', 'Выпуск ', 'Статья №', 'Страница начала',
       'Страница окончания', 'Количество страниц', 'Цитирования', 'DOI',
       'Ссылка', 'Организации', 'Авторы организаций', 'Краткое описание',
       'Ключевые слова автора', 'Ключевые слова указателя',
       'Адрес для корреспонденции', 'Редакторы', 'Издатель', 'ISSN', 'ISBN',
       'CODEN', 'Идентификатор PubMed', 'Язык оригинального документа',
       'Сокращенное название источника', 'Тип документа', 'Стадия публикации',
       'Open Access (открытый доступ)', 'Источник', 'EID', 'theme'],
      dtype='object')

In [306]:
data_filtered_year = data_filtered[data_filtered["Год"] > 2010].copy()
cites_year = data_filtered_year.groupby(["theme", "Год"])["Цитирования"].mean()
themes = []
years = sorted(data_filtered_year["Год"].unique())
for t in data_filtered_year["theme"].unique():
    data_source[t] = cites_year[t].values.tolist()
    themes.append(t)
data_source["years"] = [str(i) for i in years]

# counts_year_theme = np.unique(counts_year.index.get_level_values(0)).tolist()
# counts_year_year = counts_year.index.get_level_values(1).unique().tolist()

In [314]:
colors = ["#c9d9d3", "#718dbf", "#e84d60"]

p = figure(
    x_range=data_source["years"],
    plot_height=300,
    title="Среднее цитирование статей по годам и темам",
    toolbar_location=None,
    tools="",
)

p.vbar_stack(
    themes, x="years", width=0.9, color=colors, source=data_source, legend_label=themes
)

p.y_range.start = 0
p.yaxis.axis_label = "Количество цитирований"
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_right"
p.legend.orientation = "horizontal"

show(p)

In [324]:
data['Цитирования'] = pd.to_numeric(data['Цитирования'])

ValueError: Unable to parse string "Int. J. Environ. Res. Public Health" at position 24

In [332]:
data_filtered[(data_filtered["theme"] == "Здоровье") & (data_filtered["Год"] == 2014)][
    ["Цитирования", "Название"]
].dropna().sort_values(by="Цитирования")

Unnamed: 0,Цитирования,Название
561,4.0,What are the main environmental exposures asso...
544,6.0,Hydraulic fracturing and the risk of silicosis
562,6.0,Modeling the effects of immunizations timing o...
547,6.0,Shale gas regulation in the UK and health impl...
568,7.0,Mansonella ozzardi (Nematoda: Onchocercidae) i...
543,7.0,Sonochemistry in the service of SOFC research
508,8.0,Hospital bioterrorism planning and burn surge
524,9.0,Genetic resources for methane production from ...
495,11.0,Injury rates on new and old technology oil and...
566,18.0,Uneven futures of human lifespans: Reckonings ...


In [326]:
data["Цитирования"]

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1663    NaN
1664    NaN
1665    NaN
1666    NaN
1667    NaN
Name: Цитирования, Length: 3685, dtype: object