In [1]:
import pandas as pd
import numpy as np
import openai
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Read NU.csv into `df`, the DataFrame the plotting cells below draw from.
df = pd.read_csv(filepath_or_buffer='NU.csv')

In [33]:
import pandas as pd
import plotly.express as px

# Tally the rows per publication year, in chronological order, as a
# two-column frame (publication_year, count).
year_counts = (
    df['publication_year']
    .value_counts()
    .sort_index()
    .rename_axis('publication_year')
    .reset_index(name='count')
)

# One bar per publication year.
fig = px.bar(
    data_frame=year_counts,
    x='publication_year',
    y='count',
    title='NU sjukvården antal artiklar publicerade per år',
    labels={'publication_year': 'Publication Year', 'count': 'Number of Articles'},
    color_discrete_sequence=['skyblue'],
)

# Linear tick mode so every year gets its own tick; the axis titles given
# here override the ones derived from `labels` above.
fig.update_layout(
    xaxis_tickmode='linear',
    xaxis_title='År',
    yaxis_title='Artiklar',
    bargap=0.2,
)

fig.show()

In [34]:
import plotly.express as px

# Histogram of the `countries_distinct_count` column (nbins=20), rendered
# with the plotly_white template and a small gap between bars.
fig = px.histogram(
    data_frame=df,
    x='countries_distinct_count',
    nbins=20,
    title='NU-sjukvården: Antal nationaliteter per artikel',
    labels={'countries_distinct_count': 'Number of Distinct Countries'},
).update_layout(
    template='plotly_white',
    bargap=0.1,
    xaxis_title='Number of Distinct Countries',
    yaxis_title='Frequency',
)

fig.show()

In [37]:
import plotly.express as px

# Histogram of the `fwci` column (nbins=100), rendered with the
# plotly_white template and a small gap between bars.
fig = px.histogram(
    data_frame=df,
    x='fwci',
    nbins=100,
    title='NU-sjukvården forskningskvalitet (FWCI)',
    labels={'fwci': 'Field-Weighted Citation Impact (FWCI)'},
).update_layout(
    template='plotly_white',
    bargap=0.1,
    xaxis_title='FWCI',
    yaxis_title='Frequency',
)

fig.show()

FWCI (Field-Weighted Citation Impact) is the ratio between the number of citations an output has actually received to date and the "expected" number of citations for an output with similar characteristics. "Expected" refers to the average number of citations over the previous three years for all Scopus outputs of the same age, document type and field.

In [39]:
import pandas as pd
import plotly.express as px

# Stacked bar chart: publications per year, split by primary-topic subfield.
topic_col = 'primary_topic.subfield.display_name'

# A subfield is charted only if it has at least this many publications in
# the whole dataset. (The earlier comment said 5, but the code has always
# filtered on 2; the executed value is kept.)
min_publications = 2

# Publications per subfield; value_counts skips missing values by default,
# so rows without a subfield are not counted and are filtered out below.
topic_counts = df[topic_col].value_counts()

# Keep only the rows whose subfield reaches the threshold.
frequent_topics = topic_counts[topic_counts >= min_publications].index
filtered_df = df[df[topic_col].isin(frequent_topics)]

# One row per (year, subfield) pair with the number of publications.
filtered_df_grouped = (
    filtered_df
    .groupby(['publication_year', topic_col])
    .size()
    .reset_index(name='count')
)

fig = px.bar(
    filtered_df_grouped,
    x='publication_year',
    y='count',
    color=topic_col,
    title='NU sjukvården antal artiklar publicerade per ämnesområde',
    labels={'publication_year': 'Publication Year', 'count': 'Count of Publications'},
)

# Vertical legend anchored just outside the right edge of the plot area.
fig.update_layout(
    xaxis_title='Publication Year',
    xaxis_tickangle=45,
    yaxis_title='Count of Publications',
    legend=dict(
        title='Primary Topic',
        orientation='v',
        x=1.02,
        y=1,
        xanchor='left',
        yanchor='top',
    ),
)

fig.show()

In [41]:
import pandas as pd
import plotly.express as px

# Horizontal box plot: one box per subfield on the y axis, citation counts
# on the x axis. Each subfield gets its own colour, so the legend would
# only repeat the y-axis labels and is hidden.
subfield_col = 'primary_topic.subfield.display_name'

fig = px.box(
    data_frame=df,
    x='cited_by_count',
    y=subfield_col,
    color=subfield_col,
    title='NU-sjukvården: Antal citat (ej normaliserat) per ämnesområde',
    labels={
        subfield_col: 'Subfield',
        'cited_by_count': 'Total Citation Count',
    },
)

# Fixed height of 800 px so the subfield labels have room.
fig.update_layout(
    height=800,
    showlegend=False,
    xaxis_title='Citation Count',
    yaxis_title='Subfield',
)

fig.show()

In [42]:
import pandas as pd
import plotly.express as px

# Stacked bar chart: publications per year, split by primary topic.
topic_col = 'primary_topic.display_name'

# A topic is charted only if it has at least this many publications in the
# whole dataset. (The earlier comment said 5, but the code has always
# filtered on 2; the executed value is kept.)
min_publications = 2

# Publications per topic; value_counts skips missing values by default, so
# rows without a topic are not counted and are filtered out below.
topic_counts = df[topic_col].value_counts()

# Keep only the rows whose topic reaches the threshold.
frequent_topics = topic_counts[topic_counts >= min_publications].index
filtered_df = df[df[topic_col].isin(frequent_topics)]

# One row per (year, topic) pair with the number of publications.
filtered_df_grouped = (
    filtered_df
    .groupby(['publication_year', topic_col])
    .size()
    .reset_index(name='count')
)

fig = px.bar(
    filtered_df_grouped,
    x='publication_year',
    y='count',
    color=topic_col,
    title='NU sjukvården: Antal artiklar per subkategori',
    labels={'publication_year': 'Publication Year', 'count': 'Count of Publications'},
)

# Vertical legend anchored just outside the right edge of the plot area.
fig.update_layout(
    xaxis_title='Publication Year',
    xaxis_tickangle=45,
    yaxis_title='Count of Publications',
    legend=dict(
        title='Primary Topic',
        orientation='v',
        x=1.02,
        y=1,
        xanchor='left',
        yanchor='top',
    ),
)

fig.show()

In [43]:

import pandas as pd
import plotly.express as px

# Stacked bar chart: publications per year, split by primary-topic field.
topic_col = 'primary_topic.field.display_name'

# A field is charted only if it has at least this many publications in the
# whole dataset. (The earlier comment said 5, but the code has always
# filtered on 2; the executed value is kept.)
min_publications = 2

# Publications per field; value_counts skips missing values by default, so
# rows without a field are not counted and are filtered out below.
topic_counts = df[topic_col].value_counts()

# Keep only the rows whose field reaches the threshold.
frequent_topics = topic_counts[topic_counts >= min_publications].index
filtered_df = df[df[topic_col].isin(frequent_topics)]

# One row per (year, field) pair with the number of publications.
filtered_df_grouped = (
    filtered_df
    .groupby(['publication_year', topic_col])
    .size()
    .reset_index(name='count')
)

fig = px.bar(
    filtered_df_grouped,
    x='publication_year',
    y='count',
    color=topic_col,
    title='NU sjukvården, antal artiklar per övergripande disciplin',
    labels={'publication_year': 'Publication Year', 'count': 'Count of Publications'},
)

# Vertical legend anchored just outside the right edge of the plot area.
fig.update_layout(
    xaxis_title='Publication Year',
    xaxis_tickangle=45,
    yaxis_title='Count of Publications',
    legend=dict(
        title='Primary Topic',
        orientation='v',
        x=1.02,
        y=1,
        xanchor='left',
        yanchor='top',
    ),
)

fig.show()


In [44]:
import plotly.express as px

# Number of rows per publication source, most frequent first, as a
# two-column frame (Source, Count).
source_counts = (
    df['primary_location.source.display_name']
    .value_counts()
    .rename_axis('Source')
    .reset_index(name='Count')
)

# One bar per source; the x tick labels are tilted by -45 degrees.
fig = px.bar(
    data_frame=source_counts,
    x='Source',
    y='Count',
    title='Var publicerar NU sjukvården?',
    labels={'Source': 'Source Display Name', 'Count': 'Number of Publications'},
    template='plotly_white',
).update_layout(
    xaxis_tickangle=-45,
    xaxis_title='Source Display Name',
    yaxis_title='Number of Publications',
)

fig.show()

In [45]:
import pandas as pd
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np

# Expects `df` to carry a 'keywords.display_name' column whose entries are
# '|'-separated keyword strings.

# Concatenate every non-missing entry with spaces, then turn the '|'
# separators into spaces as well, giving one flat text for the word cloud.
all_keywords = ' '.join(df['keywords.display_name'].dropna())
keywords_text = all_keywords.replace('|', ' ')

# Build the 800x400 word cloud on a white background.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    collocations=False,
).generate(keywords_text)

# Convert the cloud to a pixel array, since Plotly has no native word-cloud
# trace and the cloud is shown as a plain image.
image_array = np.array(wordcloud)

fig = px.imshow(image_array, title="MeSH-termer, ordmoln")
# Pixel coordinates carry no meaning here, so hide the tick labels.
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)
fig.show()

In [46]:
import pandas as pd
import plotly.express as px
from collections import Counter

# Collect every country entry from the '|'-separated 'authorships.countries'
# column. Each row is split on its own: concatenating the rows with a space
# before splitting would fuse the last country of one row with the first
# country of the next (e.g. "US SE"), creating bogus categories and
# under-counting the real ones. Missing rows and empty tokens are skipped.
all_countries_list = [
    country.strip()
    for row_value in df['authorships.countries'].dropna().astype(str)
    for country in row_value.split('|')
    if country.strip()
]

# Count occurrences of each country.
# NOTE(review): a country listed several times within one row is counted
# each time, so 'Count' is occurrences rather than distinct publications —
# confirm this matches the 'Number of Publications' axis label.
country_counts = Counter(all_countries_list)

# Tabulate the counts, most frequent country first.
country_df = (
    pd.DataFrame(country_counts.items(), columns=['Country', 'Count'])
    .sort_values(by='Count', ascending=False)
)

# Bar chart with bars coloured by their count on the Viridis scale.
fig = px.bar(country_df, x='Country', y='Count', title='Samverkan med andra länder',
             labels={'Country': 'Country', 'Count': 'Number of Publications'},
             color='Count', color_continuous_scale='Viridis')

fig.show()

In [48]:
import pandas as pd
import plotly.express as px
from collections import Counter

# Collect every funder entry from the '|'-separated
# 'grants.funder_display_name' column. Each row is split on its own:
# concatenating the rows with a space before splitting would fuse the last
# funder of one row with the first funder of the next into a single bogus
# name and under-count the real ones. Missing rows and empty tokens are
# skipped.
all_funders_list = [
    funder.strip()
    for row_value in df['grants.funder_display_name'].dropna().astype(str)
    for funder in row_value.split('|')
    if funder.strip()
]

# Count occurrences of each funder.
# NOTE(review): a funder listed several times within one row (e.g. several
# grants) is counted each time, so 'Count' is occurrences rather than
# distinct publications — confirm this matches the axis label.
funder_counts = Counter(all_funders_list)

# Tabulate the counts, most frequent funder first.
funder_df = (
    pd.DataFrame(funder_counts.items(), columns=['Funder', 'Count'])
    .sort_values(by='Count', ascending=False)
)

# Bar chart with bars coloured by their count on the Viridis scale.
fig = px.bar(funder_df, x='Funder', y='Count', title='Publikationer per finansiär (kan vara beviljad till annan forskare än NU-sjukvårdens)',
             labels={'Funder': 'Funder', 'Count': 'Number of Publications'},
             color='Count', color_continuous_scale='Viridis')

fig.show()