# Create Topic Table:

Simple Table: This will list all the topics in order of largest to smallest clusters.
Advanced Table: This will additionally list median attention, median policy citations, and median academic citations.

## Import libraries:

In [None]:
import pandas as pd
import numpy as np
import os

# Plotting libraries:
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Set to display all columns
pd.set_option('display.max_columns', None)

# Change working directory to one folder up so the relative 'Data/...' and
# 'Results/...' paths used below resolve. The guard keeps this cell idempotent:
# re-running it no longer climbs one more level each time.
# NOTE(review): assumes the 'Data' folder sits one level above this notebook.
if not os.path.isdir('Data'):
    os.chdir('..')

# Print working directory
print(os.getcwd())

## Import data:

In [None]:
# Load the cleaned CDC publication table (path is relative to the working
# directory set in the setup cell).
cdc_csv_path = 'Data/FullTable_CDC_Cleaned.csv'
df_cdc = pd.read_csv(cdc_csv_path)

In [None]:
# Print column dtypes and non-null counts to see which columns have missing values.
df_cdc.info()

In [None]:
# Fill NAs: a missing metric is treated as zero.
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection; the chained in-place form is deprecated and does not modify
# df_cdc once pandas Copy-on-Write is active.
metric_columns = [
    'Altmetric Attention Score',
    'Altmetric Policy mentions',
    'Number of Dimensions citations',
    'BMJ Policy citation count',
]
df_cdc[metric_columns] = df_cdc[metric_columns].fillna(0)

In [None]:
# Number of publications per publication year (exploratory check).
df_cdc['Year'].value_counts()

In [None]:
# Summary statistics of the described columns, grouped by topic label (exploratory check).
df_cdc.groupby('Final Label').describe()

In [None]:
# Publications per topic, counted as rows with a non-null 'Year' in each
# 'Final Label' group. Aggregating the single column directly avoids running
# describe() over every numeric column just to read one count; the result is
# already an integer Series indexed by topic label.
simple_data = {
    '# of Publications': df_cdc.groupby('Final Label')['Year'].count(),
}

In [None]:
# Build the simple table and order topics from largest to smallest cluster.
df_simple = pd.DataFrame(simple_data)
df_simple = df_simple.sort_values(by='# of Publications', ascending=False)

In [None]:
# One-row frame holding the number of publications that belong to any topic:
# the per-topic counts (non-null 'Year' per 'Final Label' group) added together.
# Uses a direct, vectorised count/sum rather than describe() plus builtin sum().
n_clustered = df_cdc.groupby('Final Label')['Year'].count().sum()
clustered_total = pd.DataFrame([n_clustered],
                               columns=['# of Publications'],
                               index=['Total - Clustered'])

In [None]:
# One-row frame counting publications with no topic label assigned.
n_unclustered = df_cdc['Final Label'].isna().sum()
unclustered_total = pd.DataFrame(
    {'# of Publications': [n_unclustered]},
    index=['Total - Unclustered'],
)

In [None]:
# One-row frame with the overall number of publications in the data set.
total = pd.DataFrame(
    {'# of Publications': [len(df_cdc)]},
    index=['Total'],
)

In [None]:
# Append the three summary rows underneath the per-topic rows.
summary_rows = [clustered_total, unclustered_total, total]
df_simple = pd.concat([df_simple] + summary_rows)

In [None]:
# Display the table: per-topic rows followed by the three total rows.
df_simple

In [None]:
# Share of all publications represented by each row, as a percentage (0-100)
# rounded to one decimal place.
n_publications = len(df_cdc)
scaled_counts = 100 * df_simple['# of Publications']
df_simple['% of Publications'] = (scaled_counts / n_publications).round(1)

In [None]:
# Display the table again, now including the '% of Publications' column.
df_simple

In [None]:
# Write the simple topic table to Excel. Create the output folder first so the
# export does not fail on a fresh checkout where 'Results' does not exist yet.
os.makedirs('Results', exist_ok=True)
df_simple.to_excel('Results/topics_table_simple.xlsx')

## Add median AAS:

In [None]:
# Start the advanced table from an independent copy of the simple table.
# Plain assignment would only alias df_simple, so every column added to
# df_advanced below would silently appear in df_simple as well.
df_advanced = df_simple.copy()

In [None]:
# Median Altmetric Attention Score per topic; values align to df_advanced on the
# topic label index, so the total rows stay NaN until they are filled in below.
# A direct groupby median replaces describe()['...']['50%'], which computed the
# full set of summary statistics for every numeric column to extract one value.
df_advanced['Median AAS'] = (
    df_cdc.groupby('Final Label')['Altmetric Attention Score'].median()
)

In [None]:
# Display the table with the per-topic 'Median AAS' column added.
df_advanced

In [None]:
# Median attention score over all publications that carry a topic label.
clustered_mask = df_cdc['Final Label'].notna()
clustered_aas = df_cdc[clustered_mask]['Altmetric Attention Score']
df_advanced.loc['Total - Clustered', 'Median AAS'] = np.median(clustered_aas)

In [None]:
# Median attention score over publications without a topic label.
unclustered_mask = df_cdc['Final Label'].isna()
unclustered_aas = df_cdc[unclustered_mask]['Altmetric Attention Score']
df_advanced.loc['Total - Unclustered', 'Median AAS'] = np.median(unclustered_aas)

In [None]:
# Median attention score across every publication, clustered or not.
overall_aas = df_cdc['Altmetric Attention Score']
df_advanced.loc['Total', 'Median AAS'] = np.median(overall_aas)

In [None]:
# Display the table with 'Median AAS' filled in for the three total rows.
df_advanced

In [None]:
# View rows ranked by median attention, highest first. Display only: the sorted
# result is not assigned, so df_advanced keeps its original row order.
df_advanced.sort_values('Median AAS',ascending=False)

## Thoughts:

For the advanced table, we might run into some issues. Specifically, the year range used for academic and policy citations (publications through 2020) differs from the year range used for attention and for the per-topic publication counts (all years), which may make the table too messy. Instead, we can produce supplementary tables that report academic and policy citations.

For consideration, the advanced main table can still report median attention, so let's see what that looks like (above).

In [None]:
#df_advanced['Median Academic Citations'] = df_cdc.groupby('Final Label').describe()['Number of Dimensions citations']['50%']

In [None]:
# Restrict to publications from 2020 or earlier for the citation-based metrics.
# Take an explicit copy: a column is added to this frame later, and writing to a
# boolean-mask slice of df_cdc would raise SettingWithCopyWarning.
df_older = df_cdc[df_cdc['Year'] <= 2020].copy()

In [None]:
# Median academic (Dimensions) citations per topic among the older publications.
# Computed with a direct groupby median instead of a full describe(); the column
# keeps the name '50%' because the following cell sorts on that name.
academic = (
    df_older.groupby('Final Label')['Number of Dimensions citations']
    .median()
    .rename('50%')
    .to_frame()
)

In [None]:
# View topics ranked by median academic citations, highest first (display only).
academic.sort_values('50%',ascending=False)

In [None]:
# Add median academic citations (publications from df_older only) to the
# advanced table: per-topic values first, then the three total rows.
academic_col = 'Number of Dimensions citations'
older_is_clustered = df_older['Final Label'].notna()

# Direct groupby median instead of describe()['...']['50%'], which summarised
# every numeric column just to extract one statistic.
df_advanced['Median Academic Citations'] = (
    df_older.groupby('Final Label')[academic_col].median()
)

# Series.median() returns NaN for an empty subset without the runtime warning
# that np.median emits on empty input.
df_advanced.loc['Total - Clustered', 'Median Academic Citations'] = (
    df_older.loc[older_is_clustered, academic_col].median()
)

df_advanced.loc['Total - Unclustered', 'Median Academic Citations'] = (
    df_older.loc[~older_is_clustered, academic_col].median()
)

df_advanced.loc['Total', 'Median Academic Citations'] = df_older[academic_col].median()

In [None]:
# Display the table with the 'Median Academic Citations' column added.
df_advanced

In [None]:
# Per topic, count older publications with (True) and without (False) at least
# one academic citation, then derive the share that has any.
has_academic = df_older['Number of Dimensions citations'].ne(0)
any_academic = (
    has_academic.groupby(df_older['Final Label'])
    .value_counts()
    # fill_value=0: a topic with no publications in one bucket gets 0, not NaN,
    # so its Total and share below stay defined.
    .unstack(fill_value=0)
    # Guarantee both buckets exist as columns even if one never occurs at all,
    # which would otherwise raise KeyError on the lookups below.
    .reindex(columns=[False, True], fill_value=0)
)

any_academic['Total'] = any_academic[False] + any_academic[True]
# Stored as a fraction between 0 and 1, not a percentage.
any_academic['Any %'] = any_academic[True]/any_academic['Total']

any_academic.sort_values('Any %',ascending=False)

In [None]:
# So that flag for has policy citations works:
# combine both policy sources into a single per-publication count. assign()
# returns a new frame, so no column is written into a slice of df_cdc (which
# would trigger SettingWithCopyWarning).
df_older = df_older.assign(**{
    'All Policy': (df_older['Altmetric Policy mentions']
                   + df_older['BMJ Policy citation count']),
})

In [None]:
# Median BMJ policy citation count per topic among the older publications.
# Direct groupby median instead of a full describe(); the column keeps the name
# '50%' because the following cell sorts on that name.
policy = (
    df_older.groupby('Final Label')['BMJ Policy citation count']
    .median()
    .rename('50%')
    .to_frame()
)

In [None]:
# View topics ranked by median BMJ policy citations, highest first (display only).
policy.sort_values('50%',ascending=False)

In [None]:
# Per topic, count older publications with (True) and without (False) any
# policy citation, based on the combined 'All Policy' count.
has_policy_citation = df_older['All Policy'].ne(0)
any_policy = (
    has_policy_citation.groupby(df_older['Final Label'])
    .value_counts()
    # fill_value=0: a topic with no publications in one bucket gets 0, not NaN.
    .unstack(fill_value=0)
    # Guarantee both the False and True columns exist even if one never occurs.
    .reindex(columns=[False, True], fill_value=0)
)

In [None]:
# Per-topic totals and the share of publications with at least one policy
# citation (stored as a fraction between 0 and 1).
without_policy = any_policy[False]
with_policy = any_policy[True]
any_policy['Total'] = without_policy + with_policy
any_policy['Any %'] = with_policy / any_policy['Total']

any_policy.sort_values(by='Any %', ascending=False)

In [None]:
# Add the share of older publications with at least one policy citation:
# per-topic values first, then the three total rows.
#
# Values are expressed on a 0-100 scale rounded to one decimal so this column
# matches '% of Publications' in the same table (any_policy['Any %'] itself is
# a 0-1 fraction).
older_is_clustered = df_older['Final Label'].notna()
has_policy = df_older['All Policy'] > 0

df_advanced['% with Policy Citations'] = 100 * any_policy['Any %']

# The mean of a boolean Series is the fraction of True values. Unlike
# sum(...)/len(...), it yields NaN rather than ZeroDivisionError when a subset
# is empty (e.g. no unclustered publications in df_older).
df_advanced.loc['Total - Clustered', '% with Policy Citations'] = (
    100 * has_policy[older_is_clustered].mean()
)

df_advanced.loc['Total - Unclustered', '% with Policy Citations'] = (
    100 * has_policy[~older_is_clustered].mean()
)

df_advanced.loc['Total', '% with Policy Citations'] = 100 * has_policy.mean()

df_advanced['% with Policy Citations'] = df_advanced['% with Policy Citations'].round(1)

In [None]:
# Display the advanced table with the '% with Policy Citations' column added.
df_advanced

In [None]:
# Write the advanced topic table to Excel. Create the output folder first so
# the export does not fail when 'Results' does not exist yet.
os.makedirs('Results', exist_ok=True)
df_advanced.to_excel('Results/topic_table_advanced.xlsx')