In [1]:
import pandas as pd
import datetime
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Load the fields-of-study export from disk.
# header=0: column names are taken from the first row of the file.
sourceDF = pd.read_csv(
    'data/FieldsOfStudy.csv',
    header=0,
)

In [3]:
# Parse the 'Date' strings into pandas datetimes.
# .assign builds a new frame instead of mutating the loaded one in place.
sourceDF = sourceDF.assign(Date=pd.to_datetime(sourceDF['Date']))

In [4]:
# Order the rows chronologically, then preview the five latest ones.
sourceDF = sourceDF.sort_values(by='Date', ascending=True)
sourceDF.tail(n=5)


Unnamed: 0,Date,PaperId,FieldOfStudyId,Score,PaperFieldRank,DisplayName,Level,PaperFamilyCount,CitationCount
252653,2020-11-11,3045538049,3079626,0.467303,1,Quantum electrodynamics,1,221988,2772442
116837,2020-11-11,3045253591,121332964,0.371848,2,Physics,0,10085505,73027998
116836,2020-11-11,3045253591,120665830,0.425285,1,Optics,1,4004750,32140069
252654,2020-11-11,3045538049,121332964,0.429845,2,Physics,0,10085505,73027998
136154,2020-11-11,3042542352,49040817,0.407966,1,Optoelectronics,1,3318327,24445739


**Top-level fields of study: paper counts by day**
- Date (day / month / year)
- Field name
- Paper count for that day

In [5]:
# Aggregation input: rows whose field is at Level 0, restricted to the
# columns that the per-day counts need.
topLevelFieldsOnly = sourceDF.loc[
    sourceDF['Level'].eq(0),
    ['Date', 'PaperId', 'FieldOfStudyId', 'DisplayName'],
]
# Daily counts: number of rows for each (Date, DisplayName) pair,
# exposed as a 'PaperCount' column.
topLevelFieldsGroupedCounts = (
    topLevelFieldsOnly
    .groupby(['Date', 'DisplayName'], as_index=False)
    .size()
    .rename(columns={'size': 'PaperCount'})
)
topLevelFieldsGroupedCounts.head()

Unnamed: 0,Date,DisplayName,PaperCount
0,2018-11-11,Business,1
1,2018-11-11,Computer science,35
2,2018-11-11,Geography,1
3,2018-11-11,Mathematics,11
4,2018-11-11,Physics,5


In [6]:
# Monthly top level paper counts
# topLevelFieldsGroupedCounts = topLevelFieldsOnly \
#    .groupby([pd.Grouper(key='Date', freq='M'), pd.Grouper(key='DisplayName')], as_index=False)  \
#   .size().rename(columns={'size':'PaperCount'})

In [11]:
# Alphabetically sorted array of the distinct top-level field names.
topLevelFieldsList = (
    topLevelFieldsGroupedCounts['DisplayName']
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)

In [28]:
# Sparklines: one small line chart per top-level field showing its daily
# paper count, laid out on a grid with numCols charts per row.
numCols = 3
# Derive the row count from the number of fields (ceiling division) so the
# grid always has a cell for every field; at least one row keeps
# make_subplots valid when the field list is empty.
numRows = max(1, -(-len(topLevelFieldsList) // numCols))
colors = ['darkslategray',  'maroon', 'darkgreen', 'yellowgreen', 'darkblue',
          'fuchsia', 'red', 'darkorange', 'gold', 'lime',
          'mediumspringgreen', 'blue', 'mediumaquamarine', 'dodgerblue', 'salmon',
          'deeppink', 'wheat', 'violet', 'lightskyblue']
# Last day (inclusive) shown in the sparklines.
# TODO: make the plotted date window selectable instead of fixed.
sparklineEndDate = datetime.datetime(2018, 11, 30)

# Rows that fall inside the plotted window, shared by every field.
windowCounts = topLevelFieldsGroupedCounts[
    topLevelFieldsGroupedCounts['Date'] <= sparklineEndDate
]
# Common daily axis spanning the whole window. Every field is reindexed onto
# it so that days without papers (including those before a field's first or
# after its last observation) show up as 0 and all charts share one x-range.
if windowCounts.empty:
    windowDays = pd.DatetimeIndex([], name='Date')
else:
    windowDays = pd.date_range(
        start=windowCounts['Date'].min().normalize(),
        end=windowCounts['Date'].max().normalize(),
        freq='D',
        name='Date',
    )

# A plain list of titles avoids truth-value checks on a NumPy array.
sparklinesFig = make_subplots(
    rows=numRows,
    cols=numCols,
    subplot_titles=list(topLevelFieldsList)
)
for index, field in enumerate(topLevelFieldsList):
    # Daily paper totals for this field. 'Date' is moved into the index (not
    # kept as a column as well) and only 'PaperCount' is summed, so resample
    # never tries to add up datetime values.
    dailyCounts = (
        windowCounts.loc[windowCounts['DisplayName'] == field]
        .set_index('Date')['PaperCount']
        .resample('D')
        .sum()
        .reindex(windowDays, fill_value=0)
        .rename_axis('Date')
    )
    monthlyFieldDf = dailyCounts.reset_index()
    # Grid position: fill each row left to right (plotly rows/cols are 1-based).
    gridRow, gridCol = divmod(index, numCols)
    row = gridRow + 1
    col = gridCol + 1
    # add scatter to figure; the palette wraps around if there are more
    # fields than colors
    sparklinesFig.add_trace(
        go.Scatter(
            x=monthlyFieldDf['Date'],
            y=monthlyFieldDf['PaperCount'],
            mode='lines',
            name=field,
            marker={'color': colors[index % len(colors)]}
        ),
        row=row,
        col=col
    )
# figure level configuration
sparklinesFig.update_layout(
    template='simple_white',
    showlegend=False
)
# sparklines carry no axes: hide grid, zero line and the axes themselves
sparklinesFig.update_xaxes(
    showgrid=False,
    zeroline=False,
    visible=False
)
sparklinesFig.update_yaxes(
    showgrid=False,
    zeroline=False,
    visible=False
)

sparklinesFig.show()