# Generate dataframes with sample and study counts over time

In [3]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
from matplotlib.colors import ListedColormap

def query_tree(tdf,search_term):
    """Collect the MeSH ids of a term together with everything filed under it.

    Parameters
    ----------
    tdf : pandas.DataFrame
        Tree table with ``mesh_id`` and ``mesh_treenumbers`` columns.
    search_term
        ``mesh_id`` value whose tree numbers are used as prefixes.

    Returns
    -------
    numpy.ndarray
        Sorted unique ``mesh_id`` values whose tree number starts with any
        tree number of ``search_term`` (the term itself included).
    """
    tree_numbers = tdf.mesh_treenumbers
    root_numbers = tree_numbers[tdf.mesh_id == search_term].tolist()
    collected = []
    for root in root_numbers:
        # a row belongs to the subtree when its tree number begins with ``root``
        is_below = tree_numbers.str[:len(root)] == root
        collected.extend(tdf.mesh_id[is_below].to_list())
    return np.unique(collected)

def query_tags(tadf, mesh_list, time_range):
    """Select the tag rows that carry one of the given MeSH ids within a year window.

    Parameters
    ----------
    tadf : pandas.DataFrame
        Tag table with ``mesh_id`` and ``year`` columns.
    mesh_list : array-like
        MeSH ids to keep.
    time_range : sequence
        ``[first_year, last_year]``, each convertible with ``int``; both ends
        are included.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``tadf`` with all columns.
    """
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    has_mesh = np.isin(tadf.mesh_id, mesh_list)
    in_window = (tadf.year <= last_year) & (tadf.year >= first_year)
    return tadf.loc[has_mesh & in_window, :]

def get_counts_per_year(matchdf,time_range):
    """Count distinct studies per year, zero-filling years without any study.

    Parameters
    ----------
    matchdf : pandas.DataFrame
        Tag matches with at least ``geo_id`` and ``year`` columns; the same
        ``geo_id`` may occupy several rows.
    time_range : sequence
        ``[first_year, last_year]``, each convertible with ``int``. Both ends
        are included in the output.

    Returns
    -------
    pandas.DataFrame
        Columns ``year`` and ``c``, where ``c`` is the number of distinct
        ``geo_id`` values seen in that year (0 for years without matches).
    """
    # one row per (study, year); rows with a missing key are left out
    studies = matchdf[['geo_id', 'year']].dropna().drop_duplicates()
    cpy = studies.groupby('year').size().reset_index(name='c')
    # make sure that all years are in df; +1 because np.arange excludes its
    # stop value and the last year of the range has to be covered as well
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    all_years = pd.DataFrame({'year': np.arange(first_year, last_year + 1)})
    cpy_merge = pd.merge(all_years, cpy, on='year', how='outer')
    cpy_merge['c'] = cpy_merge['c'].fillna(0)
    return cpy_merge

def get_samples_per_year(matchdf,time_range):
    """Sum the sample counts of the studies in each year, zero-filling empty years.

    Parameters
    ----------
    matchdf : pandas.DataFrame
        Tag matches with ``geo_id``, ``year`` and ``nsamples`` columns; the
        same ``geo_id`` may occupy several rows.
    time_range : sequence
        ``[first_year, last_year]``, each convertible with ``int``. Both ends
        are included in the output.

    Returns
    -------
    pandas.DataFrame
        Columns ``year`` and ``nsamples``; ``nsamples`` is the per-year sum of
        the per-(``geo_id``, ``year``) mean of ``nsamples`` (0 for years
        without matches).
    """
    # collapse repeated rows of a study to a single value before summing
    per_study = matchdf.groupby(['geo_id', 'year']).nsamples.mean().reset_index()
    spy = per_study.groupby('year').nsamples.sum().reset_index()
    # +1 because np.arange excludes its stop value and the last year of the
    # range has to be covered as well
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    all_years = pd.DataFrame({'year': np.arange(first_year, last_year + 1)})
    spy = pd.merge(all_years, spy, on='year', how='outer')
    spy['nsamples'] = spy['nsamples'].fillna(0)
    return spy

def normalise_it(df):
    """Add a ``c_norm`` column holding each ``c`` value as a fraction of the column total.

    The column is written onto ``df`` itself, and that same object is returned.
    """
    total = df['c'].sum()
    df['c_norm'] = df['c'] / total
    return df

def add_all_years(p, time_range):
    """Pad a per-year table of one MeSH heading so every year of the range has a row.

    Parameters
    ----------
    p : pandas.DataFrame
        Table with ``year``, ``n`` and ``mesh_heading`` columns. Must hold at
        least one row, because the heading used for the padded rows is taken
        from its first distinct ``mesh_heading`` value.
    time_range : sequence
        ``[first_year, last_year]``, each convertible with ``int``. Both ends
        are included in the output.

    Returns
    -------
    pandas.DataFrame
        ``p`` merged onto the full list of years; added rows get ``n`` = 0 and
        the heading described above.
    """
    mesh = p.mesh_heading.unique()[0]
    # +1 because np.arange excludes its stop value and the last year of the
    # range has to be covered as well
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    all_years = pd.DataFrame({'year': np.arange(first_year, last_year + 1)})
    p_merge = pd.merge(all_years, p, on='year', how='outer')
    p_merge['n'] = p_merge['n'].fillna(0)
    p_merge.loc[p_merge.mesh_heading.isnull(), 'mesh_heading'] = mesh
    return p_merge


# directory holding the pickled input tables
path = '../../data/final'

# MeSH tree table
tree_file = 'mesh.pkl'
trdf = pd.read_pickle(os.path.join(path, tree_file))

# tag table: the year is the first four characters of the date string, and
# every row gets the tree numbers of its MeSH id attached
tag_file ='geo_filtered.pkl'
tadf = pd.read_pickle(os.path.join(path, tag_file))
tadf['year'] = tadf['date'].str[:4].astype(int)
tadf = tadf.merge(trdf[['mesh_id', 'mesh_treenumbers']], how='left', on='mesh_id')

# ranked MeSH ids (by number of series), joined with the tree table and
# restricted to category 'C'
count_file ='meshids_rankedby_NSeries.pkl'
countsdf = pd.read_pickle(os.path.join(path,count_file))
countsdf = countsdf.merge(trdf, how='left', on='mesh_id')
countsdf = countsdf[np.isin(countsdf['category'], ['C'])].reset_index(drop=True)

# inclusive year window used for all queries below
min_year, max_year = '2000', '2017'
time_range = [min_year, max_year]

# Generate time series

In [None]:
ntopdisease = 200

# countsdf carries a 0..n-1 index, so the label-based ``.loc[:ntopdisease]``
# would also return row ``ntopdisease``; select exactly the first
# ``ntopdisease`` rows by position instead
mesh_ids = countsdf['mesh_id'].iloc[:ntopdisease].to_list()

# copy, so the columns added below go to an independent frame rather than to
# a slice of trdf
top_d_df = trdf.loc[np.isin(trdf.mesh_id, mesh_ids),:].copy()
# build 3 levels of hierarchy from the length of the tree number:
# shorter than 4 characters -> 1, 4 to 20 characters -> 2, longer -> 3
top_d_df['level'] = top_d_df.mesh_treenumbers.str.len()
top_d_df.loc[top_d_df['level']<4,'level'] = 1
top_d_df.loc[(top_d_df['level']>=4) & (top_d_df['level']<=20),'level'] = 2
top_d_df.loc[top_d_df['level']>20,'level'] = 3
top_d_df = top_d_df.reset_index(drop=True)

# NOTE(review): top_d_df has one row per matching trdf row, so a mesh_id that
# occurs on several rows is presumably processed more than once below —
# confirm that the repeated rows are intended downstream
mesh_ids = top_d_df.mesh_id.to_list()
mesh_headings = top_d_df.mesh_heading.to_list()

# gather the per-term tables in lists and concatenate once at the end
# (DataFrame.append no longer exists in pandas 2.x and copied the whole
# accumulated frame on every iteration)
count_frames = []
sample_frames = []
for iid, ihead in zip(mesh_ids, mesh_headings):
    mesh_list = query_tree(trdf,iid)
    # copy before adding the helper column so the filtered rows of tadf are
    # not written through; ``year`` is already present on tadf (taken from
    # the same first four characters of ``date``), so it is not recomputed
    matchdf = query_tags(tadf, mesh_list, time_range).copy()
    matchdf['aux'] = 1
    cpy = get_counts_per_year(matchdf,time_range)
    cpy['mesh_id'] = iid
    cpy['mesh_heading'] = ihead
    spy = get_samples_per_year(matchdf,time_range)
    spy['mesh_id'] = iid
    spy['mesh_heading'] = ihead
    count_frames.append(cpy)
    sample_frames.append(spy)
# pd.concat rejects an empty list, so fall back to an empty frame
countdf = pd.concat(count_frames) if count_frames else pd.DataFrame()
sampledf = pd.concat(sample_frames) if sample_frames else pd.DataFrame()

# add parent column to df
# created up front so the column exists even when there is no level-1 row
top_d_df['parent'] = pd.Series(np.nan, index=top_d_df.index, dtype=object)
n = 1 # start with level 1
l1_tree_list = top_d_df.loc[top_d_df.level==n].mesh_treenumbers.tolist()
l1_mesh_headings = top_d_df.loc[top_d_df.level==n].mesh_heading.tolist()
for itree, iheading in zip(l1_tree_list, l1_mesh_headings):
    nchar = len(itree)
    # a level n+1 row is a child when its tree number starts with the parent's
    top_d_df.loc[(top_d_df.level==n+1)&(top_d_df.mesh_treenumbers.str[:nchar]==itree),'parent'] = iheading
top_d_df = pd.merge(countsdf[['mesh_id', 'n']],top_d_df, on=['mesh_id'], how='inner')
top_d_df = top_d_df.sort_values('n', ascending=False)

# give the sample table the same value column name as the count table
sampledf = sampledf.rename(index=str, columns = {'nsamples':'c'})

top_d_df.to_pickle('../../data/final/top_diseases_for_plotting.pkl')
countdf.to_pickle('../../data/final/countsbyyear_for_plotting.pkl')
sampledf.to_pickle('../../data/final/samplesbyyear_for_plotting.pkl')