In [1]:
#Packages
import os
import pandas as pd
import itertools
import numpy as np
import pickle
import plotly
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
from operator import itemgetter

In [2]:
#Load Data
#Cross-topic shares: one row per topic pair (index labels like '1x2'), one column per year
moving_average_shares = pd.read_csv('../temporary/moving_average_shares.csv', index_col='Unnamed: 0')
topics = pd.read_csv('../temporary/topics.csv')
volume_topics = pd.read_csv('../temporary/topic_weights.csv')

#volume metadata
metapath = '../input/metadata.p'
#NOTE: unpickling can execute arbitrary code, so only load a metadata file from a trusted source.
#The context manager makes sure the file handle is closed once the object is loaded.
with open(metapath, 'rb') as metafile:
    metadata = pickle.load(metafile)

#Numeric copy of the publication year. No rounding is applied here despite the column name.
metadata['Year_rounded'] = pd.to_numeric(metadata['Year'])
metadata['Year'] = pd.to_numeric(metadata['Year'], downcast='signed')
def fix_htid(row):
    """Return the row's 'HTID' value with ':' replaced by '+' and '/' replaced by '='.

    `row` is anything indexable by the key 'HTID' (e.g. a DataFrame row) whose
    value is a string. All other characters are returned unchanged.
    """
    substitutions = str.maketrans({":": "+", "/": "="})
    identifier = row['HTID']
    return identifier.translate(substitutions)

#Rewrite the metadata identifiers with fix_htid so they can be joined on 'HTID' below
metadata['HTID'] = metadata.apply(fix_htid, axis=1)

#match volume years and fix to be compatible with category weights
#Inner join: volumes without a metadata row are dropped
volume_topics = pd.merge(volume_topics, metadata, on = 'HTID', how = 'inner').drop(columns=['oclc','Year'])

#Clamp years into 1510-1890 (values above 1890 become 1890, values below 1510 become 1510).
#Vectorised equivalent of looping over the rows; missing years are left untouched.
volume_topics['Year_rounded'] = volume_topics['Year_rounded'].clip(lower=1510, upper=1890)


moving_average_shares

Unnamed: 0,1510,1511,1512,1513,1514,1515,1516,1517,1518,1519,...,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890
1x2,0.000010,1.993856e-06,1.890401e-06,1.797580e-06,1.797580e-06,0.000025,0.000023,0.000023,0.000023,0.000023,...,0.002293,0.002321,0.002335,0.002333,0.002247,0.002226,0.002238,0.002245,0.002236,0.002221
1x3,0.000005,1.068301e-06,1.005507e-06,9.554093e-07,9.554093e-07,0.000012,0.000011,0.000011,0.000011,0.000011,...,0.000669,0.000691,0.000721,0.000737,0.000755,0.000758,0.000769,0.000787,0.000802,0.000786
1x4,0.000009,2.563470e-06,2.364779e-06,2.242189e-06,2.242189e-06,0.000020,0.000019,0.000019,0.000019,0.000019,...,0.000735,0.000739,0.000704,0.000693,0.000674,0.000659,0.000636,0.000626,0.000647,0.000630
1x5,0.000014,1.480231e-06,1.488265e-06,1.423555e-06,1.423555e-06,0.000212,0.000199,0.000199,0.000199,0.000199,...,0.001772,0.001853,0.001881,0.001907,0.001958,0.001996,0.002015,0.002068,0.002129,0.002141
1x6,0.000002,1.352189e-07,1.484902e-07,1.431995e-07,1.431995e-07,0.000005,0.000006,0.000006,0.000006,0.000006,...,0.000304,0.000328,0.000336,0.000359,0.000374,0.000373,0.000368,0.000363,0.000350,0.000349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57x59,0.000011,1.248851e-05,1.004193e-04,9.509673e-05,9.509673e-05,0.000084,0.000079,0.000079,0.000079,0.000079,...,0.000035,0.000035,0.000033,0.000033,0.000033,0.000033,0.000032,0.000033,0.000032,0.000033
57x60,0.000002,1.767700e-07,7.619845e-04,7.197240e-04,7.197240e-04,0.000632,0.000595,0.000595,0.000595,0.000595,...,0.000128,0.000133,0.000133,0.000142,0.000153,0.000153,0.000149,0.000148,0.000153,0.000155
58x59,0.003315,3.998958e-03,3.537325e-03,3.338580e-03,3.338580e-03,0.002930,0.002758,0.002758,0.002758,0.002758,...,0.000198,0.000202,0.000204,0.000199,0.000200,0.000204,0.000205,0.000209,0.000214,0.000214
58x60,0.000104,1.116548e-04,9.957842e-05,9.400082e-05,9.400082e-05,0.000086,0.000081,0.000081,0.000081,0.000081,...,0.000916,0.000920,0.000944,0.000950,0.000983,0.001013,0.001030,0.001046,0.001066,0.001057


In [3]:
#Preview the topics table (the output below shows columns '1', '2' and 'topic_number')
topics

Unnamed: 0,1,2,topic_number
0,0.14322,paint pictur artist music engrav painter colou...,1
1,0.25957,town road church build built river stone wall ...,2
2,0.124,franc pari french loui madam duke count napole...,3
3,0.19923,church christian christ bishop holi paul doctr...,4
4,0.36643,love heart beauti soul sweet dark night earth ...,5
5,0.05915,india chines china nativ indian bengal govern ...,6
6,0.15907,fig water iron engin pressur steam electr air ...,7
7,0.07679,acid solut heat carbon water sulphur iron gas ...,8
8,0.49381,exist refer period similar consist occur conne...,9
9,0.0733,vol lond fol folio calf copi pari par morocco ...,10


In [4]:
def category_shares(topics, ctshares, year, categories):
    """Compute each topic's relative share of every category for one year.

    For every topic and every category, the cross-topic shares between the topic
    and the category's member topics are summed. A topic that is itself a member
    of the category has one pair fewer than an outside topic (it is not paired
    with itself), so its sum is rescaled by n / (n - 1), where n is the number of
    distinct topics in the category. For 3-topic categories this is the factor 1.5.
    Finally each topic's scores are divided by their total so they sum to 1
    across categories.

    Parameters
    ----------
    topics : DataFrame
        Must have a column 'topic_number' with the number of each topic.
    ctshares : DataFrame
        Cross-topic shares as values, years as (string) columns, and index
        labels of the form 'topic1xtopic2' with the smaller number first, e.g. '3x4'.
    year : int or str
        Year to evaluate; `str(year)` must be a column of `ctshares`.
    categories : dict
        Category name -> list of the category's topic numbers,
        e.g. {'Category': [1, 2, 3]}. Categories may have any size.

    Returns
    -------
    DataFrame
        Indexed by topic number, one column per category (in the order of
        `categories`). A topic whose scores are all zero gets NaN in its row.

    Raises
    ------
    KeyError
        If the year column or a required topic-pair label is missing from `ctshares`.
    """
    topic_numbers = list(topics['topic_number'])
    shares = ctshares[str(year)] #grab column with cross-topic shares for the year

    scores = {}
    for name, category in categories.items():
        members = set(category)
        cross_sum = {}
        for topic in topic_numbers:
            partners = members - {topic} #a topic is never paired with itself
            #sort each pair so the label matches the 'smaller x larger' index of 'shares'
            labels = ['x'.join(map(str, sorted((topic, partner)))) for partner in partners]
            score = sum(shares[label] for label in labels)
            if topic in members and partners:
                #member topics have len(members) - 1 pairs instead of len(members); rescale to compensate
                score *= len(members) / len(partners)
            cross_sum[topic] = score
        scores[name] = cross_sum

    df = pd.DataFrame.from_dict(scores)

    total = df.sum(axis=1) #get total of category scores
    cat_shares = df.div(total, axis=0) #divide by total, so that the category shares of each topic sum to 1

    return cat_shares

In [5]:
#Topic numbers belonging to each category; the keys become the category column labels
categories = {
    'Religion':[4,12,52],
    'Science':[7,8,41],
    'Political Economy':[33,34,47]
    }

#Every year from 1510 through 1890 inclusive
years = list(range(1510,1891))


In [6]:
#Category shares of every topic, one DataFrame per year, keyed by the year as an int
topic_shares = {
    year: category_shares(
        topics = topics,
        ctshares = moving_average_shares,
        year = year,
        categories = categories,
    )
    for year in years
}

topic_shares[1550]

Unnamed: 0,Religion,Science,Political Economy
1,0.769908,0.001331,0.22876
2,0.799548,0.006658,0.193793
3,0.808955,0.002621,0.188424
4,0.809775,0.001638,0.188587
5,0.932808,0.000494,0.066698
6,0.928216,0.014389,0.057395
7,0.909221,0.006309,0.08447
8,0.932187,0.003544,0.064269
9,0.868286,0.005064,0.12665
10,0.802223,0.010861,0.186916


In [7]:
#Category weights of every volume: its topic weights (1 x n_topics) times the
#topic-by-category shares of its year (n_topics x n_categories) --> (1 x n_categories).
#Computed with one matrix product per year instead of one small DataFrame per volume.
topic_columns = [str(i) for i in topics['topic_number']]
#select the category columns explicitly so extra columns added to 'topic_shares' later (e.g. 'Color') do not break a re-run
category_columns = list(categories)

volume_years = volume_topics['Year_rounded'].astype(int)
volume_weights = volume_topics[topic_columns].to_numpy(dtype=float) #(n_volumes, n_topics)

category_weights = np.empty((len(volume_topics), len(category_columns)))
#.indices maps each year to the row positions of the volumes of that year
for year, positions in volume_years.groupby(volume_years).indices.items():
    year_shares = topic_shares[int(year)][category_columns].to_numpy(dtype=float) #(n_topics, n_categories)
    category_weights[positions] = volume_weights[positions] @ year_shares

#Rows stay in the order of 'volume_topics'; the index is 0..n-1 (previously every row had index 0)
volumes = pd.DataFrame(category_weights, columns = category_columns)
volumes['HTID'] = volume_topics['HTID'].to_numpy()

#export
volumes.to_csv('../temporary/volumes.csv')

In [8]:
#Topic ternary plots-much easier to do it within this script so as to not have to export/import dictionary of dataframes

#Get colors of each topic
#Uses 1850 as a basis for whether topic is "science, religion, or politics":
#each topic is labelled with the category holding its largest share in 1850.
#The labels do not depend on the loop variable, so they are computed once and reused for every year.
reference_colors = topic_shares[1850][['Religion','Science','Political Economy']].idxmax(axis=1)
for year in years:
    topic_shares[year]['Color'] = reference_colors

topic_shares[1550]

Unnamed: 0,Religion,Science,Political Economy,Color
1,0.769908,0.001331,0.22876,Political Economy
2,0.799548,0.006658,0.193793,Political Economy
3,0.808955,0.002621,0.188424,Political Economy
4,0.809775,0.001638,0.188587,Religion
5,0.932808,0.000494,0.066698,Religion
6,0.928216,0.014389,0.057395,Political Economy
7,0.909221,0.006309,0.08447,Science
8,0.932187,0.003544,0.064269,Science
9,0.868286,0.005064,0.12665,Political Economy
10,0.802223,0.010861,0.186916,Religion


In [9]:
#Write one ternary scatter plot of the topics per year to ../output/topic_triangles/<year>.png
for year in years:
    with_legend = (year == 1850) #only the 1850 figure carries a legend

    fig = px.scatter_ternary(
        topic_shares[year],
        a = 'Religion', b = 'Political Economy', c = 'Science',
        color = 'Color',
        color_discrete_map = {'Science':'green', 'Religion': 'blue', 'Political Economy':'red'},
        template = 'simple_white',
        symbol = "Color",
        symbol_map = {'Science': 'circle','Religion': 'cross', 'Political Economy': 'triangle-up'},
    )

    fig.update_traces(showlegend = with_legend, marker = {'size': 10})
    fig.update_layout(title_text = str(year), title_font_size = 30, font_size = 20)

    export_options = {}
    if with_legend:
        #add legend and adjust size for 1850
        fig.update_layout(legend = dict(y=0.5), legend_title_text = 'Legend')
        export_options['width'] = 900

    fig.write_image('../output/topic_triangles/' + str(year) +'.png', **export_options)



In [None]:
#NOTE(review): 'moving_totals' is not defined in any cell visible in this notebook and this cell
#has no execution count -- it looks like a leftover from an earlier version; confirm before running.
#The two former branches differed only in the legend and the export width, so they are merged here.
for year in years:
    with_legend = (year == 1850) #only the 1850 figure carries a legend and is exported wider

    fig = px.scatter_ternary(moving_totals[str(year)],
                             a='Religion', b = 'Political Economy', c = 'Science',
                             hover_name = 'Topic',
                             color = 'Color',
                             color_discrete_map = {'Science':'green', 'Religion': 'blue', 'Political Economy':'red'},
                             template = 'simple_white',
                             symbol = "Color",
                             symbol_map = {'Science': 'circle','Religion': 'cross', 'Political Economy': 'triangle-up'})
    fig.update_traces(showlegend=with_legend, marker = {'size': 10})
    fig.update_layout(title_text = str(year), title_font_size=30, font_size=20)

    image_path = './Clusters_moving_average_2/Figures/1850/' + str(year) +'.png'
    if with_legend:
        fig.update_layout(legend = dict(y=0.5), legend_title_text='Legend')
        fig.write_image(image_path, width = 900)
    else:
        fig.write_image(image_path)
    #     fig.show()

    

In [69]:
#Show the installed plotly version (the saved output below reports 5.10.0)
plotly.__version__

'5.10.0'

## Playground

In [9]:
#Inspect the volume table: one row per HTID with one weight column per topic number
volume_topics

Unnamed: 0,HTID,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,uc1.b5568131,0.335310,0.000703,3.859994e-05,0.070415,0.094437,1.841183e-05,0.001295,0.000024,0.000154,...,0.000062,0.000063,0.004128,1.217153e-05,0.021920,5.930538e-03,8.631037e-06,0.000059,0.000046,0.035256
1,uc1.$b135547,0.002840,0.001595,3.160480e-05,0.001070,0.071715,1.507521e-05,0.000041,0.000020,0.067670,...,0.003109,0.037010,0.012810,9.965785e-06,0.026869,3.071620e-03,1.020237e-02,0.001323,0.000038,0.000065
2,hvd.32044106314859,0.000017,0.000032,1.514681e-05,0.000024,0.000045,7.224896e-06,0.000019,0.000009,0.000060,...,0.000024,0.000025,0.000032,4.776170e-06,0.000051,6.244357e-06,2.812933e-03,0.000023,0.000140,0.000031
3,uc1.$b29323,0.000014,0.000003,3.943318e-03,0.013454,0.000005,3.564930e-02,0.000039,0.000075,0.000006,...,0.000076,0.006445,0.000028,4.816316e-07,0.000005,6.296845e-07,3.415332e-07,0.000002,0.068478,0.023605
4,mdp.39015076816662,0.000002,0.000004,9.407721e-04,0.000003,0.000006,9.914233e-07,0.000003,0.020569,0.033567,...,0.000003,0.000020,0.000004,6.554013e-07,0.000007,6.790642e-05,4.647562e-07,0.000020,0.000002,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166774,chi.090018182,0.000023,0.000678,1.974364e-05,0.000191,0.064386,1.686437e-04,0.024865,0.000490,0.148955,...,0.001783,0.004968,0.003863,6.225665e-06,0.000862,8.139426e-06,4.414725e-06,0.000190,0.000183,0.000041
166775,hvd.32044089522510,0.020714,0.000236,1.241825e-01,0.000008,0.000014,2.231040e-06,0.000044,0.016562,0.068709,...,0.000875,0.000196,0.136070,1.474876e-06,0.180247,1.928251e-06,1.045860e-06,0.008947,0.053230,0.015777
166776,uc1.31175035187601,0.000134,0.019152,1.454623e-05,0.000258,0.001803,6.938422e-06,0.000136,0.000009,0.000644,...,0.008587,0.008822,0.002846,1.218974e-04,0.000166,5.996762e-06,3.252571e-06,0.044483,0.138092,0.000968
166777,aeu.ark+=13960=t09w1n868,0.000068,0.008213,5.900777e-05,0.000095,0.005885,2.814618e-05,0.010069,0.000037,0.027360,...,0.000094,0.000097,0.001076,1.860663e-05,0.001626,2.432628e-05,1.319428e-05,0.043396,0.000070,0.004404


In [14]:
#Attach the metadata year to every volume (inner join on 'HTID'), then drop the unused columns.
#NOTE(review): the load cell at the top performs this same merge; running both on one kernel merges twice.
volume_topics = (
    volume_topics
    .merge(metadata, on = 'HTID', how = 'inner')
    .drop(columns=['oclc','Year'])
)

volume_topics

Unnamed: 0,HTID,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,Year_rounded
0,uc1.b5568131,0.335310,0.000703,3.859994e-05,0.070415,0.094437,1.841183e-05,0.001295,0.000024,0.000154,...,0.000063,0.004128,1.217153e-05,0.021920,5.930538e-03,8.631037e-06,0.000059,0.000046,0.035256,1898.0
1,uc1.$b135547,0.002840,0.001595,3.160480e-05,0.001070,0.071715,1.507521e-05,0.000041,0.000020,0.067670,...,0.037010,0.012810,9.965785e-06,0.026869,3.071620e-03,1.020237e-02,0.001323,0.000038,0.000065,1832.0
2,hvd.32044106314859,0.000017,0.000032,1.514681e-05,0.000024,0.000045,7.224896e-06,0.000019,0.000009,0.000060,...,0.000025,0.000032,4.776170e-06,0.000051,6.244357e-06,2.812933e-03,0.000023,0.000140,0.000031,1896.0
3,uc1.$b29323,0.000014,0.000003,3.943318e-03,0.013454,0.000005,3.564930e-02,0.000039,0.000075,0.000006,...,0.006445,0.000028,4.816316e-07,0.000005,6.296845e-07,3.415332e-07,0.000002,0.068478,0.023605,1825.0
4,mdp.39015076816662,0.000002,0.000004,9.407721e-04,0.000003,0.000006,9.914233e-07,0.000003,0.020569,0.033567,...,0.000020,0.000004,6.554013e-07,0.000007,6.790642e-05,4.647562e-07,0.000020,0.000002,0.000004,1882.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166775,chi.090018182,0.000023,0.000678,1.974364e-05,0.000191,0.064386,1.686437e-04,0.024865,0.000490,0.148955,...,0.004968,0.003863,6.225665e-06,0.000862,8.139426e-06,4.414725e-06,0.000190,0.000183,0.000041,1843.0
166776,hvd.32044089522510,0.020714,0.000236,1.241825e-01,0.000008,0.000014,2.231040e-06,0.000044,0.016562,0.068709,...,0.000196,0.136070,1.474876e-06,0.180247,1.928251e-06,1.045860e-06,0.008947,0.053230,0.015777,1874.0
166777,uc1.31175035187601,0.000134,0.019152,1.454623e-05,0.000258,0.001803,6.938422e-06,0.000136,0.000009,0.000644,...,0.008822,0.002846,1.218974e-04,0.000166,5.996762e-06,3.252571e-06,0.044483,0.138092,0.000968,1808.0
166778,aeu.ark+=13960=t09w1n868,0.000068,0.008213,5.900777e-05,0.000095,0.005885,2.814618e-05,0.010069,0.000037,0.027360,...,0.000097,0.001076,1.860663e-05,0.001626,2.432628e-05,1.319428e-05,0.043396,0.000070,0.004404,1857.0


In [30]:
#fix years to be compatible with 'topic_shares'
#Clamp years into 1510-1890 (values above 1890 become 1890, values below 1510 become 1510).
#Vectorised equivalent of looping over the rows; missing years are left untouched.
volume_topics['Year_rounded'] = volume_topics['Year_rounded'].clip(lower=1510, upper=1890)

In [44]:
#Category weights of every volume: its topic weights (1 x n_topics) times the
#topic-by-category shares of its year (n_topics x n_categories) --> (1 x n_categories).
#Computed with one matrix product per year instead of one small DataFrame per volume.
topic_columns = [str(i) for i in topics['topic_number']]
#select the category columns explicitly so extra columns in 'topic_shares' (e.g. 'Color') do not break the product
category_columns = list(categories)

volume_years = volume_topics['Year_rounded'].astype(int)
volume_weights = volume_topics[topic_columns].to_numpy(dtype=float) #(n_volumes, n_topics)

category_weights = np.empty((len(volume_topics), len(category_columns)))
#.indices maps each year to the row positions of the volumes of that year
for year, positions in volume_years.groupby(volume_years).indices.items():
    year_shares = topic_shares[int(year)][category_columns].to_numpy(dtype=float) #(n_topics, n_categories)
    category_weights[positions] = volume_weights[positions] @ year_shares

#Rows stay in the order of 'volume_topics'; the index is 0..n-1 (previously every row had index 0)
volumes = pd.DataFrame(category_weights, columns = category_columns)
volumes['HTID'] = volume_topics['HTID'].to_numpy()
volumes

Unnamed: 0,Religion,Science,Political Economy,HTID
0,0.326938,0.164118,0.508944,uc1.b5568131
0,0.410077,0.104824,0.485099,uc1.$b135547
0,0.065574,0.629993,0.304434,hvd.32044106314859
0,0.400788,0.038419,0.560793,uc1.$b29323
0,0.028457,0.594895,0.376647,mdp.39015076816662
...,...,...,...,...
0,0.405057,0.192782,0.402161,chi.090018182
0,0.270244,0.149941,0.579815,hvd.32044089522510
0,0.161509,0.17248,0.666012,uc1.31175035187601
0,0.13443,0.151041,0.71453,aeu.ark+=13960=t09w1n868
