In [27]:
#libraries
import numpy as np
import pandas as pd
import pickle
from functools import reduce

In [14]:
#Load Data
volumes = pd.read_csv('../temporary/volumes.csv')
industry = pd.read_csv('../input/industry_scores.csv')
sentiment = pd.read_csv('../input/sentiment_scores_march23.csv')

#volume metadata
metadata = pickle.load(open('../input/metadata.p', 'rb'))

metadata['Year'] = pd.to_numeric(metadata['Year'], downcast='signed')

def fix_htid(row):
    return row['HTID'].replace(":","+").replace("/", "=")

metadata['HTID'] = metadata.apply(fix_htid, axis=1)
metadata.drop(columns=['oclc'], inplace=True)

In [15]:
#clean data
industry = industry.rename(columns={'Unnamed: 0': 'HTID', '2-vote':'industry_2','3-vote':'industry_3'})
industry['HTID'] = industry['HTID'].map(lambda x: x.rstrip('.txt'))#remove '.txt' at the end of each string for HTIDs

#Clean Sentiment Data

sentiment = sentiment.rename(columns = {'Unnamed: 0': 'HTID', 'Regression': 'percent_regression', 'Pessimism': 'percent_pessimism', 'Optimism':'percent_optimistic', 'Progress': 'percent_progress'})

sentiment['HTID'] = sentiment['HTID'].map(lambda x: x.rstrip('.txt')) #remove '.txt' at the end of each string for HTIDs


In [16]:
#Dimensions
print('volumes:' + str(volumes.shape))
print('industry:' + str(industry.shape))
print('sentiment:' + str(industry.shape))

volumes:(166780, 4)
industry:(173067, 3)
sentiment:(173067, 3)


In [24]:
sentiment_scores = pd.merge(sentiment, industry, on = 'HTID')
sentiment_scores_metadata = pd.merge(sentiment_scores, metadata, on = 'HTID')
topic_scores_metadata = pd.merge(volumes, metadata, on = 'HTID')

In [25]:
print('sentiment_scores:' + str(sentiment_scores.shape))
print('sentiment_scores_metadata:' + str(sentiment_scores_metadata.shape))
print('topic_scores_metadata:' + str(topic_scores_metadata.shape))

sentiment_scores:(173137, 7)
sentiment_scores_metadata:(168931, 8)
topic_scores_metadata:(166782, 5)


In [39]:
dfs = [sentiment_scores, volumes, metadata]

# final_merge = pd.merge(sentiment_scores_metadata, volumes, on = 'HTID')
final_merge = reduce(lambda left, right: pd.merge(left, right, on = 'HTID', how = 'inner'), dfs)
outer = pd.merge(sentiment_scores, volumes, on = 'HTID', how = 'outer', indicator=True)
anti = outer[~(outer._merge == 'both')]
anti = pd.merge(anti, metadata, on = 'HTID')

In [40]:
anti 

Unnamed: 0,HTID,percent_regression,percent_pessimism,percent_optimistic,percent_progress,industry_2,industry_3,Religion,Science,Political Economy,_merge,Year
0,nyp.33433081689832,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,left_only,1810.0
1,hvd.hn5cyr,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,left_only,1848.0
2,uc1.$b283481,0.000000,0.000000,0.000000,0.000000,0.013649,0.013649,,,,left_only,1885.0
3,hvd.32044106334329,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,left_only,1794.0
4,uc1.$b13904,0.000253,0.000253,0.000253,0.000253,0.046071,0.050448,,,,left_only,1838.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10206,uiuo.ark+=13960=t3dz0fk4x,,,,,,,0.380205,0.123039,0.496756,right_only,1875.0
10207,uc2.ark+=13960=t4zg6w79x,,,,,,,0.342954,0.111881,0.545165,right_only,1894.0
10208,uc2.ark+=13960=t1sf2nv5x,,,,,,,0.428217,0.051069,0.520715,right_only,1848.0
10209,uiuo.ark+=13960=t3kw5p40t,,,,,,,0.323473,0.052603,0.623924,right_only,1835.0


In [41]:
map = {'left_only':'sentiment_data', 'right_only':'topic_weights'}
anti['dataset'] = anti['_merge'].map(map)
anti

Unnamed: 0,HTID,percent_regression,percent_pessimism,percent_optimistic,percent_progress,industry_2,industry_3,Religion,Science,Political Economy,_merge,Year,dataset
0,nyp.33433081689832,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,left_only,1810.0,sentiment_data
1,hvd.hn5cyr,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,left_only,1848.0,sentiment_data
2,uc1.$b283481,0.000000,0.000000,0.000000,0.000000,0.013649,0.013649,,,,left_only,1885.0,sentiment_data
3,hvd.32044106334329,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,left_only,1794.0,sentiment_data
4,uc1.$b13904,0.000253,0.000253,0.000253,0.000253,0.046071,0.050448,,,,left_only,1838.0,sentiment_data
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10206,uiuo.ark+=13960=t3dz0fk4x,,,,,,,0.380205,0.123039,0.496756,right_only,1875.0,topic_weights
10207,uc2.ark+=13960=t4zg6w79x,,,,,,,0.342954,0.111881,0.545165,right_only,1894.0,topic_weights
10208,uc2.ark+=13960=t1sf2nv5x,,,,,,,0.428217,0.051069,0.520715,right_only,1848.0,topic_weights
10209,uiuo.ark+=13960=t3kw5p40t,,,,,,,0.323473,0.052603,0.623924,right_only,1835.0,topic_weights


In [42]:
anti_volumes = anti[['HTID', 'Year', 'dataset']]

In [46]:
anti_volumes.to_csv('../temporary/anti_joined.csv')
anti_volumes

Unnamed: 0,HTID,Year,dataset
0,nyp.33433081689832,1810.0,sentiment_data
1,hvd.hn5cyr,1848.0,sentiment_data
2,uc1.$b283481,1885.0,sentiment_data
3,hvd.32044106334329,1794.0,sentiment_data
4,uc1.$b13904,1838.0,sentiment_data
...,...,...,...
10206,uiuo.ark+=13960=t3dz0fk4x,1875.0,topic_weights
10207,uc2.ark+=13960=t4zg6w79x,1894.0,topic_weights
10208,uc2.ark+=13960=t1sf2nv5x,1848.0,topic_weights
10209,uiuo.ark+=13960=t3kw5p40t,1835.0,topic_weights


In [23]:
anti['_merge'].value_counts()

left_only     6181
right_only    4030
both             0
Name: _merge, dtype: int64

In [32]:
anti_industry = pd.merge(anti, industry, on = 'HTID')

In [33]:
anti_industry

Unnamed: 0,HTID,percent_regression,percent_pessimism,percent_optimistic,percent_progress,industry_2_x,industry_3_x,Year,Religion,Science,Political Economy,industry_2_y,industry_3_y
0,nyp.33433081689832,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1810.0,,,,0.000000,0.000000
1,hvd.hn5cyr,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1848.0,,,,0.000000,0.000000
2,uc1.$b283481,0.000000,0.000000,0.000000,0.000000,0.013649,0.013649,1885.0,,,,0.013649,0.013649
3,hvd.32044106334329,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1794.0,,,,0.000000,0.000000
4,uc1.$b13904,0.000253,0.000253,0.000253,0.000253,0.046071,0.050448,1838.0,,,,0.046071,0.050448
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6176,osu.32437121702837,0.000000,0.000000,0.000000,0.000000,0.006479,0.006479,1561.0,,,,0.006479,0.006479
6177,njp.32101067651271,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1800.0,,,,0.000000,0.000000
6178,nnc1.cu55143032,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1755.0,,,,0.000000,0.000000
6179,inu.30000008742250,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1730.0,,,,0.000000,0.000000


In [29]:
final_merge

Unnamed: 0,HTID,percent_regression,percent_pessimism,percent_optimistic,percent_progress,industry_2,industry_3,Religion,Science,Political Economy,Year
0,hvd.32044025716390,0.003229,0.003229,0.003229,0.003229,0.205761,0.235340,0.470864,0.065063,0.464074,1886.0
1,uc2.ark+=13960=t7sn0cd5r,0.001073,0.001073,0.001073,0.001073,0.056452,0.057502,0.413910,0.041589,0.544500,1828.0
2,uiuo.ark+=13960=t83j42z5r,0.000668,0.000668,0.000668,0.000668,0.087561,0.088896,0.289778,0.074732,0.635490,1892.0
3,chi.65460297,0.002482,0.002482,0.002482,0.002482,1.745223,1.938208,0.141372,0.155263,0.703366,1835.0
4,uc1.c034714672,0.000293,0.000293,0.000293,0.000293,0.173269,0.252494,0.236004,0.278340,0.485655,1874.0
...,...,...,...,...,...,...,...,...,...,...,...
162747,mdp.39015063997871,0.000000,0.000000,0.000000,0.000000,0.521087,0.540769,0.142778,0.324604,0.532619,1883.0
162748,nnc1.1002316935,0.001876,0.001876,0.001876,0.001876,0.062852,0.062852,0.353575,0.086502,0.559923,1890.0
162749,mdp.39015033940241,0.000534,0.000534,0.000534,0.000534,0.142026,0.145587,0.380669,0.156957,0.462374,1850.0
162750,hvd.32044023949654,0.000000,0.000000,0.000000,0.000000,0.178859,0.178859,0.357519,0.089237,0.553245,1890.0


In [16]:
#import packages
print('Loading Packages')
import os
import pandas as pd
import itertools
from iteration_utilities import unique_everseen
from math import comb

#Load Data
print('Loading Data')
volume_weights = pd.read_csv("../temporary/topic_weights.csv")
cross = pd.read_csv('../temporary/cross_topics.csv')
topics = pd.read_csv('../input/20191007_keys.txt', sep = '\t', lineterminator='\n', header=None)

volume_weights_pre1750 = pd.read_csv('../temporary/topic_weights_pre1750.csv')
cross_pre1750 = pd.read_csv('../temporary/cross_topics_pre1750.csv')
topics_pre1750 = pd.read_csv('../input/20230623_keys.txt', sep = '\t', lineterminator='\n', header=None)

#fix topic numbers
topics.drop(columns=0, inplace=True)
topics_pre1750.drop(columns=0, inplace=True)

topics['topic_number'] = list(range(1,len(topics)+1))
topics_pre1750['topic_number'] = list(range(1, len(topics_pre1750) +1))

print(topics)
print(topics_pre1750)

group = [5,9,22,26,35,46,50,55,60] #innocuous topics to be eliminated

#functions
def cross_share(data):
    df = data.copy() #Won't modify original object

    if 'HTID' in df.columns:
        df.drop(columns = ['HTID'], inplace=True)
    
    share = df.sum(axis = 0) / sum(df.sum(axis=0)) #numerator: sum down across rows, denominator: sum the sum of rows to get total of all cross-topics
    return share

def get_shares(shares, top = topics, omit = None, length = 3):
    #'topics' is a list or dataframe of topics, where each row corresponds to a topic
    #'omit' is a list of topics to omit, should be a list of numbers
    #'length' is the size of the categories (i.e. how many topics should make up a category), default 3

    n = len(top) #get number of topics
    topic_numbers = list(top['topic_number']) # generate topic numbers
    topic_dict = pd.Series(top[2].values, index = top.topic_number).to_dict() #mapping of topic numbers and words

    if omit is not None:
        topic_numbers = [i for i in topic_numbers if i not in omit] #remove innocuous topics
    
    topic_numbers.sort() #itertools.combinations needs sorted list
    combos = list(itertools.combinations(topic_numbers, r = length)) #create combinations of desired length

    combo_sets = [set(i) for i in combos] #get set of topic numbers for each row, i is each combo, contained in a tuple

    cross_combos = [list(itertools.combinations(i,2)) for i in combos]#gets every combination of elements in row from combos, i.e. for (1,2,3) gets (1,2),(1,3),(2,3)
    cross_combos = [['x'.join(map(str, i)) for i in c] for c in cross_combos] #joins each topic pair with 'x' to reference 'shares'
    cross_shares = [[shares[str(i)] for i in c] for c in cross_combos] #get share for each element
    cross_sum = [sum(i) for i in cross_shares] #sum each row
    topic_words = [[topic_dict[i] for i in t] for t in combos]

    #column names
    topic_names = ['topic' + str(i) for i in range(1, length+1)]
    cross_names = ['combination' + str(i) for i in range(1, comb(length, 2)+1)] #'math.comb' gives the number of combinations, not the combinations themselves
    share_names = ['share' + str(i) for i in range(1, comb(length, 2)+1)]
    topic_words_names = ['words' + str(i) for i in range(1, length+1)]

    #convert to dataframes since each are lists of tuples, easier to join
    combos = pd.DataFrame(combos, columns=topic_names)
    combo_sets = pd.DataFrame(pd.Series(combo_sets), columns=['Sets'])
    cross_combos = pd.DataFrame(cross_combos, columns=cross_names)
    cross_shares = pd.DataFrame(cross_shares, columns=share_names)
    cross_sum = pd.DataFrame(cross_sum, columns=['Sum'])
    topic_words = pd.DataFrame(topic_words, columns=topic_words_names)


    tmp = pd.concat([combos, combo_sets, cross_combos, cross_shares, cross_sum, topic_words], axis = 1)
    df = pd.DataFrame(tmp)
    # df = pd.DataFrame(tmp, columns=[topic_names, cross_names, share_names, 'Sum'])

    return df

def distinct_categories(data):
    #algorithm to get distinct categories
    #takes the output of 'get_shares' function and finds unique categories
    #'Sets' in 'data' is a column with the set of topics in each row

    data.sort_values('Sum', ascending = False, inplace = True) #Make sure values are sorted

    seen = set([]) #create empty set
    unique = [] #list for appending unique rows

    for ind, row in data.iterrows():
        if bool(row['Sets'] & seen): #checks if any elements in 'Set' are in 'seen', if so, move to next row
            pass
        else:
            unique.append(row) #if the set is unique, grab row
            seen.update(row['Sets']) #add set of topics to 'seen'

    df = pd.DataFrame(unique)

    return(df)

print('Calculating shares')
shares_all = cross_share(cross)
shares_pre1750 = cross_share(cross_pre1750)

print('Getting categories')
clusters = get_shares(shares = shares_all, top = topics, omit = group, length = 3)
clusters_pre1750 = get_shares(shares = shares_pre1750, top = topics_pre1750, length = 3)

print('Finding distinct categories')
clusters_corpus = distinct_categories(clusters)
clusters_corpus_pre1750 = distinct_categories(clusters_pre1750)
print(clusters_corpus)
print(clusters_corpus_pre1750)

print('Exporting Topics')
topics.to_csv('../temporary/topics.csv', index=False)
topics_pre1750.to_csv('../temporary/topics_pre1750.csv', index=False)

Loading Packages
Loading Data
          1                                                  2  topic_number
0   0.14322  paint pictur artist music engrav painter colou...             1
1   0.25957  town road church build built river stone wall ...             2
2   0.12400  franc pari french loui madam duke count napole...             3
3   0.19923  church christian christ bishop holi paul doctr...             4
4   0.36643  love heart beauti soul sweet dark night earth ...             5
5   0.05915  india chines china nativ indian bengal govern ...             6
6   0.15907  fig water iron engin pressur steam electr air ...             7
7   0.07679  acid solut heat carbon water sulphur iron gas ...             8
8   0.49381  exist refer period similar consist occur conne...             9
9   0.07330  vol lond fol folio calf copi pari par morocco ...            10
10  0.12983  thou thi hath sir doth duke ladi pray exit sce...            11
11  0.28791  god christ lord thi faith holi si

In [17]:
clusters_pre1750

Unnamed: 0,topic1,topic2,topic3,Sets,combination1,combination2,combination3,share1,share2,share3,Sum,words1,words2,words3
4390,3,26,52,"{26, 3, 52}",3x26,3x52,26x52,0.019997,0.014956,0.016396,0.051349,paint fame figur piec wall roman stone rome ma...,hath doth fee one fame arc hall thi thou yea d...,fame fee fever fet cafe defir juft ufe feem ob...
1733,2,3,26,"{26, 2, 3}",2x3,2x26,3x26,0.008933,0.010333,0.019997,0.039264,juft diﬀer sor suﬀer shew suﬃcient men hath oﬀ...,paint fame figur piec wall roman stone rome ma...,hath doth fee one fame arc hall thi thou yea d...
4105,3,19,26,"{19, 26, 3}",3x19,3x26,19x26,0.007529,0.019997,0.008645,0.036171,paint fame figur piec wall roman stone rome ma...,aforesaid justic counti statut fame cap commit...,hath doth fee one fame arc hall thi thou yea d...
2794,2,26,52,"{2, 26, 52}",2x26,2x52,26x52,0.010333,0.007371,0.016396,0.034100,juft diﬀer sor suﬀer shew suﬃcient men hath oﬀ...,hath doth fee one fame arc hall thi thou yea d...,fame fee fever fet cafe defir juft ufe feem ob...
3979,3,16,26,"{16, 26, 3}",3x16,3x26,16x26,0.006248,0.019997,0.007057,0.033302,paint fame figur piec wall roman stone rome ma...,men fame religion fee christian shew design ha...,hath doth fee one fame arc hall thi thou yea d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6667,5,9,18,"{9, 18, 5}",5x9,5x18,9x18,0.000009,0.000017,0.000005,0.000031,christ sin hath holi thi gospel thou doth glor...,god faith thi soul heaven love sin holi thou s...,vous qui par nous pour tout quil bien comm fai...
6697,5,9,48,"{48, 9, 5}",5x9,5x48,9x48,0.000009,0.000017,0.000005,0.000031,christ sin hath holi thi gospel thou doth glor...,god faith thi soul heaven love sin holi thou s...,che cum par con gli del por fol della rom lat ...
6694,5,9,45,"{9, 45, 5}",5x9,5x45,9x45,0.000009,0.000016,0.000005,0.000030,christ sin hath holi thi gospel thou doth glor...,god faith thi soul heaven love sin holi thou s...,ind quod idem apud anno pred und hic regi fuit...
12518,9,18,45,"{9, 18, 45}",9x18,9x45,18x45,0.000005,0.000005,0.000018,0.000028,god faith thi soul heaven love sin holi thou s...,vous qui par nous pour tout quil bien comm fai...,ind quod idem apud anno pred und hic regi fuit...


In [18]:
topics_pre1750

Unnamed: 0,1,2,topic_number
0,0.0751,ifland fever indian fame fort coaft tho empero...,1
1,0.21745,juft diﬀer sor suﬀer shew suﬃcient men hath oﬀ...,2
2,0.11456,paint fame figur piec wall roman stone rome ma...,3
3,0.09618,writ heir defend statut court hath tenant debt...,4
4,0.21621,christ sin hath holi thi gospel thou doth glor...,5
5,0.04072,ditto esq juli counti aug hon sept jan nov rob...,6
6,0.03135,root flower yellow seed juic stalk herb color ...,7
7,0.08934,thou thi israel chap david hath thine behold v...,8
8,0.41956,god faith thi soul heaven love sin holi thou s...,9
9,0.07537,esq thoma william tho ireland june juli petit ...,10


In [14]:
def get_shares(shares, top = topics, omit = None, length = 3):
    #'topics' is a list or dataframe of topics, where each row corresponds to a topic
    #'omit' is a list of topics to omit, should be a list of numbers
    #'length' is the size of the categories (i.e. how many topics should make up a category), default 3

    n = len(top) #get number of topics
    topic_numbers = list(top['topic_number']) # generate topic numbers
    topic_dict = pd.Series(top[2].values, index = top.topic_number).to_dict() #mapping of topic numbers and words

    if omit is not None:
        topic_numbers = [i for i in topic_numbers if i not in omit] #remove innocuous topics
    
    topic_numbers.sort() #itertools.combinations needs sorted list
    combos = list(itertools.combinations(topic_numbers, r = length)) #create combinations of desired length

    combo_sets = [set(i) for i in combos] #get set of topic numbers for each row, i is each combo, contained in a tuple

    cross_combos = [list(itertools.combinations(i,2)) for i in combos]#gets every combination of elements in row from combos, i.e. for (1,2,3) gets (1,2),(1,3),(2,3)
    cross_combos = [['x'.join(map(str, i)) for i in c] for c in cross_combos] #joins each topic pair with 'x' to reference 'shares'
    cross_shares = [[shares[str(i)] for i in c] for c in cross_combos] #get share for each element
    cross_sum = [sum(i) for i in cross_shares] #sum each row
    topic_words = [[topic_dict[i] for i in t] for t in combos]

    #column names
    topic_names = ['topic' + str(i) for i in range(1, length+1)]
    cross_names = ['combination' + str(i) for i in range(1, comb(length, 2)+1)] #'math.comb' gives the number of combinations, not the combinations themselves
    share_names = ['share' + str(i) for i in range(1, comb(length, 2)+1)]
    topic_words_names = ['words' + str(i) for i in range(1, length+1)]

    #convert to dataframes since each are lists of tuples, easier to join
    combos = pd.DataFrame(combos, columns=topic_names)
    combo_sets = pd.DataFrame(pd.Series(combo_sets), columns=['Sets'])
    cross_combos = pd.DataFrame(cross_combos, columns=cross_names)
    cross_shares = pd.DataFrame(cross_shares, columns=share_names)
    cross_sum = pd.DataFrame(cross_sum, columns=['Sum'])
    topic_words = pd.DataFrame(topic_words, columns=topic_words_names)


    tmp = pd.concat([combos, combo_sets, cross_combos, cross_shares, cross_sum, topic_words], axis = 1)
    df = pd.DataFrame(tmp)
    # df = pd.DataFrame(tmp, columns=[topic_names, cross_names, share_names, 'Sum'])

    return df

clusters = get_shares(shares = shares_all, top = topics, omit = group, length = 3)
clusters

Unnamed: 0,topic1,topic2,topic3,Sets,combination1,combination2,combination3,share1,share2,share3,Sum,words1,words2,words3
0,1,2,3,"{1, 2, 3}",1x2,1x3,2x3,0.002057,0.000499,0.000597,0.003153,paint pictur artist music engrav painter colou...,town road church build built river stone wall ...,franc pari french loui madam duke count napole...
1,1,2,4,"{1, 2, 4}",1x2,1x4,2x4,0.002057,0.000506,0.000554,0.003117,paint pictur artist music engrav painter colou...,town road church build built river stone wall ...,church christian christ bishop holi paul doctr...
2,1,2,6,"{1, 2, 6}",1x2,1x6,2x6,0.002057,0.000180,0.000612,0.002849,paint pictur artist music engrav painter colou...,town road church build built river stone wall ...,india chines china nativ indian bengal govern ...
3,1,2,7,"{1, 2, 7}",1x2,1x7,2x7,0.002057,0.000739,0.001280,0.004076,paint pictur artist music engrav painter colou...,town road church build built river stone wall ...,fig water iron engin pressur steam electr air ...
4,1,2,8,"{8, 1, 2}",1x2,1x8,2x8,0.002057,0.000298,0.000172,0.002527,paint pictur artist music engrav painter colou...,town road church build built river stone wall ...,acid solut heat carbon water sulphur iron gas ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20820,54,58,59,"{58, 59, 54}",54x58,54x59,58x59,0.000545,0.000334,0.000238,0.001117,fame fee fever cafe fet ufe feem obferv defir ...,ship island sea captain vessel coast sail shor...,emperor itali spain german germani franc duke ...
20821,56,57,58,"{56, 57, 58}",56x57,56x58,57x58,0.000160,0.000118,0.000051,0.000328,hym doe hath bee sayd doth own wee kyng hem ty...,kal kai tov yap rov occ masc xai fut sing mid ...,ship island sea captain vessel coast sail shor...
20822,56,57,59,"{56, 57, 59}",56x57,56x59,57x59,0.000160,0.000053,0.000025,0.000238,hym doe hath bee sayd doth own wee kyng hem ty...,kal kai tov yap rov occ masc xai fut sing mid ...,emperor itali spain german germani franc duke ...
20823,56,58,59,"{56, 58, 59}",56x58,56x59,58x59,0.000118,0.000053,0.000238,0.000410,hym doe hath bee sayd doth own wee kyng hem ty...,ship island sea captain vessel coast sail shor...,emperor itali spain german germani franc duke ...


In [15]:
topics

Unnamed: 0,1,2,topic_number
0,0.14322,paint pictur artist music engrav painter colou...,1
1,0.25957,town road church build built river stone wall ...,2
2,0.124,franc pari french loui madam duke count napole...,3
3,0.19923,church christian christ bishop holi paul doctr...,4
4,0.36643,love heart beauti soul sweet dark night earth ...,5
5,0.05915,india chines china nativ indian bengal govern ...,6
6,0.15907,fig water iron engin pressur steam electr air ...,7
7,0.07679,acid solut heat carbon water sulphur iron gas ...,8
8,0.49381,exist refer period similar consist occur conne...,9
9,0.0733,vol lond fol folio calf copi pari par morocco ...,10


# Info about Principia

In [1]:
import pandas as pd
import config
import pickle

data = pd.read_csv('../temporary/topic_weights.csv')
topic_shares = pickle.load(open('../temporary/topic_shares.pickle', 'rb'))
title_data = pd.read_csv('../input/volume_titles.csv', encoding='latin-1')
titles = title_data[['HTID', '245a', '245b']].rename(columns={'245a': 'title_1',
                                                              '245b': 'title_2'})


  title_data = pd.read_csv('../input/volume_titles.csv', encoding='latin-1')


In [5]:
htid = 'uc1.31822005223383'
principia = data[data['HTID'] == htid]
principia

Unnamed: 0,HTID,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
104619,uc1.31822005223383,4e-06,7e-06,3e-06,5e-06,1e-05,2e-06,4e-06,2e-06,1.3e-05,...,5e-06,6e-06,7e-06,1e-06,1.1e-05,1e-06,7.506052e-07,5e-06,4e-06,7e-06


In [19]:
principia.iloc[:,1:61].sum(axis=1)

104619    1.0
dtype: float64

In [21]:
t = pd.DataFrame(topic_shares[1689]).drop(columns='Color')

In [22]:
t

Unnamed: 0,Religion,Science,Political Economy
1,0.706794,0.108114,0.185093
2,0.636976,0.111293,0.251731
3,0.638564,0.012477,0.348958
4,0.749966,0.004732,0.245302
5,0.724762,0.030061,0.245177
6,0.731687,0.058792,0.209521
7,0.312215,0.471006,0.216779
8,0.588845,0.211309,0.199846
9,0.748866,0.04966,0.201473
10,0.814022,0.028545,0.157433


In [23]:
#export
principia.to_csv('../output/random/principia_weights.csv')
t.to_csv('../output/random/topic_weights_1689.csv')
