In [70]:
import pandas as pd
import pickle
from functools import reduce


In [71]:
#Load Data
volumes = pd.read_csv('../temporary/volumes.csv')
industry = pd.read_csv('../input/industry_scores.csv')
sentiment = pd.read_csv('../input/sentiment_scores.csv')

#volume metadata
# NOTE(review): unpickling can execute arbitrary code, so metadata.p must come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('../input/metadata.p', 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

# Convert Year to a numeric dtype, asking pandas to downcast to the smallest signed integer type it can
metadata['Year'] = pd.to_numeric(metadata['Year'], downcast='signed')

def fix_htid(row):
    """Return the row's 'HTID' value with ':' replaced by '+' and '/' replaced by '='."""
    substitutions = str.maketrans({':': '+', '/': '='})
    return row['HTID'].translate(substitutions)

# Rewrite every HTID with fix_htid (row by row) and discard the 'oclc' column
metadata = metadata.assign(HTID=metadata.apply(fix_htid, axis=1)).drop(columns=['oclc'])

volumes

Unnamed: 0,Religion,Science,Political Economy,HTID
0,0.326938,0.164118,0.508944,uc1.b5568131
1,0.410077,0.104824,0.485099,uc1.$b135547
2,0.065574,0.629993,0.304434,hvd.32044106314859
3,0.400788,0.038419,0.560793,uc1.$b29323
4,0.028457,0.594895,0.376647,mdp.39015076816662
...,...,...,...,...
166775,0.405057,0.192782,0.402161,chi.090018182
166776,0.270244,0.149941,0.579815,hvd.32044089522510
166777,0.161509,0.172480,0.666012,uc1.31175035187601
166778,0.134430,0.151041,0.714530,aeu.ark+=13960=t09w1n868


In [72]:
#Clean industry data
industry = industry.rename(columns={'Unnamed: 0': 'HTID', '2-vote':'industry_2','3-vote':'industry_3'})
# Remove only a literal trailing '.txt' from each HTID.
# str.rstrip('.txt') would strip any trailing run of the characters '.', 't' and 'x',
# truncating IDs whose last real character is 't' or 'x' so they no longer match on HTID.
industry['HTID'] = industry['HTID'].str.replace(r'\.txt$', '', regex=True)
industry

Unnamed: 0,HTID,industry_2,industry_3
0,hvd.32044025716390,0.205761,0.235340
1,uc2.ark+=13960=t7sn0cd5r,0.056452,0.057502
2,uiuo.ark+=13960=t83j42z5r,0.087561,0.088896
3,chi.65460297,1.745223,1.938208
4,uc2.ark+=13960=t8kd1v39,0.052708,0.052708
...,...,...,...
173062,mdp.39015063997871,0.521087,0.540769
173063,nnc1.1002316935,0.062852,0.062852
173064,mdp.39015033940241,0.142026,0.145587
173065,hvd.32044023949654,0.178859,0.178859


In [73]:
# Preview the sentiment table as loaded from CSV, before any columns are dropped
sentiment

Unnamed: 0.1,Unnamed: 0,HTID,key,percent_optimistic,percent_progress,percent_pessimism,percent_regression
0,0,aeu.ark+=13960=t0000h10f,1875,0.000000,0.000000,0.000000,0.000000
1,1,aeu.ark+=13960=t0000j18q,1875,0.000680,0.000000,0.001360,0.000000
2,2,aeu.ark+=13960=t00z7jk0b,1875,0.009639,0.001667,0.001884,0.000290
3,3,aeu.ark+=13960=t00z7pd6n,1875,0.001581,0.002146,0.000452,0.003276
4,4,aeu.ark+=13960=t00z87n8r,1875,0.001066,0.001092,0.000182,0.001066
...,...,...,...,...,...,...,...
165006,5,uc1.ax0000261321,1500,0.000000,0.000000,0.000000,0.000000
165007,6,uc1.l0054623798,1500,0.009223,0.000805,0.000862,0.000337
165008,7,uc1.l0097416127,1500,0.000000,0.000000,0.000000,0.000000
165009,8,wu.89007024094,1500,0.000000,0.000000,0.000000,0.002171


In [74]:
#Clean Sentiment Data
# Keep only HTID and the percent_* score columns by discarding the two unused columns
sentiment = sentiment.drop(columns=['Unnamed: 0', 'key'])
sentiment


Unnamed: 0,HTID,percent_optimistic,percent_progress,percent_pessimism,percent_regression
0,aeu.ark+=13960=t0000h10f,0.000000,0.000000,0.000000,0.000000
1,aeu.ark+=13960=t0000j18q,0.000680,0.000000,0.001360,0.000000
2,aeu.ark+=13960=t00z7jk0b,0.009639,0.001667,0.001884,0.000290
3,aeu.ark+=13960=t00z7pd6n,0.001581,0.002146,0.000452,0.003276
4,aeu.ark+=13960=t00z87n8r,0.001066,0.001092,0.000182,0.001066
...,...,...,...,...,...
165006,uc1.ax0000261321,0.000000,0.000000,0.000000,0.000000
165007,uc1.l0054623798,0.009223,0.000805,0.000862,0.000337
165008,uc1.l0097416127,0.000000,0.000000,0.000000,0.000000
165009,wu.89007024094,0.000000,0.000000,0.000000,0.002171


In [75]:
#Merge Data
dfs = [volumes, industry, sentiment, metadata]

# Inner-join the tables one after another on the volume ID (HTID), in list order
volumes_scores = dfs[0]
for next_table in dfs[1:]:
    volumes_scores = volumes_scores.merge(next_table, on='HTID', how='inner')

# Discard every row that still has a missing value in any column
volumes_scores = volumes_scores.dropna()

volumes_scores

Unnamed: 0,Religion,Science,Political Economy,HTID,industry_2,industry_3,percent_optimistic,percent_progress,percent_pessimism,percent_regression,Year
0,0.326938,0.164118,0.508944,uc1.b5568131,0.099625,0.101499,0.006072,0.000000,0.000486,0.000000,1898.0
1,0.410077,0.104824,0.485099,uc1.$b135547,0.151136,0.183048,0.007906,0.001622,0.000405,0.000811,1832.0
2,0.065574,0.629993,0.304434,hvd.32044106314859,0.000000,0.000000,0.000216,0.000000,0.000000,0.000000,1896.0
3,0.400788,0.038419,0.560793,uc1.$b29323,0.051752,0.055923,0.005821,0.001362,0.001574,0.000681,1825.0
4,0.028457,0.594895,0.376647,mdp.39015076816662,0.044464,0.059229,0.000374,0.002903,0.000000,0.000287,1882.0
...,...,...,...,...,...,...,...,...,...,...,...
158545,0.405057,0.192782,0.402161,chi.090018182,0.367943,0.380383,0.007571,0.000803,0.000344,0.000918,1843.0
158546,0.270244,0.149941,0.579815,hvd.32044089522510,0.105344,0.125315,0.005857,0.002618,0.000910,0.000333,1874.0
158547,0.161509,0.172480,0.666012,uc1.31175035187601,0.183920,0.187089,0.001592,0.000187,0.000000,0.002341,1808.0
158548,0.134430,0.151041,0.714530,aeu.ark+=13960=t09w1n868,0.690849,0.736368,0.001540,0.000770,0.000000,0.000770,1857.0


In [76]:
#Net Optimism
# optimistic + progress - pessimism - regression, accumulated left to right
volumes_scores = volumes_scores.assign(
    optimism_score=(
        volumes_scores['percent_optimistic']
        .add(volumes_scores['percent_progress'])
        .sub(volumes_scores['percent_pessimism'])
        .sub(volumes_scores['percent_regression'])
    )
)

volumes_scores

Unnamed: 0,Religion,Science,Political Economy,HTID,industry_2,industry_3,percent_optimistic,percent_progress,percent_pessimism,percent_regression,Year,optimism_score
0,0.326938,0.164118,0.508944,uc1.b5568131,0.099625,0.101499,0.006072,0.000000,0.000486,0.000000,1898.0,0.005587
1,0.410077,0.104824,0.485099,uc1.$b135547,0.151136,0.183048,0.007906,0.001622,0.000405,0.000811,1832.0,0.008311
2,0.065574,0.629993,0.304434,hvd.32044106314859,0.000000,0.000000,0.000216,0.000000,0.000000,0.000000,1896.0,0.000216
3,0.400788,0.038419,0.560793,uc1.$b29323,0.051752,0.055923,0.005821,0.001362,0.001574,0.000681,1825.0,0.004928
4,0.028457,0.594895,0.376647,mdp.39015076816662,0.044464,0.059229,0.000374,0.002903,0.000000,0.000287,1882.0,0.002991
...,...,...,...,...,...,...,...,...,...,...,...,...
158545,0.405057,0.192782,0.402161,chi.090018182,0.367943,0.380383,0.007571,0.000803,0.000344,0.000918,1843.0,0.007113
158546,0.270244,0.149941,0.579815,hvd.32044089522510,0.105344,0.125315,0.005857,0.002618,0.000910,0.000333,1874.0,0.007232
158547,0.161509,0.172480,0.666012,uc1.31175035187601,0.183920,0.187089,0.001592,0.000187,0.000000,0.002341,1808.0,-0.000562
158548,0.134430,0.151041,0.714530,aeu.ark+=13960=t09w1n868,0.690849,0.736368,0.001540,0.000770,0.000000,0.000770,1857.0,0.001540


In [77]:
#Percentiles
# Percentile rank (rank(pct=True)) of each score across all merged volumes
volumes_scores = volumes_scores.assign(
    optimism_percentile=volumes_scores['optimism_score'].rank(pct=True),
    industry_2_percentile=volumes_scores['industry_2'].rank(pct=True),
    industry_3_percentile=volumes_scores['industry_3'].rank(pct=True),
)
volumes_scores

Unnamed: 0,Religion,Science,Political Economy,HTID,industry_2,industry_3,percent_optimistic,percent_progress,percent_pessimism,percent_regression,Year,optimism_score,optimism_percentile,industry_2_percentile,industry_3_percentile
0,0.326938,0.164118,0.508944,uc1.b5568131,0.099625,0.101499,0.006072,0.000000,0.000486,0.000000,1898.0,0.005587,0.742173,0.329601,0.292840
1,0.410077,0.104824,0.485099,uc1.$b135547,0.151136,0.183048,0.007906,0.001622,0.000405,0.000811,1832.0,0.008311,0.896892,0.520582,0.564200
2,0.065574,0.629993,0.304434,hvd.32044106314859,0.000000,0.000000,0.000216,0.000000,0.000000,0.000000,1896.0,0.000216,0.134853,0.006719,0.006277
3,0.400788,0.038419,0.560793,uc1.$b29323,0.051752,0.055923,0.005821,0.001362,0.001574,0.000681,1825.0,0.004928,0.681692,0.122097,0.114186
4,0.028457,0.594895,0.376647,mdp.39015076816662,0.044464,0.059229,0.000374,0.002903,0.000000,0.000287,1882.0,0.002991,0.467022,0.096541,0.124791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158545,0.405057,0.192782,0.402161,chi.090018182,0.367943,0.380383,0.007571,0.000803,0.000344,0.000918,1843.0,0.007113,0.845943,0.807820,0.797395
158546,0.270244,0.149941,0.579815,hvd.32044089522510,0.105344,0.125315,0.005857,0.002618,0.000910,0.000333,1874.0,0.007232,0.851968,0.354406,0.390473
158547,0.161509,0.172480,0.666012,uc1.31175035187601,0.183920,0.187089,0.001592,0.000187,0.000000,0.002341,1808.0,-0.000562,0.060317,0.601320,0.573013
158548,0.134430,0.151041,0.714530,aeu.ark+=13960=t09w1n868,0.690849,0.736368,0.001540,0.000770,0.000000,0.000770,1857.0,0.001540,0.284873,0.917464,0.917212


In [78]:
# Export data: write the scored volumes table to CSV; index=False leaves the DataFrame index out of the file
volumes_scores.to_csv('../temporary/volumes_scores.csv', index=False)