In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from scipy import stats

In [13]:
volume_industry = pd.read_csv('../input/industry_scores.csv')
volume_industry = volume_industry.rename(columns = {'Unnamed: 0':'HTID', 'Weighted Sum': 'industry'})
volume_industry = volume_industry.drop(columns = ['Path', 'year'])
volume_industry['HTID'] = volume_industry['HTID'].map(lambda x: x.rstrip('.txt')) #remove '.txt' at the end of each string for HTIDs


volume_industry

Unnamed: 0,HTID,industry
0,hvd.32044025716390,0.289848
1,uc2.ark+=13960=t7sn0cd5r,0.088869
2,uiuo.ark+=13960=t83j42z5r,0.123275
3,chi.65460297,2.051452
4,uc2.ark+=13960=t8kd1v39,0.094585
...,...,...
172960,mdp.39015063997871,0.619494
172961,nnc1.1002316935,0.141651
172962,mdp.39015033940241,0.216927
172963,hvd.32044023949654,0.281018


In [14]:
volume_sentiment = volume_sentiment.merge(volume_industry, left_on = "HTID", right_on = "HTID")
test

Unnamed: 0.1,Unnamed: 0,HTID,key,percent_optimistic,percent_progress,percent_pessimism,percent_regression,industry
0,0,aeu.ark+=13960=t0000h10f,1875,0.000000,0.000000,0.000000,0.000000,0.551422
1,1,aeu.ark+=13960=t0000j18q,1875,0.000680,0.000000,0.001360,0.000000,0.116549
2,2,aeu.ark+=13960=t00z7jk0b,1875,0.009639,0.001667,0.001884,0.000290,0.267110
3,3,aeu.ark+=13960=t00z7pd6n,1875,0.001581,0.002146,0.000452,0.003276,1.133464
4,4,aeu.ark+=13960=t00z87n8r,1875,0.001066,0.001092,0.000182,0.001066,0.215760
...,...,...,...,...,...,...,...,...
160794,5,uc1.ax0000261321,1500,0.000000,0.000000,0.000000,0.000000,0.000000
160795,6,uc1.l0054623798,1500,0.009223,0.000805,0.000862,0.000337,0.099745
160796,7,uc1.l0097416127,1500,0.000000,0.000000,0.000000,0.000000,0.003401
160797,8,wu.89007024094,1500,0.000000,0.000000,0.000000,0.002171,0.000144


In [16]:
volume_topics = '../input/20191007_topics.txt'

volume_industry = pd.read_csv('../input/industry_scores.csv')
volume_industry = volume_industry.rename(columns = {'Unnamed: 0':'HTID', 'Weighted Sum': 'industry'})
volume_industry = volume_industry.drop(columns = ['Path', 'year'])
volume_industry['HTID'] = volume_industry['HTID'].map(lambda x: x.rstrip('.txt')) #remove '.txt' at the end of each string for HTIDs


volume_weights = pd.read_csv('../input/volume_weights', sep = ',', index_col = [0])
volume_sentiment = pd.read_csv('../Input/Sentiment Analysis Results (Thesaurus List).csv')
volume_sentiment = volume_sentiment.merge(volume_industry, left_on = "HTID", right_on = "HTID")


data = pd.read_csv(volume_topics, sep = '\t', lineterminator = '\n', header=None)
data.drop(columns = 0, inplace = True)
data[1] = [string[string.rfind('/UK_data/')+9:-4] for string in data[1]]
data.columns = ['HTID'] + [i for i in range(1,61)]
print("Dimensions ('Data'): " + str(data.shape))
# htids = data['HTID']
# data = data.drop(columns=['HTID'])
volume_weights['HTID'] = data['HTID']


df = data.merge(volume_sentiment, left_on='HTID', right_on='HTID')
df3 = volume_weights.merge(volume_sentiment, left_on='HTID', right_on='HTID')
df3 = df3.drop(columns=['HTID', 'Unnamed: 0', 'key'])
metapath = "../Input/metadata.p"
metadata = pickle.load(open(metapath, 'rb'))

metadata['Year_rounded'] = pd.to_numeric(metadata['Year'])
metadata['Year'] = pd.to_numeric(metadata['Year'], downcast='signed')

def fix_htid(row):
    return row['HTID'].replace(":","+").replace("/", "=")

with open('../Input/meta_weights.p', 'rb') as fp:
    moving_shares = pickle.load(fp)

metadata['HTID'] = metadata.apply(fix_htid, axis=1)
df2 = pd.merge(df, metadata, on='HTID', how='inner').drop(columns = ['oclc', 'Year'])
df['Year_rounded'] = df2['Year_rounded']
df = df.drop(columns=['HTID', 'Unnamed: 0', 'key'])
df

Dimensions ('Data'): (166779, 61)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,57,58,59,60,percent_optimistic,percent_progress,percent_pessimism,percent_regression,industry,Year_rounded
0,0.335310,0.000703,3.859994e-05,0.070415,0.094437,1.841183e-05,0.001295,0.000024,0.000154,2.281807e-05,...,8.631037e-06,0.000059,0.000046,0.035256,0.006072,0.000000,0.000486,0.000000,0.150531,1898.0
1,0.002840,0.001595,3.160480e-05,0.001070,0.071715,1.507521e-05,0.000041,0.000020,0.067670,1.868295e-05,...,1.020237e-02,0.001323,0.000038,0.000065,0.007906,0.001622,0.000405,0.000811,0.263212,1832.0
2,0.000017,0.000032,1.514681e-05,0.000024,0.000045,7.224896e-06,0.000019,0.000009,0.000060,8.953928e-06,...,2.812933e-03,0.000023,0.000140,0.000031,0.000216,0.000000,0.000000,0.000000,0.002446,1896.0
3,0.000014,0.000003,3.943318e-03,0.013454,0.000005,3.564930e-02,0.000039,0.000075,0.000006,9.029192e-07,...,3.415332e-07,0.000002,0.068478,0.023605,0.005821,0.001362,0.001574,0.000681,0.079578,1825.0
4,0.000002,0.000004,9.407721e-04,0.000003,0.000006,9.914233e-07,0.000003,0.020569,0.033567,1.228687e-06,...,4.647562e-07,0.000020,0.000002,0.000004,0.000374,0.002903,0.000000,0.000287,0.137376,1882.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158446,0.000023,0.000678,1.974364e-05,0.000191,0.064386,1.686437e-04,0.024865,0.000490,0.148955,1.167131e-05,...,4.414725e-06,0.000190,0.000183,0.000041,0.007571,0.000803,0.000344,0.000918,0.460606,1787.0
158447,0.020714,0.000236,1.241825e-01,0.000008,0.000014,2.231040e-06,0.000044,0.016562,0.068709,2.764963e-06,...,1.045860e-06,0.008947,0.053230,0.015777,0.005857,0.002618,0.000910,0.000333,0.217157,1843.0
158448,0.000134,0.019152,1.454623e-05,0.000258,0.001803,6.938422e-06,0.000136,0.000009,0.000644,8.598896e-06,...,3.252571e-06,0.044483,0.138092,0.000968,0.001592,0.000187,0.000000,0.002341,0.264789,1874.0
158449,0.000068,0.008213,5.900777e-05,0.000095,0.005885,2.814618e-05,0.010069,0.000037,0.027360,3.488201e-05,...,1.319428e-05,0.043396,0.000070,0.004404,0.001540,0.000770,0.000000,0.000770,1.117117,1808.0


In [19]:
a = df3['percent_optimistic'] + df3['percent_progress'] #Total Optimism
b = df3['percent_pessimism'] + df3['percent_regression'] #Total Pessimism
c = a - b #Net Optimism Score
df3['Optimism'] = c
df3['Year_rounded'] = df['Year_rounded']
df3 = df3.drop(columns = ['percent_optimistic', 'percent_progress', 'percent_pessimism', 'percent_regression'])
df3

Unnamed: 0,Religion,Science,Politics,industry,Optimism,Year_rounded
0,0.148640,0.353064,0.498295,0.150531,0.005587,1898.0
1,0.123296,0.443872,0.432832,0.263212,0.008311,1832.0
2,0.557703,0.122978,0.319318,0.002446,0.000216,1896.0
3,0.036475,0.401058,0.562467,0.079578,0.004928,1825.0
4,0.568469,0.057432,0.374100,0.137376,0.002991,1882.0
...,...,...,...,...,...,...
158446,0.198425,0.418353,0.383222,0.460606,0.007113,1787.0
158447,0.131815,0.288556,0.579629,0.217157,0.007232,1843.0
158448,0.231177,0.139438,0.629385,0.264789,-0.000562,1874.0
158449,0.135120,0.137277,0.727603,1.117117,0.001540,1808.0


In [20]:
years = []
for year in range(1510,1891):
    years.append(year)

#Finding percentiles
opt = df3['Optimism']
p = stats.rankdata(opt, "average")/len(opt) #assign each "optimism" score to its percentile
df3['optimism_percentile'] = p

ind = df3['industry']
p = stats.rankdata(ind, "average")/len(opt)
df3['industry_percentile'] = p

#Rename 'Politics' to 'Political Economy'
df3.rename(columns ={'Politics':'Political Economy'}, inplace=True)

df3

Unnamed: 0,Religion,Science,Political Economy,industry,Optimism,Year_rounded,optimism_percentile,industry_percentile
0,0.148640,0.353064,0.498295,0.150531,0.005587,1898.0,0.742147,0.274956
1,0.123296,0.443872,0.432832,0.263212,0.008311,1832.0,0.896889,0.557977
2,0.557703,0.122978,0.319318,0.002446,0.000216,1896.0,0.135083,0.006431
3,0.036475,0.401058,0.562467,0.079578,0.004928,1825.0,0.681674,0.077519
4,0.568469,0.057432,0.374100,0.137376,0.002991,1882.0,0.467116,0.232532
...,...,...,...,...,...,...,...,...
158446,0.198425,0.418353,0.383222,0.460606,0.007113,1787.0,0.845921,0.772927
158447,0.131815,0.288556,0.579629,0.217157,0.007232,1843.0,0.851948,0.464806
158448,0.231177,0.139438,0.629385,0.264789,-0.000562,1874.0,0.060296,0.560924
158449,0.135120,0.137277,0.727603,1.117117,0.001540,1808.0,0.285016,0.945144


In [26]:
#export data
df3.to_csv("../temporary/volumes_opt_industry.csv")

In [21]:
volumes = {}
for year in years:
    volumes[year] = df3[(df3['Year_rounded'] >= (year-10)) & (df3['Year_rounded'] <= (year+10))].drop(columns=['Year_rounded'])

# Industry Triangles

In [25]:
for year in years:

    print(year)
    fig = px.scatter_ternary(volumes[year], a = 'Religion', b = 'Political Economy', c = 'Science',
                            color = 'industry_percentile',
                            range_color = [0,1]
                            )

        
    fig.update_layout(title_text = str(year),
                      title_font_size=30,
                      font_size=20,
                      margin_l = 110
                     )
    fig.update_ternaries(bgcolor="white",
                        aaxis_linecolor="black",
                        baxis_linecolor="black",
                        caxis_linecolor="black"
                        )
    
    fig.update_traces(
        showlegend=False
    )
    
    if year == 1850:   
        fig.write_image('../output/triangles_volumes/' + str(year) + '.png', width=900) #included because wider format needed for color scale
        
    else:
        fig.update(layout_coloraxis_showscale=False) #removes colorbar
        fig.write_image('../output/triangles_volumes_industry/' + str(year) + '.png') #only works with kaleido 0.1.0 for some reason, use 'conda install python-kaleido=0.1.0, also uses plotly 5.10.0


1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709


FileNotFoundError: [Errno 2] No such file or directory: '..\\output\\triangles_volumes\\1850.png'

In [24]:
for year in years:

    print(year)
    fig = px.scatter_ternary(volumes[year], a = 'Religion', b = 'Political Economy', c = 'Science',
                            color = 'optimism_percentile',
                            range_color = [0,1]
                            )

        
    fig.update_layout(title_text = str(year),
                      title_font_size=30,
                      font_size=20,
                      margin_l = 110
                     )
    fig.update_ternaries(bgcolor="white",
                        aaxis_linecolor="black",
                        baxis_linecolor="black",
                        caxis_linecolor="black"
                        )
    
    fig.update_traces(
        showlegend=False
    )
    
    if year == 1850:   
        fig.write_image('../output/triangles_volumes/' + str(year) + '.png', width=900) #included because wider format needed for color scale
        
    else:
        fig.update(layout_coloraxis_showscale=False) #removes colorbar
        fig.write_image('../output/triangles_volumes_optimism/' + str(year) + '.png') #only works with kaleido 0.1.0 for some reason, use 'conda install python-kaleido=0.1.0, also uses plotly 5.10.0


1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709


FileNotFoundError: [Errno 2] No such file or directory: '..\\output\\triangles_volumes\\1850.png'