In [1]:
import pandas as pd
from pandas import Series, DataFrame
import country_converter as coco
import os

In [2]:
def read_pcdb(file_name, tech_name, place='World'):
    df = pd.read_csv(file_name, usecols=[0, 1, 2])
    df.set_index(df.columns[1], inplace=True)
    df = df.transpose()
    df['Data Source'] = 'Santa Fe Institute'
    df['Metric'] = ['Price', 'Annual production']
    df['Technology Name'] = tech_name
    if place=='World':
        df['Spatial Scale'] = 'Global'
        df['Country Name'] = 'World'
        df['Country Code'] = 'World'
    if place=='US':
        df['Spatial Scale'] = 'National'
        df['Country Name'] = 'United States'
        df['Country Code'] = 'US'
    if place=='Japan':
        df['Spatial Scale'] = 'National'
        df['Country Name'] = 'Japan'
        df['Country Code'] = 'JP'
    df['ID'] = df['Technology Name'] + '_' + df['Metric'] + '_' + df['Country Code']
    df['Unit'] = [df.index[0], df.index[1]]
    df.set_index('ID', drop=True, inplace=True)
    df.columns.name = None
    new_file_name = 'processing/' + file_name
    df.to_csv(new_file_name)
    return df

In [3]:
os.chdir('/Users/ariana/desktop/historical_tech/raw data/PCDB')

In [4]:
# https://pcdb.santafe.edu/graph.php?curve=119
acrylic_fiber = read_pcdb('AcrylicFiber.csv', 'Acrylic Fiber')

In [5]:
# https://pcdb.santafe.edu/graph.php?curve=113
acrylonitrile = read_pcdb('Acrylonitrile.csv', 'Acrylonitrile')

In [6]:
# https://pcdb.santafe.edu/graph.php?curve=108
aniline = read_pcdb('Aniline.csv', 'Aniline')

In [7]:
# https://pcdb.santafe.edu/graph.php?curve=153
beer_japan = read_pcdb('Beer_(Japan).csv', 'Beer production', 'Japan')

In [8]:
# https://pcdb.santafe.edu/graph.php?curve=100
benzene = read_pcdb('Benzene.csv', 'Benzene')

In [9]:
# https://pcdb.santafe.edu/graph.php?curve=111
bisphenol_a = read_pcdb('BisphenolA.csv', 'Bisphenol A')

In [10]:
# https://pcdb.santafe.edu/graph.php?curve=112
caprolactam = read_pcdb('Caprolactam.csv', 'Caprolactam')

In [11]:
# https://pcdb.santafe.edu/graph.php?curve=97
crude_oil = read_pcdb('Crude_Oil.csv', 'Crude Oil')

In [12]:
# https://pcdb.santafe.edu/graph.php?curve=116
cyclohexane = read_pcdb('Cyclohexane.csv', 'Cyclohexane')

In [13]:
# https://pcdb.santafe.edu/graph.php?curve=118
ethanolamine = read_pcdb('Ethanolamine.csv', 'Ethanolamine')

In [14]:
# https://pcdb.santafe.edu/graph.php?curve=120
ethyl_alcohol = read_pcdb('EthylAlcohol.csv', 'Ethyl Alcohol')

In [15]:
# https://pcdb.santafe.edu/graph.php?curve=99
ethylene = read_pcdb('Ethylene.csv', 'Ethylene')

In [16]:
# https://pcdb.santafe.edu/graph.php?curve=122
# EthyleneGlycol
ethylene_glycol = read_pcdb('EthyleneGlycol.csv', 'Ethylene Glycol')

In [17]:
# https://pcdb.santafe.edu/graph.php?curve=121
# unclear to me how this is different from Ethylene
ethylene_2 = read_pcdb('Ethylene_2.csv', 'Ethylene 2')

In [18]:
# https://pcdb.santafe.edu/graph.php?curve=123
formaldehyde = read_pcdb('Formaldehyde.csv', 'Formaldehyde')

In [19]:
# https://pcdb.santafe.edu/graph.php?curve=124
hydrofluoric_acid = read_pcdb('HydrofluoricAcid.csv', 'Hydrofluoric Acid')

In [20]:
# https://pcdb.santafe.edu/graph.php?curve=102
# unclear to me how this is different from Polyethylene LD
low_density_polyethylene = read_pcdb('Low_Density_Polyethylene.csv', 'Low-Density Polyethylene')

In [21]:
# https://pcdb.santafe.edu/graph.php?curve=126
magnesium = read_pcdb('Magnesium.csv', 'Magnesium')

In [22]:
# https://pcdb.santafe.edu/graph.php?curve=127
maleic_anhydride = read_pcdb('MaleicAnhydride.csv', 'Maleic Anhydride')

In [23]:
# https://pcdb.santafe.edu/graph.php?curve=128
methanol = read_pcdb('Methanol.csv', 'Methanol')

In [24]:
# https://pcdb.santafe.edu/graph.php?curve=26
milk_us = read_pcdb('Milk_(US).csv', 'Milk production', 'US')

In [25]:
# https://pcdb.santafe.edu/graph.php?curve=98
motor_gasoline = read_pcdb('Motor_Gasoline.csv', 'Motor Gasoline')

In [26]:
# https://pcdb.santafe.edu/graph.php?curve=129
neoprene_rubber = read_pcdb('NeopreneRubber.csv', 'Neoprene Rubber')

In [27]:
# https://pcdb.santafe.edu/graph.php?curve=101
paraxylene = read_pcdb('Paraxylene.csv', 'Paraxylene')

In [28]:
# https://pcdb.santafe.edu/graph.php?curve=130
pentaerythritol = read_pcdb('Pentaerythritol.csv', 'Pentaerythritol')

In [29]:
# https://pcdb.santafe.edu/graph.php?curve=131
phenol = read_pcdb('Phenol.csv', 'Phenol')

In [30]:
# https://pcdb.santafe.edu/graph.php?curve=132
phthalic_anhydride = read_pcdb('PhthalicAnhydride.csv', 'Phthalic Anhydride')

In [31]:
# https://pcdb.santafe.edu/graph.php?curve=133
polyester_fiber = read_pcdb('PolyesterFiber.csv', 'Polyester Fiber')

In [32]:
# https://pcdb.santafe.edu/graph.php?curve=134
polyethylene_hd = read_pcdb('PolyethyleneHD.csv', 'Polyethylene HD')

In [33]:
# https://pcdb.santafe.edu/graph.php?curve=135
polyethylene_ld = read_pcdb('PolyethyleneLD.csv', 'Polyethylene LD')

In [34]:
# https://pcdb.santafe.edu/graph.php?curve=147
polystyrene = read_pcdb('Polystyrene.csv', 'Polystyrene')

In [35]:
# https://pcdb.santafe.edu/graph.php?curve=148
polyvinylchloride = read_pcdb('Polyvinylchloride.csv', 'Polyvinylchloride')

In [36]:
# https://pcdb.santafe.edu/graph.php?curve=149
# Primary_Aluminum
primary_aluminum = read_pcdb('Primary_Aluminum.csv', 'Primary Alumninum')

In [37]:
# https://pcdb.santafe.edu/graph.php?curve=150
primary_magnesium = read_pcdb('Primary_Magnesium.csv', 'Primary Magnesium')

In [38]:
# https://pcdb.santafe.edu/graph.php?curve=155
refined_cane_sugar = read_pcdb('Refined_Cane_Sugar.csv', 'Refined Cane Sugar Production')

In [39]:
# https://pcdb.santafe.edu/graph.php?curve=34
# Shotgun_Sanger_DNA_Sequencing

# due to the uneven structure of the data,
# we are using the average cost of all quarters where data is provided as the yearly average cost
# and the yearly production value of the latest quarter in the year

dna_sequencing = pd.read_csv('Shotgun_Sanger_DNA_Sequencing.csv', usecols=[0, 1, 2])
year_list = []
for year in dna_sequencing['Time (Year)']:
    year_list.append(str(year)[:4])
dna_sequencing['Year'] = year_list

year_set = set(year_list)
year_dict = {}

for year in year_set:
    year_dict[year] = []
    
year_dict = dict(sorted(year_dict.items(), key = lambda item: item[0]))

for year in dna_sequencing['Time (Year)']:
    year_4_char = str(year)[:4]
    year = str(year)
    year_dict[year_4_char].append(year)

greatest_qtrs = []
for year in year_dict:
    greatest_qtr = None
    for qtr in year_dict[year]:
        if greatest_qtr == None:
            greatest_qtr = qtr
        else:
            if qtr > greatest_qtr:
                greatest_qtr = qtr
    greatest_qtrs.append(qtr)

yearly_production = []
for idx in dna_sequencing.index:
    qtr_year = str(dna_sequencing['Time (Year)'].iloc[idx])
    yrly_production = dna_sequencing['Yearly Production (Kilobase)'].iloc[idx]
    if qtr_year in greatest_qtrs:
        yearly_production.append(yrly_production)
        
year_list = sorted(list(year_set))

cost_dict = {}
for year in year_set:
    cost_dict[year] = []
cost_dict = dict(sorted(cost_dict.items(), key = lambda item: item[0]))

for idx in dna_sequencing.index:
    cost = dna_sequencing['Cost (USD/Kilobase)'].iloc[idx]
    year = str(dna_sequencing['Year'].iloc[idx])
    cost_dict[year].append(cost)

avg_cost = []
for year in cost_dict:
    avg_cost.append(sum(cost_dict[year]) / len(cost_dict[year]))
    
new_df = pd.DataFrame(avg_cost, year_list)
new_df['Annual Production'] = yearly_production
new_df = new_df.transpose()
new_df['Data Source'] = 'Santa Fe Institute'
new_df['Spatial Scale'] = 'Global'
new_df['Country Name'] = 'World'
new_df['Country Code'] = 'World'
new_df['Metric'] = ['Price', 'Annual production']
new_df['Unit'] = ['USD/Kilobase', 'Kilobase']
new_df['Technology Name'] = 'Shotgun Sanger DNA Sequencing'
new_df['ID'] = new_df['Technology Name'] + '_' + new_df['Metric'] + '_' + new_df['Country Code']
new_df.set_index('ID', drop=True, inplace=True)
new_df.to_csv('processing/Shotgun_Sanger_DNA_Sequencing.csv')

In [40]:
# https://pcdb.santafe.edu/graph.php?curve=136
sodium = read_pcdb('Sodium.csv', 'Sodium')

In [41]:
# https://pcdb.santafe.edu/graph.php?curve=137
sodium_chlorate = read_pcdb('SodiumChlorate.csv', 'Sodium Chlorate')

In [42]:
# https://pcdb.santafe.edu/graph.php?curve=140
styrene = read_pcdb('Styrene.csv', 'Styrene')

In [43]:
# https://pcdb.santafe.edu/graph.php?curve=151
titanium_sponge = read_pcdb('Titanium_Sponge.csv', 'Titanium Sponge')

In [44]:
# https://pcdb.santafe.edu/graph.php?curve=143
urea = read_pcdb('Urea.csv', 'Urea')

In [45]:
# https://pcdb.santafe.edu/graph.php?curve=144
vinyl_acetate = read_pcdb('VinylAcetate.csv', 'Vinyl Acetate')

In [46]:
# https://pcdb.santafe.edu/graph.php?curve=145
vinyl_chloride = read_pcdb('VinylChloride.csv', 'Vinyl Chloride')

In [47]:
os.chdir('/Users/ariana/desktop/historical_tech/cleaned data')
frame = pd.read_csv('frame.csv')
df_list = [frame]

In [48]:
path = '/Users/ariana/desktop/historical_tech/raw data/PCDB/processing'
os.chdir(path)
for file in os.listdir(path):
    file = pd.read_csv(file)
    df_list.append(file)

In [49]:
pcdb_all = pd.concat(df_list)
pcdb_all

Unnamed: 0,ID,Spatial Scale,Country Code,Country Name,Technology Name,Metric,Unit,Data Source,1700,1701,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Caprolactam_Price_World,Global,World,World,Caprolactam,Price,Price (1966 USD/lbs),Santa Fe Institute,,,...,,,,,,,,,,
1,Caprolactam_Annual production_World,Global,World,World,Caprolactam,Annual production,Yearly Production (Mil. lbs),Santa Fe Institute,,,...,,,,,,,,,,
0,Crude Oil_Price_World,Global,World,World,Crude Oil,Price,Price (1958 USD),Santa Fe Institute,,,...,,,,,,,,,,
1,Crude Oil_Annual production_World,Global,World,World,Crude Oil,Annual production,Yearly Production (Billion Barrels),Santa Fe Institute,,,...,,,,,,,,,,
0,Titanium Sponge_Price_World,Global,World,World,Titanium Sponge,Price,Price (1958 USD/lbs),Santa Fe Institute,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,Polystyrene_Annual production_World,Global,World,World,Polystyrene,Annual production,Yearly Production (Million Pounds),Santa Fe Institute,,,...,,,,,,,,,,
0,Phenol_Price_World,Global,World,World,Phenol,Price,Price (1966 USD/lbs),Santa Fe Institute,,,...,,,,,,,,,,
1,Phenol_Annual production_World,Global,World,World,Phenol,Annual production,Yearly Production (Mil. lbs),Santa Fe Institute,,,...,,,,,,,,,,
0,Beer production_Price_JP,National,JP,Japan,Beer production,Price,Retail Price Minus Indirect Tax (USD 1955),Santa Fe Institute,,,...,,,,,,,,,,


In [50]:
os.chdir('/Users/ariana/desktop/historical_tech/cleaned data')

In [51]:
pcdb_all.to_csv('pcdb.csv', index=False)