In [38]:
import json
import glob
from tqdm import tqdm
import pandas as pd
import unidecode
import os
import subprocess
import  tarfile
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import ast
import zipfile

### The New York Herald

In [3]:
# Find all tar.bz2 files in global tar file
tar = tarfile.open("../data/Institution.tar")
bz2_paths = [name for name in tar.getnames() if "Institution (2)/United States/" in name]

# Create dataframes
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
steel_articles = pd.DataFrame(columns=["publication_date", "content"])
telegraph_articles = pd.DataFrame(columns=["publication_date", "content"])
elec_articles = pd.DataFrame(columns=["publication_date", "content"])
dates_us = set()
total_articles_count = 0

print("Creating the dataset (might take a few minutes) ...")

# open different US tar files one by one
for p in bz2_paths[1:]: # exclude first odt file
    subprocess.run(["tar", "-xvf", "../data/Institution.tar", p], stdout=subprocess.PIPE, text=True)
    tar = tarfile.open(p)
    for m in tar:
        if ".txt" in m.name:
            total_articles_count += 1
            pub_date = m.name.split("/")[1] + "-" + m.name.split("/")[2] + "-" + m.name.split("/")[3] 
            dates_us.add(pub_date)
            f=tar.extractfile(m)
            content=f.read()
            if b'coal' in content:
                coal_articles.loc[m.name.replace("/ocr.txt", "")] = {"publication_date" : pub_date, "content":content.decode("utf-8")}
            if b'steel' in content:
                steel_articles.loc[m.name.replace("/ocr.txt", "")] = {"publication_date" : pub_date, "content":content.decode("utf-8")}
            if b'telegraph' in content:
                telegraph_articles.loc[m.name.replace("/ocr.txt", "")] = {"publication_date" : pub_date, "content":content.decode("utf-8")}
            if b'electricity' in content:
                elec_articles.loc[m.name.replace("/ocr.txt", "")] = {"publication_date" : pub_date, "content":content.decode("utf-8")}

    # Removes files to save memory
    subprocess.run(["rm", "-rf", "Institution (2)"])
    # Intermediate saves
    coal_articles.to_csv("../data/coal_articles_us.csv")
    steel_articles.to_csv("../data/steel_articles_us.csv")
    telegraph_articles.to_csv("../data/telegraph_articles_us.csv")
    elec_articles.to_csv("../data/elec_articles_us.csv")
    
    # Save dates
    with open('../data/dates_us.txt','w') as f:
        f.write(str(dates_us)) 

print("Proportion of articles talking about coal", len(coal_articles.index)/total_articles_count)
print("Proportion of articles talking about steel", len(steel_articles.index)/total_articles_count)
print("Proportion of articles talking about telegraph", len(telegraph_articles.index)/total_articles_count)
print("Proportion of articles talking about electricity", len(elec_articles.index)/total_articles_count)

Creating the dataset (might take a few minutes) ...


x Institution (2)/United States/dlc_ambrosia_ver01.tar.bz2
x Institution (2)/United States/dlc_bindweed_ver02.tar.bz2
x Institution (2)/United States/dlc_crowfoot_ver01.tar.bz2
x Institution (2)/United States/dlc_deadnettle_ver01.tar.bz2
x Institution (2)/United States/dlc_eucalyptus_ver01.tar.bz2
x Institution (2)/United States/dlc_fairymoss_ver01.tar.bz2
x Institution (2)/United States/dlc_goldenglow_ver01.tar.bz2
x Institution (2)/United States/dlc_houseleek_ver01.tar.bz2
x Institution (2)/United States/dlc_itchweed_ver01.tar.bz2
x Institution (2)/United States/dlc_juneberry_ver01.tar.bz2
x Institution (2)/United States/dlc_kudzu_ver01.tar.bz2
x Institution (2)/United States/dlc_laceflower_ver01.tar.bz2
x Institution (2)/United States/dlc_marcus_ver01.tar.bz2
x Institution (2)/United States/dlc_nosebleed_ver01.tar.bz2
x Institution (2)/United States/dlc_poppy_ver01.tar.bz2
x Institution (2)/United States/dlc_quercitron_ver01.tar.bz2


Proportion of articles talking about steel  0.0


In [32]:
# Only keep dates that are in the time period studied
with open('../data/dates_us.txt','r') as f:
    dates_us = ast.literal_eval(f.read())
dates_us = {d for d in dates_us if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930}

cdfus = pd.read_csv("../data/coal_articles_us.csv")
all_us_coal_dates = set(cdfus.groupby("publication_date").count().index.to_list())
n_all_us_coal_dates = len({d for d in all_us_coal_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

tdfus = pd.read_csv("../data/telegraph_articles_us.csv")
all_us_telegraph_dates = set(tdfus.groupby("publication_date").count().index.to_list())
n_all_us_telegraph_dates = len({d for d in all_us_telegraph_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

sdfus = pd.read_csv("../data/steel_articles_us.csv")
all_us_steel_dates = set(sdfus.groupby("publication_date").count().index.to_list())
n_all_us_steel_dates = len({d for d in all_us_steel_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

edfus = pd.read_csv("../data/elec_articles_us.csv")
all_us_elec_dates = set(edfus.groupby("publication_date").count().index.to_list())
n_all_us_elec_dates = len({d for d in all_us_elec_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})


print("Total number of issues in the american dataset : ", len(dates_us))
print("Total number of issues with the word 'coal' : ", n_all_us_coal_dates)
print("Total number of issues with the word 'telegraph' : ", n_all_us_telegraph_dates)
print("Total number of issues with the word 'steel' : ", n_all_us_steel_dates)
print("Total number of issues with the word 'electricity' : ", n_all_us_elec_dates)

Total number of issues in the american dataset :  14492
Total number of issues with the word 'coal' :  12000
Total number of issues with the word 'telegraph' :  12397
Total number of issues with the word 'steel' :  7798
Total number of issues with the word 'electricity' :  2593


### Le Figaro

In [42]:
# Open French archive
subprocess.run(["tar", "-xvf", "../data/Institution.tar", "Institution (2)/France/le_figaro.zip"], stdout=subprocess.PIPE, text=True)
# Unzip 
with zipfile.ZipFile('./Institution (2)/France/le_figaro.zip') as zip_ref:
    zip_ref.extractall('./Institution (2)/')

In [49]:
# Get list of all articles
every_article = glob.glob('./Institution (2)/*/*/*.json')
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
telegraph_articles = pd.DataFrame(columns=["publication_date", "content"])
steel_articles = pd.DataFrame(columns=["publication_date", "content"])
elec_articles = pd.DataFrame(columns=["publication_date", "content"])
dates_fr = set()
for article in tqdm(every_article):
    date = article.split("/")[3][:4] + "-" + article.split("/")[3][4:6] + "-" + article.split("/")[3][6:]
    dates_fr.add(date)
    with open(article, 'r') as f:
        data = json.load(f)   
        if "charbon" in unidecode.unidecode(str.lower(data["contentAsText"][0])):
            coal_articles.loc["figaro_" + article.split("/")[3]] = {"publication_date" : article.split("/")[3], "content":data["contentAsText"][0]}
        if "telegraphe" in unidecode.unidecode(str.lower(data["contentAsText"][0])):
            telegraph_articles.loc["figaro_" + article.split("/")[3]] = {"publication_date" : article.split("/")[3], "content":data["contentAsText"][0]}
        if " acier" in unidecode.unidecode(str.lower(data["contentAsText"][0])):
            steel_articles.loc["figaro_" + article.split("/")[3]] = {"publication_date" : article.split("/")[3], "content":data["contentAsText"][0]}
        if "electricite" in unidecode.unidecode(str.lower(data["contentAsText"][0])):
            elec_articles.loc["figaro_" + article.split("/")[3]] = {"publication_date" : article.split("/")[3], "content":data["contentAsText"][0]}
        
# Save data
coal_articles.to_csv("../data/coal_articles_fr.csv")
telegraph_articles.to_csv("../data/telegraph_articles_fr.csv")
steel_articles.to_csv("../data/steel_articles_fr.csv")
elec_articles.to_csv("../data/elec_articles_fr.csv")

# Save dates
with open('../data/dates_fr.txt','w') as f:
    f.write(str(dates_fr))

# Removes files to save memory
subprocess.run(["rm", "-rf", "Institution (2)"])

100%|█████████████████████████████████| 28502/28502 [00:00<00:00, 239273.95it/s]


In [50]:
# Only keep dates that are in the time period studied
with open('../data/dates_fr.txt','r') as f:
    dates_fr = ast.literal_eval(f.read())
dates_fr = {d for d in dates_fr if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930}

cdffr = pd.read_csv("../data/coal_articles_fr.csv")
all_fr_coal_dates = set(cdffr.groupby("publication_date").count().index.to_list())
n_all_fr_coal_dates = len({d for d in all_fr_coal_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

tdffr = pd.read_csv("../data/telegraph_articles_fr.csv")
all_fr_telegraph_dates = set(tdffr.groupby("publication_date").count().index.to_list())
n_all_fr_telegraph_dates = len({d for d in all_fr_telegraph_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

sdffr = pd.read_csv("../data/steel_articles_fr.csv")
all_fr_steel_dates = set(sdffr.groupby("publication_date").count().index.to_list())
n_all_fr_steel_dates = len({d for d in all_fr_steel_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

edffr = pd.read_csv("../data/elec_articles_fr.csv")
all_fr_elec_dates = set(edffr.groupby("publication_date").count().index.to_list())
n_all_fr_elec_dates = len({d for d in all_fr_elec_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})


print("Total number of issues in the frenc dataset : ", len(dates_fr))
print("Total number of issues with the word 'coal' : ", n_all_fr_coal_dates)
print("Total number of issues with the word 'telegraph' : ", n_all_fr_telegraph_dates)
print("Total number of issues with the word 'steel' : ", n_all_fr_steel_dates)
print("Total number of issues with the word 'electricity' : ", n_all_fr_elec_dates)

Total number of issues in the american dataset :  19917
Total number of issues with the word 'coal' :  1955
Total number of issues with the word 'telegraph' :  4889
Total number of issues with the word 'steel' :  352
Total number of issues with the word 'electricity' :  1058


### El Imparcial

In [53]:
# Open spanish archive
subprocess.run(["tar", "-xvf", "../data/Institution.tar", "Institution (2)/Spain/2171-0244-el-imparcial-madrid-1867.zip"], stdout=subprocess.PIPE, text=True)
# Unzip 
with zipfile.ZipFile('./Institution (2)/Spain/2171-0244-el-imparcial-madrid-1867.zip') as zip_ref:
    zip_ref.extractall('./Institution (2)/')

x Institution (2)/Spain/2171-0244-el-imparcial-madrid-1867.zip


In [55]:
every_article = glob.glob('./Institution (2)/*.txt')
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
steel_articles = pd.DataFrame(columns=["publication_date", "content"])
telegraph_articles = pd.DataFrame(columns=["publication_date", "content"])
elec_articles = pd.DataFrame(columns=["publication_date", "content"])
dates_es = set()

for article in tqdm(every_article):
    date = article.split("/")[2].split("_")[0][:4] + "-" + article.split("/")[2].split("_")[0][4:6] + "-" + article.split("/")[2].split("_")[0][6:8]
    dates_es.add(date)
    with open(article, 'r') as f:
        data = "\n".join(f.readlines())
        if "carbon" in unidecode.unidecode(str.lower(data)):
            coal_articles.loc["imparcial_" + article.split("/")[2].split("_")[0]] = {"publication_date" : date, "content":data}
        if "acero" in unidecode.unidecode(str.lower(data)):
            steel_articles.loc["imparcial_" + article.split("/")[2].split("_")[0]] = {"publication_date" : date, "content":data}
        if "telegrafo" in unidecode.unidecode(str.lower(data)):
            telegraph_articles.loc["imparcial_" + article.split("/")[2].split("_")[0]] = {"publication_date" : date, "content":data}
        if "electricidad" in unidecode.unidecode(str.lower(data)):
            elec_articles.loc["imparcial_" + article.split("/")[2].split("_")[0]] = {"publication_date" : date, "content":data}

# Save data
coal_articles.to_csv("../data/coal_articles_es.csv")
telegraph_articles.to_csv("../data/telegraph_articles_es.csv")
steel_articles.to_csv("../data/steel_articles_es.csv")
elec_articles.to_csv("../data/elec_articles_es.csv")

# Save dates
with open('../data/dates_es.txt','w') as f:
    f.write(str(dates_es))

# Removes files to save memory
subprocess.run(["rm", "-rf", "Institution (2)"])

100%|█████████████████████████████████| 23008/23008 [00:00<00:00, 269860.20it/s]


In [59]:
# Only keep dates that are in the time period studied
with open('../data/dates_es.txt','r') as f:
    dates_es = ast.literal_eval(f.read())
dates_es = {d for d in dates_es if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930}

cdfes = pd.read_csv("../data/coal_articles_es.csv")
all_es_coal_dates = set(cdfes.groupby("publication_date").count().index.to_list())
n_all_es_coal_dates = len({d for d in all_es_coal_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

tdfes = pd.read_csv("../data/telegraph_articles_es.csv")
all_es_telegraph_dates = set(tdfes.groupby("publication_date").count().index.to_list())
n_all_es_telegraph_dates = len({d for d in all_es_telegraph_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

sdfes = pd.read_csv("../data/steel_articles_es.csv")
all_es_steel_dates = set(sdfes.groupby("publication_date").count().index.to_list())
n_all_es_steel_dates = len({d for d in all_es_steel_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

edfes = pd.read_csv("../data/elec_articles_es.csv")
all_es_elec_dates = set(edfes.groupby("publication_date").count().index.to_list())
n_all_es_elec_dates = len({d for d in all_es_elec_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})


print("Total number of issues in the spanish dataset : ", len(dates_es))
print("Total number of issues with the word 'coal' : ", n_all_es_coal_dates)
print("Total number of issues with the word 'telegraph' : ", n_all_es_telegraph_dates)
print("Total number of issues with the word 'steel' : ", n_all_es_steel_dates)
print("Total number of issues with the word 'electricity' : ", n_all_es_elec_dates)

Total number of issues in the spanish dataset :  22062
Total number of issues with the word 'coal' :  14031
Total number of issues with the word 'telegraph' :  15102
Total number of issues with the word 'steel' :  11442
Total number of issues with the word 'electricity' :  4678


### Neue Hamburger Zeitung

In [None]:
def get_whole_text(file_name):
    tree = ET.parse(file_name) 
    root = tree.getroot()
    txt = ""
    for textblock in root.findall(".//{http://www.loc.gov/standards/alto/ns-v2#}TextBlock"):
        txt += " ".join([s.attrib["CONTENT"] for s in textblock.findall('.//{http://www.loc.gov/standards/alto/ns-v2#}String')])
        txt += "\n\n"
    return txt

In [82]:
# Create dataframes
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
elec_articles = pd.DataFrame(columns=["publication_date", "content"])
telegraph_articles = pd.DataFrame(columns=["publication_date", "content"])
steel_articles = pd.DataFrame(columns=["publication_date", "content"])
dates_de = set()

In [83]:
# Open spanish archive
subprocess.run(["tar", "-xvf", "../data/Institution.tar", "Institution (2)/Hamburg/PPN689063377.zip"], stdout=subprocess.PIPE, text=True)
# Unzip 
with zipfile.ZipFile('./Institution (2)/Hamburg/PPN689063377.zip') as zip_ref:
    zip_ref.extractall('./Institution (2)/')

all_files = sorted(glob.glob("./Institution (2)/*/*.xml"))

for file in tqdm(all_files):
    pub_date = file.split("_")[2][:4] + "-" + file.split("_")[2][4:6] + "-" + file.split("_")[2][6:8]
    dates_de.add(pub_date)
"""
 text = get_whole_text(file)
    for block in text.split("\n"):
        if "Kohle" in block:
            try :
                coal_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": coal_articles.loc["hamburg_" + pub_date].content + block}
            except:
                coal_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Elektrizität" in block:
            try :
                elec_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": elec_articles.loc["hamburg_" + pub_date].content + block}
            except:
                elec_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Stahl" in block:
            try :
                steel_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": steel_articles.loc["hamburg_" + pub_date].content + block}
            except:
                steel_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Telegraph" in block:
            try :
                telegraph_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": telegraph_articles.loc["hamburg_" + pub_date].content + block}
            except:
                telegraph_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}"""

# Removes files to save memory
subprocess.run(["rm", "-rf", "Institution (2)"])

x Institution (2)/Hamburg/PPN689063377.zip
100%|███████████████████████████████| 152554/152554 [00:00<00:00, 367455.85it/s]


CompletedProcess(args=['rm', '-rf', 'Institution (2)'], returncode=0)

In [84]:
subprocess.run(["tar", "-xvf", "../data/Institution.tar", "Institution (2)/Hamburg/PPN689063512.zip"], stdout=subprocess.PIPE, text=True)
#Unzip 
with zipfile.ZipFile('./Institution (2)/Hamburg/PPN689063512.zip') as zip_ref:
    zip_ref.extractall('./Institution (2)/')

all_files = sorted(glob.glob("./Institution (2)/*/*.xml"))

for file in tqdm(all_files):
    pub_date = file.split("_")[2][:4] + "-" + file.split("_")[2][4:6] + "-" + file.split("_")[2][6:8]
    dates_de.add(pub_date)
"""
 text = get_whole_text(file)
    for block in text.split("\n"):
        if "Kohle" in block:
            try :
                coal_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": coal_articles.loc["hamburg_" + pub_date].content + block}
            except:
                coal_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Elektrizität" in block:
            try :
                elec_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": elec_articles.loc["hamburg_" + pub_date].content + block}
            except:
                elec_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Stahl" in block:
            try :
                steel_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": steel_articles.loc["hamburg_" + pub_date].content + block}
            except:
                steel_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Telegraph" in block:
            try :
                telegraph_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": telegraph_articles.loc["hamburg_" + pub_date].content + block}
            except:
                telegraph_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}"""

# Removes files to save memory
subprocess.run(["rm", "-rf", "Institution (2)"])

x Institution (2)/Hamburg/PPN689063512.zip
100%|███████████████████████████████| 104194/104194 [00:00<00:00, 388215.96it/s]


CompletedProcess(args=['rm', '-rf', 'Institution (2)'], returncode=0)

In [85]:
# Open spanish archive
subprocess.run(["tar", "-xvf", "../data/Institution.tar", "Institution (2)/Hamburg/PPN689065744 (1).zip"], stdout=subprocess.PIPE, text=True)
# Unzip 
with zipfile.ZipFile('./Institution (2)/Hamburg/PPN689065744 (1).zip') as zip_ref:
    zip_ref.extractall('./Institution (2)/')

all_files = sorted(glob.glob("./Institution (2)/*/*.xml"))

for file in tqdm(all_files):
    pub_date = file.split("_")[2][:4] + "-" + file.split("_")[2][4:6] + "-" + file.split("_")[2][6:8]
    dates_de.add(pub_date)
"""
 text = get_whole_text(file)
    for block in text.split("\n"):
        if "Kohle" in block:
            try :
                coal_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": coal_articles.loc["hamburg_" + pub_date].content + block}
            except:
                coal_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Elektrizität" in block:
            try :
                elec_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": elec_articles.loc["hamburg_" + pub_date].content + block}
            except:
                elec_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Stahl" in block:
            try :
                steel_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": steel_articles.loc["hamburg_" + pub_date].content + block}
            except:
                steel_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}
        if "Telegraph" in block:
            try :
                telegraph_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": telegraph_articles.loc["hamburg_" + pub_date].content + block}
            except:
                telegraph_articles.loc["hamburg_" + pub_date] = {"publication_date" : pub_date, "content": block}"""

# Removes files to save memory
subprocess.run(["rm", "-rf", "Institution (2)"])

x Institution (2)/Hamburg/PPN689065744 (1).zip
100%|███████████████████████████████| 122923/122923 [00:00<00:00, 309911.45it/s]


CompletedProcess(args=['rm', '-rf', 'Institution (2)'], returncode=0)

In [None]:
# Save data
coal_articles.to_csv("../data/coal_articles_es.csv")
telegraph_articles.to_csv("../data/telegraph_articles_es.csv")
steel_articles.to_csv("../data/steel_articles_es.csv")
elec_articles.to_csv("../data/elec_articles_es.csv")

In [88]:
# Save dates
with open('../data/dates_de.txt','w') as f:
    f.write(str(dates_de))

In [89]:
# Only keep dates that are in the time period studied
with open('../data/dates_de.txt','r') as f:
    dates_de = ast.literal_eval(f.read())
dates_de = {d for d in dates_de if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930}

cdfde = pd.read_csv("../data/coal_articles_de.csv")
all_de_coal_dates = set(cdfde.groupby("publication_date").count().index.to_list())
n_all_de_coal_dates = len({d for d in all_de_coal_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

tdfde = pd.read_csv("../data/telegraph_articles_de.csv")
all_de_telegraph_dates = set(tdfde.groupby("publication_date").count().index.to_list())
n_all_de_telegraph_dates = len({d for d in all_de_telegraph_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

sdfde = pd.read_csv("../data/steel_articles_de.csv")
all_de_steel_dates = set(sdfde.groupby("publication_date").count().index.to_list())
n_all_de_steel_dates = len({d for d in all_de_steel_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})

edfde = pd.read_csv("../data/elec_articles_de.csv")
all_de_elec_dates = set(edfde.groupby("publication_date").count().index.to_list())
n_all_de_elec_dates = len({d for d in all_de_elec_dates if int(d.split('-')[0]) > 1840 and int(d.split('-')[0]) < 1930})


print("Total number of issues in the german dataset : ", len(dates_de))
print("Total number of issues with the word 'coal' : ", n_all_de_coal_dates)
print("Total number of issues with the word 'telegraph' : ", n_all_de_telegraph_dates)
print("Total number of issues with the word 'steel' : ", n_all_de_steel_dates)
print("Total number of issues with the word 'electricity' : ", n_all_de_elec_dates)

Total number of issues in the german dataset :  13836
Total number of issues with the word 'coal' :  12635
Total number of issues with the word 'telegraph' :  10372
Total number of issues with the word 'steel' :  10358
Total number of issues with the word 'electricity' :  6189


### La Stampa

In [90]:
all_files = glob.glob("../data/Italian newspaper/out_fernandez/*")

In [None]:
def get_kw_lines(lines, kw):
    sentences = lines.split(".")
    coal_idx = [idx for idx in range(len(sentences)) if kw in sentences[idx]]
    coal_idx_aug = set(sum(list(map(lambda x: [max(0, x-1), x, min(x+1, len(sentences)-1)], coal_idx)), []))
    return " ".join([sentences[i] for i in coal_idx_aug])

In [110]:
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
steel_articles = pd.DataFrame(columns=["publication_date", "content"])
telegraph_articles = pd.DataFrame(columns=["publication_date", "content"])
elec_articles = pd.DataFrame(columns=["publication_date", "content"])

n_articles = 0

for file in tqdm(all_files):
    with open(file) as f:
        lines = f.readlines()
        if len(lines) > 0:
            year = file.split("_")[-1]
            for i, l in enumerate(lines):
                n_articles += 1
                content = get_kw_lines(l, "carbone")
                if content != "":
                    coal_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}
                content = get_kw_lines(l, "acciaio")
                if content != "":
                    steel_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}
                content = get_kw_lines(l, "elettricità")
                if content != "":
                    elec_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}
                content = get_kw_lines(l, "telegrafo")
                if content != "":
                    telegraph_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}
            
coal_articles.to_csv("../data/coal_articles_it.csv")
steel_articles.to_csv("../data/steel_articles_it.csv")
telegraph_articles.to_csv("../data/telegraph_articles_it.csv")
elec_articles.to_csv("../data/elec_articles_it.csv")

100%|█████████████████████████████████████████| 133/133 [00:08<00:00, 15.57it/s]


'                content = get_kw_lines(l, "carbone")\n                if content != "":\n                    coal_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}\n                content = get_kw_lines(l, "acciaio")\n                if content != "":\n                    steel_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}\n                content = get_kw_lines(l, "elettricità")\n                if content != "":\n                    elec_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}\n                content = get_kw_lines(l, "telegrafo")\n                if content != "":\n                    telegraph_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}\n            \ncoal_articles.to_csv("../data/coal_articles_it.csv")\nsteel_articles.to_csv("../data/steel_articles_it.csv")\

In [119]:
cdfit = pd.read_csv("../data/coal_articles_it.csv")
n_all_it_coal_dates = len(cdfit.index.to_list())

tdfit = pd.read_csv("../data/telegraph_articles_it.csv")
n_all_it_telegraph_dates = len(tdfit.index.to_list())

sdfit = pd.read_csv("../data/steel_articles_it.csv")
n_all_it_steel_dates = len(sdfit.index.to_list())

edfit = pd.read_csv("../data/elec_articles_it.csv")
n_all_it_elec_dates = len(edfit.index.to_list())


print("Total number of articles in the italian dataset : ", n_articles)
print("Total number of articles with the word 'coal' : ", n_all_it_coal_dates)
print("Total number of articles with the word 'telegraph' : ", n_all_it_telegraph_dates)
print("Total number of articles with the word 'steel' : ", n_all_it_steel_dates)
print("Total number of articles with the word 'electricity' : ", n_all_it_elec_dates)

Total number of articles in the italian dataset :  54797
Total number of articles with the word 'coal' :  2875
Total number of articles with the word 'telegraph' :  4425
Total number of articles with the word 'steel' :  39251
Total number of articles with the word 'electricity' :  11951
