In [None]:
import json
import glob
from tqdm import tqdm
import pandas as pd
import unidecode
import os
import subprocess
import  tarfile
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

### The New York Herald

In [None]:
# Find all tar.bz2 files in the global tar file.
# The context manager closes the outer archive as soon as the member list has been read.
with tarfile.open("../data/Institution.tar") as outer_archive:
    bz2_paths = [name for name in outer_archive.getnames() if "Institution (2)/United States/" in name]

# Create dataframe (the name is kept for the following cells; it holds the b'telegraph' matches)
coal_articles = pd.DataFrame(columns=["publication_date", "content"])

total_articles_count = 0

print("Creating the dataset (might take a few minutes) ...")

# Open the different US tar files one by one
for p in bz2_paths[1:]:  # exclude first odt file
    # Extract only this member of the global archive into the working directory
    subprocess.run(["tar", "-xvf", "../data/Institution.tar", p], stdout=subprocess.PIPE, text=True)
    with tarfile.open(p) as inner_archive:
        for m in inner_archive:
            if ".txt" not in m.name:
                continue
            f = inner_archive.extractfile(m)
            if f is None:
                # extractfile() returns None for members that are not regular files
                continue
            total_articles_count += 1
            content = f.read()
            if b'telegraph' in content:
                # Path components 1..3 of the member name are joined with "-" to form the date
                name_parts = m.name.split("/")
                pub_date = name_parts[1] + "-" + name_parts[2] + "-" + name_parts[3]
                coal_articles.loc[m.name.replace("/ocr.txt", "")] = {
                    "publication_date": pub_date,
                    "content": content.decode("utf-8"),
                }
    # Remove the extracted files to save disk space
    subprocess.run(["rm", "-rf", "Institution (2)"])
    # Checkpoint after every archive so a crash does not lose the work done so far
    coal_articles.to_csv("../data/telegraph_articles_us.csv")

if total_articles_count:
    print("Proportion of articles talking about telegraph ", len(coal_articles.index)/total_articles_count)
else:
    # Avoid a ZeroDivisionError when no .txt member was found at all
    print("No .txt articles were found in the selected archives")

### Le Figaro

In [None]:
# Collect every Le Figaro article file and keep those whose text contains " acier".
every_article = glob.glob('../data/le_figaro/*/*/*.json')
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
for article in tqdm(every_article):
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding
    with open(article, 'r', encoding="utf-8") as f:
        data = json.load(f)
    article_text = data["contentAsText"][0]
    # Lower-case and strip accents before matching the keyword
    if " acier" in unidecode.unidecode(article_text.lower()):
        # Path component 3 is the first wildcard directory of the glob pattern;
        # it is used both as the row key and as the raw publication date.
        issue_dir = article.split("/")[3]
        # NOTE(review): two matching files under the same directory share this key,
        # so the later one overwrites the earlier one — confirm this is intended.
        coal_articles.loc["figaro_" + issue_dir] = {"publication_date": issue_dir, "content": article_text}

In [None]:
# Number of rows kept by the keyword filter divided by the number of files scanned
len(coal_articles.index) / len(every_article)

In [None]:
# Insert "-" after the 4th and 6th character of publication_date
# (presumably turning YYYYMMDD into YYYY-MM-DD — confirm against the directory names).
# Vectorised .str slicing replaces the row-wise apply, which hands back a DataFrame
# instead of a Series when the frame is empty and then breaks the column assignment.
# Existing dashes are stripped first so that re-running this cell gives the same result.
date_digits = coal_articles["publication_date"].str.replace("-", "", regex=False)
coal_articles["publication_date"] = date_digits.str[:4] + "-" + date_digits.str[4:6] + "-" + date_digits.str[6:]
coal_articles.to_csv("steel_articles_fr.csv")

### El Imparcial

In [None]:
# Collect every El Imparcial text file and keep those mentioning "telegrafo".
every_article = glob.glob('data/el_imparcial/*.txt')
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
for article in tqdm(every_article):
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding
    with open(article, 'r', encoding="utf-8") as f:
        # readlines() keeps each line's trailing newline, so this join leaves a blank line
        # between the original lines; kept as-is so the saved content does not change.
        data = "\n".join(f.readlines())
    # Lower-case and strip accents before matching the keyword
    if "telegrafo" in unidecode.unidecode(data.lower()):
        # The part of the file name before the first "_" carries the date digits.
        # basename() avoids depending on how many directories precede the file name.
        date_token = os.path.basename(article).split("_")[0]
        pub_date = date_token[:4] + "-" + date_token[4:6] + "-" + date_token[6:8]
        # NOTE(review): files sharing the same date token share this key, so the later
        # one overwrites the earlier one — confirm this is intended.
        coal_articles.loc["imparcial_" + date_token] = {"publication_date": pub_date, "content": data}

In [None]:
# Number of rows kept by the keyword filter divided by the number of files scanned
len(coal_articles.index) / len(every_article)

In [None]:
# Write the current coal_articles frame to a CSV file in the working directory
coal_articles.to_csv("telegraph_articles_es.csv")

### Neue Hamburger Zeitung

In [None]:
def get_whole_text(file_name, namespace="http://www.loc.gov/standards/alto/ns-v2#"):
    """Return the text of an ALTO XML document, one paragraph per TextBlock.

    Parameters
    ----------
    file_name : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    namespace : str, optional
        XML namespace of the ``TextBlock`` and ``String`` elements. Defaults to
        the ALTO v2 namespace; pass another one for files written against a
        different schema version.

    Returns
    -------
    str
        For every ``TextBlock``, the ``CONTENT`` attributes of its ``String``
        descendants joined by single spaces and followed by a blank line
        (``"\\n\\n"``). Empty string when no ``TextBlock`` is found.

    Raises
    ------
    KeyError
        If a ``String`` element has no ``CONTENT`` attribute.
    """
    root = ET.parse(file_name).getroot()
    block_path = ".//{%s}TextBlock" % namespace
    string_path = ".//{%s}String" % namespace
    # Collect the paragraphs in a list and join once instead of growing a string
    paragraphs = []
    for textblock in root.findall(block_path):
        words = [s.attrib["CONTENT"] for s in textblock.findall(string_path)]
        paragraphs.append(" ".join(words) + "\n\n")
    return "".join(paragraphs)

In [None]:
# TODO do this with the 3 hamburgers
# XML files below the PPN689063377 folder, in sorted path order
xml_pattern = "../Institution (2)/Hamburg/PPN689063377/*/*.xml"
all_files = glob.glob(xml_pattern)
all_files.sort()
# One independent, empty frame per keyword, all with the same two columns
frame_columns = ["publication_date", "content"]
coal_articles, elec_articles, telegraph_articles, steel_articles = (
    pd.DataFrame(columns=frame_columns) for _ in range(4)
)

In [None]:
# Keyword -> frame that accumulates the matching text blocks for that keyword.
# A single table replaces four copy-pasted branches; every keyword is still tested
# independently, so one block can be added to several frames.
keyword_frames = {
    "Kohle": coal_articles,
    "Elektrizität": elec_articles,
    "Stahl": steel_articles,
    "Telegraph": telegraph_articles,
}

for file in tqdm(all_files):
    # The third "_"-separated token of the path starts with the date digits
    date_token = file.split("_")[2]
    pub_date = date_token[:4] + "-" + date_token[4:6] + "-" + date_token[6:8]
    row_key = "hamburg_" + pub_date
    text = get_whole_text(file)
    for block in text.split("\n"):
        for keyword, frame in keyword_frames.items():
            if keyword not in block:
                continue
            # Explicit membership test instead of a bare `except:` so that genuine
            # errors (and KeyboardInterrupt) are no longer silently swallowed.
            if row_key in frame.index:
                # NOTE(review): blocks for the same date are appended without any
                # separator, exactly as before — confirm this is intended.
                content = frame.at[row_key, "content"] + block
            else:
                content = block
            frame.loc[row_key] = {"publication_date": pub_date, "content": content}

In [None]:
# Number of rows collected in coal_articles
len(coal_articles)

In [None]:
# NOTE(review): `every_german` is not defined in the visible part of this notebook —
# the concatenation below is commented out and data0/data1/data2 are not defined here
# either — so this cell fails on a fresh kernel until the input frame is recreated.
#every_german = pd.concat([data0, data1, data2], ignore_index=True, axis=0)
# Group id = text of the first column up to the first "/".
# .iloc makes the positional access explicit; `row[0]` on a label-indexed row relies on
# the deprecated positional fallback of Series.__getitem__.
every_german["id"] = every_german.iloc[:, 0].map(lambda value: value.split("/")[0])
# Merge all rows sharing an id into one newline-separated text
every_german = every_german.groupby('id')["content"].agg(lambda x: "\n".join(x.tolist()))
every_german = pd.DataFrame(every_german)
# The publication date is the second "_"-separated token of the id
every_german["publication_date"] = [group_id.split("_")[1] for group_id in every_german.index]

In [None]:
# Write the every_german frame to a CSV file in the working directory
every_german.to_csv("coal_articles_de.csv")

### La Stampa

In [None]:
# List every entry of the out_fernandez folder (glob order, not sorted).
# Note: this rebinds all_files, which the Hamburg cells above also use.
all_files = glob.glob("../data/Italian newspaper/out_fernandez/*")

In [None]:
def get_kw_lines(lines, kw):
    """Return the sentences of ``lines`` that contain ``kw``, plus their neighbours.

    ``lines`` is split on ``"."``. Every piece containing ``kw`` is kept together
    with the piece directly before and after it (clamped to the text bounds).
    Each piece is emitted once, in its original order, joined by single spaces.

    Parameters
    ----------
    lines : str
        Text to search.
    kw : str
        Keyword to look for (plain, case-sensitive substring match).

    Returns
    -------
    str
        The selected sentences joined by ``" "``; empty string when nothing matches.
    """
    sentences = lines.split(".")
    last_idx = len(sentences) - 1
    selected = set()
    for idx, sentence in enumerate(sentences):
        if kw in sentence:
            selected.update((max(0, idx - 1), idx, min(idx + 1, last_idx)))
    # Set iteration order is not guaranteed to be ascending, so sort the indices
    # to keep the sentences in reading order.
    return " ".join(sentences[i] for i in sorted(selected))

In [None]:
# Collect, for every line of every file, the sentences around the keyword.
coal_articles = pd.DataFrame(columns=["publication_date", "content"])
kw = "elettricità"
for file in tqdm(all_files):
    # The keyword contains a non-ASCII character, so decode the text explicitly as
    # UTF-8 instead of relying on the platform default encoding.
    with open(file, encoding="utf-8") as f:
        # Whatever follows the last "_" of the path is stored as the publication date
        year = file.split("_")[-1]
        # Iterate over the file directly; an empty file simply yields no lines
        for i, l in enumerate(f):
            content = get_kw_lines(l, kw)
            if content != "":
                # The key combines the path suffix and the line number within the file
                coal_articles.loc["stampa_" + str(year) + "_" + str(i)] = {"publication_date" : year, "content": content}

In [None]:
# Write the current coal_articles frame to a CSV file under ../data
coal_articles.to_csv("../data/elec_articles_it.csv")