# Import libraries

In [None]:
import os
import shutil
import random
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# https://www.thepythoncode.com/article/extract-pdf-images-in-python
import fitz # PyMuPDF
import io
from PIL import Image

# https://github.com/joke2k/faker
from faker import Faker
fake = Faker()

import csv
import subprocess

# Load Bik data

In [None]:
# Show all columns whenever a dataframe is displayed
pd.set_option('display.max_columns', None)

# Read the Bik dataset (tab-separated) into a dataframe
# Bik_v2 is the version with First Author Affiliation completely filled
bik_df = pd.read_csv('Bik_v2.tsv', sep='\t', encoding='unicode-escape')

# Clean data

In [None]:
# Column 0 holds saved row indices (pandas already supplies its own) and
# column 27 is a duplicate university-name column; remove both by position.

redundant_columns = bik_df.columns[[0, 27]]
bik_df = bik_df.drop(columns=redundant_columns)

In [None]:
# Remove special characters from Affiliations
# Easier for LaTeX

# Exact-match lookup table: affiliation as it appears in the data -> cleaned name
aff_replacements = {
    "UniversitÃ\xa0 degli Studi di Bari Aldo Moro": "University of Bari Aldo Moro",
    "Blood transfusion centre of Slovenia - Zavod Republike Slovenije za transfuzijsko medicino": "Blood Transfusion Centre of Slovenia",
    "Justus-Liebig-UniversitÃ¤t GieÃ\x9fen": "Justus Liebig University Giessen",
    "University of Wisconsinâ\x80\x93Madison": "University of Wisconsin-Madison",
    "University of Iowa, Iowa City, USA": "University of Iowa",
    "Universidad PolitÃ©cnica de Valencia-C.S.I.C": "Technical University of Valencia",
    "Concordia Universityâ\x80\x93Ann Arbor": "Concordia University Ann Arbor",
    "University of Maryland, College Park": "University of Maryland",
    "UniversitÃ¤tsklinikum Erlangen": "University Hospital Erlangen",
    "Universidad AndrÃ©s Bello": "Andres Bello National University",
    "Hospital Son Dureta and Instituto Universitario de InvestigaciÃ³n en Ciencias de la Salud": "Hospital Son Dureta and Instituto Universitario de Investigacion en Ciencias de la Salud",
    "Department of OncologyFaculty of MedicineMcGill UniversityLady Davis Institute-Segal Cancer Center from the Jewish General HospitalMontrealQuebecCanada": "McGill University",
    "Cancer Research LaboratoryFudan University Shanghai Cancer CenterShanghai China": "Fudan University",
    "Department of Biochemistry and Molecular BiophysicsBiochemistry and Molecular and Cellular Biology Graduate ProgramUniversity of ArizonaTucson AZ USA": "University of Arizona",
    "Department of Experimental and Clinical MedicineUniversitÃ\xa0 Politecnica delle MarcheVia Tronto 10/aAncona60020 Italy": "Marche Polytechnic University Faculty of Medicine",
    "UniversitÃ© de Strasbourg-CNRS": "University of Strasbourg",
    "CNRS UniversitÃ© de Strasbourg": "University of Strasbourg",
    "Divisiont of Urology Kobe University Graduate School of Medicine Kobe Hyogo 650-0017 Japan": "Kobe University",
    "School of Human Science and Environment University of Hyogo Himeji Hyogo 670-0092 Japan": "University of Hyogo",
    "Johns Hopkins University School of Medicine and the Sidney Kimmel Comprehensive Cancer Center at Johns Hopkins": "Johns Hopkins University",
    "UniversitÃ© Paris-Saclay": "University of Paris-Saclay",
    "INFN - Istituto Nazionale di Fisica Nuclear": "Istituto Nazionale di Fisica Nuclear",
    'UniversitÃ\xa0 degli Studi della Campania "Luigi Vanvitelli': "University of Campania Luigi Vanvitelli",
    "UniversitÃ© CÃ´te d'Azur": "Cote d'Azur University",
    "Institut de CancÃ©rologie Gustave Roussy": "Gustave Roussy Institute of Cancerology",
    "Graduate School at Shenzhen, Tsinghua Univeristy": "Tsinghua University",
    "Ecole Normale SupÃ©rieure de Paris": "Ecole Normale Superieure, Paris",
}

# Trim surrounding whitespace from string entries; every non-string entry becomes NaN
fa_aff = [
    raw.strip() if isinstance(raw, str) else np.nan
    for raw in bik_df["First Author Affiliation"].tolist()
]

# Substitute the cleaned name where the table has one; leave everything else as is
fa_aff = [aff_replacements.get(aff, aff) for aff in fa_aff]

In [None]:
# Swap the original affiliation column for the cleaned list, placed at position 22
aff_column = "First Author Affiliation"
bik_df = bik_df.drop(aff_column, axis=1)
bik_df.insert(loc=22, column=aff_column, value=fa_aff)

In [None]:
# bik_df

In [None]:
# Deliberately raise so that execution halts at this point
# (presumably to keep a full notebook run from continuing into the cells below).
raise RuntimeError("Force error here.")

# Download PDFs

- **USC VPN required**: `Wiley` [119:121], `Cancer` [135:143], `SciDirect` [144:177], `Science` [178:180], `Nature` [181:187]

In [None]:
# Initialize dictionary of Elsevier IDs to use later for file renaming
elsevierID = {}


def _resolve_elsevier_pdf_link(url, indx):
    """Follow an Elsevier redirect in Chrome and return the PDF download link.

    The landing URL (query string removed) is used to build the ``/pdfft``
    download link, and its last path segment is stored in the module-level
    ``elsevierID`` dictionary under ``indx`` for later file renaming.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(url)
        time.sleep(5) # Wait while URL redirects
        driver.switch_to.window(driver.window_handles[-1])
        landing_url = driver.current_url.split("?")[0]
    finally:
        # The browser is only needed to resolve the redirect; close it so windows do not pile up
        driver.quit()
    elsevierID[indx] = str(landing_url.split("/")[-1])
    return landing_url + "/pdfft?isDTMRedir=true&download=true"


def _scrape_cell_pdf_link(url):
    """Load an article page in Chrome and return its cell.com PDF link."""
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Page source has been captured; the browser is no longer needed
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")
    # Choose PDF viewing option: use the second anchor when the first one's href is "#"
    pdf_options = soup.find("li", class_="article-tools__item article-tools__pdf").find_all("a")
    if pdf_options[0]["href"] == "#":
        url_path = pdf_options[1]["href"]
    else:
        url_path = pdf_options[0]["href"]
    # Combine URL domain and scraped path
    return "https://www.cell.com" + url_path


def _start_download_driver(download_dir):
    """Start Chrome set up to save PDFs into ``download_dir`` without prompting.

    Also applies the automation-hiding switches the notebook uses to get past
    Cloudflare detection:
    https://blog.m157q.tw/posts/2020/09/11/bypass-cloudflare-detection-while-using-selenium-with-chromedriver
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Download PDF with Selenium: https://stackoverflow.com/questions/43149534/selenium-webdriver-how-to-download-a-pdf-file-with-python
    options.add_experimental_option('prefs', {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "plugins.always_open_pdf_externally": True
    })
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


# Download PDFs
def download_pdf(df,url,indx):
    """Download the PDF of one paper into the PDFS folder.

    The download link is built from the row's "Home Site" and "DOI" (and, for
    Hindawi, "Citation"). PubMed, Cancer and SciDirect rows additionally load
    the article page at ``url``. PubMed rows get their "DOI" cell overwritten
    with the DOI scraped from the page, and Elsevier links record their
    article ID in ``elsevierID``.

    Sites fetched with ``requests`` are written to "PDFS/{indx}.pdf". Sites
    fetched through Chrome are saved into the same folder by the browser
    (the notebook renames those files in later cells).

    Args:
        df: Dataframe with "Home Site", "DOI" and "Citation" columns.
        url: URL of the paper's web page.
        indx: Row label of the paper in ``df``; also used as the file name.

    Returns:
        None. Failures during the download step are printed, not raised.
    """

    # Create PDFS folder to save PDFs to
    folder_location = r'PDFS'
    os.makedirs(folder_location, exist_ok=True)

    site = df["Home Site"][indx]
    doi = df["DOI"][indx]

    # Get PDF download link
    if site == "PLOS":
        pdf_link = "https://journals.plos.org/plosone/article/file?id=" + doi + "&type=printable"
    elif site == "ASM":
        pdf_link = "https://journals.asm.org/doi/pdf/" + doi
    elif site == "PubMed":
        # Scrape page
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        # Get DOI
        doi = soup.find("a",{"data-ga-action":"DOI"}).text.replace("\n","").strip()
        # Replace PMID with DOI in dataframe
        df.at[indx,'DOI'] = doi
        # PDF download link format for ASM Journals
        pdf_link = "https://journals.asm.org/doi/pdf/" + doi
    elif site == "Hindawi":
        # Citations starting with "Bio" use the bmri journal path; the rest ("Evi") use ecam
        journal = "bmri" if df["Citation"][indx][:3] == "Bio" else "ecam"
        pdf_link = "https://downloads.hindawi.com/journals/" + journal + "/" + "/".join(doi.split("/")[-2:]) + ".pdf"
    elif site == "Wiley":
        if doi[3:7] == "1002":
            pdf_link = "https://onlinelibrary.wiley.com/doi/pdf/" + doi
        else: # == "1111"
            pdf_link = "https://sfamjournals.onlinelibrary.wiley.com/doi/pdf/" + doi
    elif site == "BioMed":
        if doi[8:10] == "gb":
            pdf_link = "https://genomebiology.biomedcentral.com/track/pdf/" + doi + ".pdf"
        elif doi[8:10] == "bc":
            pdf_link = "https://breast-cancer-research.biomedcentral.com/track/pdf/" + doi + ".pdf"
        else: # == "14"
            pdf_link = "https://bmcmicrobiol.biomedcentral.com/track/pdf/" + doi + ".pdf"
    elif site == "Cancer":
        if doi[8:12] == "j.cc":
            pdf_link = _scrape_cell_pdf_link(url)
        else: # == S016 or j.lu
            pdf_link = _resolve_elsevier_pdf_link(url, indx)
    elif site == "SciDirect":
        pdf_link = _resolve_elsevier_pdf_link(url, indx)
    elif site == "Science":
        pdf_link = "https://www.science.org/doi/pdf/" + doi
    elif site == "Nature":
        if doi[8:11] == "onc":
            pdf_link = "https://www.nature.com/articles/" + doi.split("/")[1].replace(".","") + ".pdf"
        else:
            pdf_link = "https://www.nature.com/articles/" + doi.split("/")[1] + ".pdf"
    elif site == "RUPress":
        # NOTE(review): the path segment "199/3/481/1357646" is fixed, so this
        # presumably only resolves for one specific article - confirm.
        pdf_link = "https://rupress.org/jcb/article-pdf/199/3/481/1357646/" + doi.split("/")[1].replace(".","_") + ".pdf"
    elif site == "Spandidos":
        pdf_link = "https://www.spandidos-publications.com/" + doi + "/download"
    elif site == "PNAS":
        pdf_link = "https://www.pnas.org/doi/pdf/" + doi
    else:
        pdf_link = str(indx) + ": ERROR - Link not recognized."

    # Feedback
    print(pdf_link)

    # File name: Bik index
    filename = folder_location + "/" + str(indx) + ".pdf"

    # Chrome needs an absolute download directory; derive it from the folder created above
    download_dir = os.path.abspath(folder_location)

    # Download PDF from link
    # https://stackoverflow.com/questions/54616638/download-all-pdf-files-from-a-website-using-python
    try:
        # Not blocked by Cloudflare
        if site in ["PLOS","Hindawi","BioMed","Cancer","Nature","Spandidos"]:
            # Fetch first so a failed request does not leave an empty file behind
            content = requests.get(pdf_link).content
            with open(filename, 'wb') as f:
                f.write(content)
        # Bypass Cloudflare
        # NOTE(review): "Cancer" is also in the list above, so it never reaches this branch.
        elif site in ["ASM","PubMed","Wiley","Cancer","SciDirect","Science","RUPress","PNAS"]:
            # The browser is left open on purpose: the download may still be running after the wait
            driver = _start_download_driver(download_dir)
            driver.get(pdf_link)
            time.sleep(5)
        # Bypass Cloudflare, click to confirm download
        # NOTE(review): unreachable as written because "Wiley" is matched by the
        # list above; kept unchanged until the intended Wiley flow is confirmed.
        elif site == "Wiley":
            driver = _start_download_driver(download_dir)
            driver.get(pdf_link)
            time.sleep(5)
            # Click button in iframe to confirm download: https://www.selenium.dev/documentation/webdriver/browser/frames
            driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR,'body.pb-ui').find_element(By.TAG_NAME,'iframe'))
            driver.find_element(By.XPATH,'''/html/body/div/div/a/button''').click()
            time.sleep(5)
    except Exception as err:
        print("------------------------------")
        print(f"ERROR: Index - {indx}, Home Site - {site}, PDF link - {pdf_link}")
        print(f"REASON: {err!r}")
        print("------------------------------")

In [None]:
# Walk through every paper and print its details; the download call is disabled
for index, row in bik_df.iterrows():
    print("INDEX: ", index)
    print(row["Home Site"], ", ", row["DOI"])
    for field in ("Title", "URL"):
        print(row[field])
    # download_pdf(bik_df,row["URL"],index) # PDFs already downloaded; uncomment to redownload

## Rename PDFs downloaded with Selenium &rarr; "{Bik index}.pdf"

In [None]:
# Hard-coded copy of the dictionary that download_pdf builds, saved for
# convenience so the Elsevier files can be renamed without re-downloading.
# Maps Bik index (key) to Elsevier article ID (value).
elsevierID = {135: 'S0169500201002124',
 136: 'S0169500203002393',
 137: 'S016950020600287X',
 138: 'S0169500206003308',
 139: 'S0169500208003085',
 140: 'S0169500209003687',
 141: 'S0169500209005418',
 142: 'S0169500211000481',
 143: 'S0169500211005150',
 144: 'S089684110500123X',
 145: 'S0896841104001234',
 146: 'S0896841106000357',
 147: 'S0896841107000182',
 148: 'S089684110800139X',
 149: 'S089684111400064X',
 150: 'S1043466603004423',
 151: 'S1043466605000323',
 152: 'S1043466605000128',
 153: 'S104346660600216X',
 154: 'S1043466607000038',
 155: 'S1043466606003450',
 156: 'S1043466606000020',
 157: 'S1043466607000828',
 158: 'S1043466608001853',
 159: 'S1043466608001816',
 160: 'S1043466608000021',
 161: 'S1043466608000409',
 162: 'S1043466609001896',
 163: 'S1043466609000271',
 164: 'S1043466609000325',
 165: 'S1043466609000179',
 166: 'S1043466609008813',
 167: 'S1043466611001839',
 168: 'S1043466611006806',
 169: 'S1043466611000573',
 170: 'S1043466610006897',
 171: 'S1043466611001827',
 172: 'S1043466612001597',
 173: 'S1043466612006667',
 174: 'S1043466612002177',
 175: 'S1043466613001749',
 176: 'S1043466613001531',
 177: 'S1043466614004748'}

In [None]:
# Rename ASM, PubMed, Science, RUPress, PNAS papers
# A downloaded file is matched to a paper when its name (minus the first 4
# characters and the ".pdf" suffix) equals the paper's DOI minus its first 12
# characters. The file is renamed to the paper's position in the DOI column,
# which is assumed to equal its Bik index (default integer index) - verify.
# SET FILE PATH
pdf_dir = '/Users/auderoy/dsci550/Assignment 2/PDFS'

for f in os.listdir(pdf_dir):
    doi_tail = f[4:-4]
    if not doi_tail:
        # Short names such as "12.pdf" have an empty tail, which would match
        # any DOI of 12 characters or fewer and overwrite a renamed file
        continue
    for count, c in enumerate(bik_df["DOI"]):
        # Skip missing (non-string) DOIs instead of failing on the slice
        if isinstance(c, str) and c[12:] == doi_tail:
            shutil.move(os.path.join(pdf_dir, f), os.path.join(pdf_dir, f"{count}.pdf"))
            # The file has been moved; stop so it is not looked up again
            break

In [None]:
# Rename Elsevier (Cancer, SciDirect) papers
# A file is matched when the third "-"-separated part of its name contains an
# Elsevier ID from elsevierID; it is renamed to the matching Bik index.
# SET FILE PATH
pdf_dir = '/Users/auderoy/dsci550/Assignment 2/PDFS'

for f in os.listdir(pdf_dir):
    name_parts = f.split("-")
    if len(name_parts) < 3:
        # No third part to compare against (e.g. already renamed files)
        continue
    for key, value in elsevierID.items():
        if value in name_parts[2]:
            try:
                shutil.move(os.path.join(pdf_dir, f), os.path.join(pdf_dir, f"{key}.pdf"))
            except OSError as err:
                # Report the failed rename instead of hiding it
                print(f"Could not rename {f} to {key}.pdf: {err}")
            # The file has been handled; stop searching
            break
            
            

In [None]:
# Rename Wiley papers
# Manual: 111, 114, 116, 119
# A file is matched when the first 15 characters of the fourth "-"-separated
# part of its name (or of the whole name when there is no fourth part) equal
# the first 15 characters of a paper title. The file is renamed to the title's
# position in the Title column, assumed to equal its Bik index - verify.
# SET FILE PATH
pdf_dir = '/Users/auderoy/dsci550/Assignment 2/PDFS'

for f in os.listdir(pdf_dir):
    name_parts = f.split("-")
    filename = name_parts[3].strip() if len(name_parts) > 3 else f
    for count, c in enumerate(bik_df["Title"]):
        # Skip missing (non-string) titles instead of failing on the slice
        if isinstance(c, str) and c[:15] == filename[:15]:
            shutil.move(os.path.join(pdf_dir, f), os.path.join(pdf_dir, f"{count}.pdf"))
            # The file has been moved; stop so it is not looked up again
            break

# Extract images from PDFs

- [The Python Code: Extract PDF images in Python](https://www.thepythoncode.com/article/extract-pdf-images-in-python)
- Different [organizational] methods for image generation tests

In [None]:
# Save images to individual folders

def extract_images(df,file,indx):
    """Extract the images embedded in a PDF into a folder named after the paper.

    Images are written to
    "images/extracted_images/{title}-images/image{page}_{n}.{ext}", where the
    title comes from ``df["Title"][indx]`` (first 240 characters, "/" replaced
    by "-"), ``page`` is 1-based, ``n`` is the 1-based position of the image
    on its page and ``ext`` is the extension reported by PyMuPDF.

    Args:
        df: Dataframe with a "Title" column.
        file: Path of the PDF to read.
        indx: Row label of the paper in ``df``.

    Returns:
        None. Images that cannot be extracted or saved are reported and skipped.
    """

    print("------------------------------")
    print("INDEX: ", indx)

    # Open file (closed automatically when the block ends)
    with fitz.open(file) as pdf_file:

        # Create folder for images, including any missing parent folders
        folder_location = "images/extracted_images/" + df["Title"][indx][:240].replace("/","-") + "-images"
        os.makedirs(folder_location, exist_ok=True)

        # Iterate over PDF pages
        for page_index in range(len(pdf_file)):
            # Get page
            page = pdf_file[page_index]
            image_list = page.get_images()
            # Print number of images found on page
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                try:
                    # Get image XREF
                    xref = img[0]
                    # Extract image bytes
                    base_image = pdf_file.extract_image(xref)
                    image_bytes = base_image["image"]
                    # Get image extension
                    image_ext = base_image["ext"]
                    # Load to PIL
                    image = Image.open(io.BytesIO(image_bytes))
                    # Save to local disk; passing the path lets PIL open and close the file itself
                    image.save(f"{folder_location}/image{page_index+1}_{image_index}.{image_ext}")
                except Exception as err:
                    # Best effort: report the image that failed and move on to the next one
                    print(f"[!] Skipped image {image_index} on page {page_index}: {err}")
                    continue

In [None]:
# Call extract_images for each PDF
# PDFs named by index
for i in range(0,214):
    pdf_file = f"PDFS/{i}.pdf"
    try:
        extract_images(bik_df,pdf_file,i)
    except Exception as err:
        # Keep going when a PDF is missing or unreadable, but say which one;
        # unlike a bare except, this still lets Ctrl-C interrupt the loop
        print(f"[!] Skipping {pdf_file}: {err}")
        continue

In [None]:
# Save images to one folder

def extract_images_new(df,file,indx):
    """Extract the images embedded in a PDF into one shared folder.

    Images are written to
    "images/extracted_images_new/image{indx}_{page}_{n}.{ext}", where ``page``
    is 1-based, ``n`` is the 1-based position of the image on its page and
    ``ext`` is the extension reported by PyMuPDF.

    Args:
        df: Unused; kept so the signature matches the other extractors.
        file: Path of the PDF to read.
        indx: Index of the paper, used as the file name prefix.

    Returns:
        None. Images that cannot be extracted or saved are reported and skipped.
    """

    print("------------------------------")
    print("INDEX: ", indx)

    # Open file (closed automatically when the block ends)
    with fitz.open(file) as pdf_file:

        # Create folder for images, including any missing parent folders
        folder_location = "images/extracted_images_new"
        os.makedirs(folder_location, exist_ok=True)

        # Iterate over PDF pages
        for page_index in range(len(pdf_file)):
            # Get page
            page = pdf_file[page_index]
            image_list = page.get_images()
            # Print number of images found on page
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                try:
                    # Get image XREF
                    xref = img[0]
                    # Extract image bytes
                    base_image = pdf_file.extract_image(xref)
                    image_bytes = base_image["image"]
                    # Get image extension
                    image_ext = base_image["ext"]
                    # Load to PIL
                    image = Image.open(io.BytesIO(image_bytes))
                    # Save to local disk; passing the path lets PIL open and close the file itself
                    image.save(f"{folder_location}/image{indx}_{page_index+1}_{image_index}.{image_ext}")
                except Exception as err:
                    # Best effort: report the image that failed and move on to the next one
                    print(f"[!] Skipped image {image_index} on page {page_index}: {err}")
                    continue

In [None]:
# Call extract_images_new for each PDF
# PDFs named by index
for i in range(0,214):
    pdf_file = f"PDFS/{i}.pdf"
    try:
        extract_images_new(bik_df,pdf_file,i)
    except Exception as err:
        # Keep going when a PDF is missing or unreadable, but say which one;
        # unlike a bare except, this still lets Ctrl-C interrupt the loop
        print(f"[!] Skipping {pdf_file}: {err}")
        continue

In [None]:
# Save images to individual folders, always with a .jpg extension

def extract_images_jpg(df,file,indx):
    """Extract the images embedded in a PDF as JPEG files in a per-paper folder.

    Images are written to
    "images/extracted_images_jpg/{title}-images/image{page}_{n}.jpg", where
    the title comes from ``df["Title"][indx]`` (first 240 characters, "/"
    replaced by "-"), ``page`` is 1-based and ``n`` is the 1-based position of
    the image on its page.

    Args:
        df: Dataframe with a "Title" column.
        file: Path of the PDF to read.
        indx: Row label of the paper in ``df``.

    Returns:
        None. Images that cannot be extracted or saved are reported and skipped.
    """

    print("------------------------------")
    print("INDEX: ", indx)

    # Modes Pillow's JPEG writer can store directly; anything else is converted to RGB
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

    # Open file (closed automatically when the block ends)
    with fitz.open(file) as pdf_file:

        # Create folder for images, including any missing parent folders
        folder_location = "images/extracted_images_jpg/" + df["Title"][indx][:240].replace("/","-") + "-images"
        os.makedirs(folder_location, exist_ok=True)

        # Iterate over PDF pages
        for page_index in range(len(pdf_file)):
            # Get page
            page = pdf_file[page_index]
            image_list = page.get_images()
            # Print number of images found on page
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
            else:
                print("[!] No images found on page", page_index)
            for image_index, img in enumerate(image_list, start=1):
                try:
                    # Get image XREF
                    xref = img[0]
                    # Extract image bytes
                    base_image = pdf_file.extract_image(xref)
                    image_bytes = base_image["image"]
                    # Load to PIL
                    image = Image.open(io.BytesIO(image_bytes))
                    # Images with e.g. transparency or a palette cannot be written as JPEG as-is
                    if image.mode not in jpeg_modes:
                        image = image.convert("RGB")
                    # Save to local disk; passing the path lets PIL open and close the file itself
                    image.save(f"{folder_location}/image{page_index+1}_{image_index}.jpg")
                except Exception as err:
                    # Best effort: report the image that failed and move on to the next one
                    print(f"[!] Skipped image {image_index} on page {page_index}: {err}")
                    continue

In [None]:
# Call extract_images_jpg for each PDF
# PDFs named by index
for i in range(0,214):
    pdf_file = f"PDFS/{i}.pdf"
    try:
        extract_images_jpg(bik_df,pdf_file,i)
    except Exception as err:
        # Keep going when a PDF is missing or unreadable, but say which one;
        # unlike a bare except, this still lets Ctrl-C interrupt the loop
        print(f"[!] Skipping {pdf_file}: {err}")
        continue

# Grover model

- [GitHub: Grover](https://github.com/rowanz/grover) ([demo](https://rowanzellers.com/grover/))

# Generate fake text

# Generate fake images

- [GitHub: Face Generator](https://github.com/gsurma/face_generator)

In [None]:
# Export image file names to extracted_images.txt for FaceGeneratorDCGAN.ipynb
# Each line of the output is "{folder}/{image}", relative to images/extracted_images

imagepaths = []

for folder in os.listdir('images/extracted_images'):
    try:
        images = os.listdir(f'images/extracted_images/{folder}')
    except OSError:
        # Entry is not a readable directory (e.g. a stray file); skip it
        continue
    for image in images:
        imagepaths.append(f"{folder}/{image}")

with open('images/extracted_images.txt', 'w') as f:
    f.writelines(f"{imagepath}\n" for imagepath in imagepaths)

# Generate fake features

## Author names

In [None]:
# Author strings of every Bik paper, one entry per paper
authors214 = bik_df["Authors"].tolist()

# Number of authors per paper, counted as comma-separated pieces of each string
authors_per_paper = [len(names.split(",")) for names in authors214]
# authors_per_paper

In [None]:
# Generate list of author names (string type)
def generate_authors():
    """Return a ", "-separated string of fake author names.

    The number of names is drawn at random from ``authors_per_paper``, so the
    author counts follow the distribution observed in the Bik papers. Names
    come from ``fake.name()``. An author count of 0 yields an empty string.
    """
    # for i in range(0,random.randint(3,9)): # Most papers have 3-9 authors
    author_count = random.choice(authors_per_paper) # Weighted random
    return ", ".join(fake.name() for _ in range(author_count))

## Affiliations

In [None]:
# Pool of affiliations to sample from: the Bik affiliations that are strings
# (the empty entries are floats and are left out)
affs214 = bik_df["First Author Affiliation"].tolist()
sample_aff = list(filter(lambda aff: isinstance(aff, str), affs214))

# Generate affiliation
def generate_aff():
    """Return one affiliation picked at random from ``sample_aff``."""
    chosen = random.choice(sample_aff)
    return chosen

## Other

In [None]:
def weighted_random(df,col_name):
    """Return one value picked at random from column ``col_name`` of ``df``.

    Every row is equally likely, so a value that occurs more often in the
    column is proportionally more likely to be returned.
    """
    return random.choice(df[col_name].tolist())

In [None]:
# Year

def generate_year():
    """Return one randomly drawn value from the "Year" column of ``bik_df``."""
    column = "Year"
    return weighted_random(bik_df, column)

In [None]:
# Month

months214 = bik_df["Month"].tolist()
sample_months = [i for i in months214 if i>0] # Remove nan from list (nan > 0 is False)

def generate_month():
    """Return one month number picked at random from ``sample_months``."""
    return random.choice(sample_months)

# Month number -> month name; covers the whole year so any sampled month can be looked up
months_abc = {1:"January",
             2:"February",
             3:"March",
             4:"April",
             5:"May",
             6:"June",
             7:"July",
             8:"August",
             9:"September",
             10:"October",
             11:"November",
             12:"December"}

In [None]:
# 0

def generate_0():
    """Return one randomly drawn value from the "0" column of ``bik_df``."""
    column = "0"
    return weighted_random(bik_df, column)

In [None]:
# 1

def generate_1():
    """Return one randomly drawn value from the "1" column of ``bik_df``."""
    column = "1"
    return weighted_random(bik_df, column)

In [None]:
# 2

def generate_2():
    """Return one randomly drawn value from the "2" column of ``bik_df``."""
    column = "2"
    return weighted_random(bik_df, column)

In [None]:
# 3

def generate_3():
    """Return one randomly drawn value from the "3" column of ``bik_df``."""
    column = "3"
    return weighted_random(bik_df, column)

In [None]:
# Retraction

def generate_retraction():
    """Return one randomly drawn value from the "Retraction" column of ``bik_df``."""
    column = "Retraction"
    return weighted_random(bik_df, column)

In [None]:
# Correction

def generate_correction():
    """Return one randomly drawn value from the "Correction" column of ``bik_df``."""
    column = "Correction"
    return weighted_random(bik_df, column)

In [None]:
# No Action

def generate_noaction():
    """Return one randomly drawn value from the "No Action" column of ``bik_df``."""
    column = "No Action"
    return weighted_random(bik_df, column)

In [None]:
# First Author Career Duration

def generate_duration():
    """Return one randomly drawn value from the "First Author Career Duration" column of ``bik_df``."""
    column = "First Author Career Duration"
    return weighted_random(bik_df, column)

In [None]:
# Home Site

def generate_home_site():
    """Return one randomly drawn value from the "Home Site" column of ``bik_df``."""
    column = "Home Site"
    return weighted_random(bik_df, column)

## Generate

In [None]:
authors = []
affiliations = []
year = []
month = []
month_abc = []
zero = []
one = []
two = []
three = []
retraction = []
correction = []
noaction = []
duration = []
homesite = []
title = []

for i in range(0,500):
    authors.append(generate_authors())
    affiliations.append(generate_aff())
    year.append(generate_year())
    month.append(generate_month())
    month_abc.append(months_abc[int(month[i])])
    zero.append(generate_0())
    one.append(generate_1())
    two.append(generate_2())
    three.append(generate_3())
    retraction.append(generate_retraction())
    correction.append(generate_correction())
    noaction.append(generate_noaction())
    duration.append(generate_duration())
    homesite.append(generate_homesite())

# Make accessible to LaTeX.ipynb
%store authors
%store affiliations
%store year
%store month
%store month_abc

# Generate 500 full fake papers

- [StackExchange: Automatic document generation](https://tex.stackexchange.com/questions/270714/automatic-document-generation-based-on-a-database)
- Will run LaTeX generation in separate notebook for organizational purposes (LaTeX/LaTeX.ipynb)
- Version in this notebook is not updated

In [None]:
# LaTeX document template; %(Ti)s, %(Au)s and %(Bo)s are filled in per paper
LatexContent = '''
\\documentclass{article}
\\usepackage[utf8]{inputenc}
\\usepackage{textalpha}
\\usepackage{newunicodechar}
\\newunicodechar{≥}{\\ensuremath{\\geq}}
\\usepackage{graphicx}
\\graphicspath{{./LaTeX/images/}}

\\title{%(Ti)s}
\\author{%(Au)s}
\\date{2020}

\\begin{document}

\\maketitle

\\includegraphics[width=\\textwidth]{universe \\caption{Testing.}}

%(Bo)s

\\end{document}'''

# Write one .tex file per fake paper and start pdflatex on it
for i in range(0,1):

    ID = str(i + 200)
    # NOTE(review): authors30 is not defined anywhere in this notebook;
    # presumably it was restored from another session - confirm before running.
    Author = authors30[i].replace(",","\\and")
    # Body text: turn literal "\n" sequences into paragraph breaks, escape "$"
    # for LaTeX, and drop the first character and the last two
    with open(f'faketext/{i}.txt', 'r') as text_file:
        Body = text_file.read().replace("\\n","\n\n").replace("$","\\$")[1:-2]
    # The first 25 characters of the body double as the title
    Title = Body[:25]

    TexFileName = "LaTeX/" + ID + '.tex'
    with open(TexFileName,'w') as TexFile:
        TexFile.write(LatexContent %{"Id" : ID, "Ti" : Title, "Au" : Author, "Bo" : Body })

    # Started without waiting, so the PDF may not exist yet when this cell finishes
    subprocess.Popen(['pdflatex', TexFileName],shell=False)

In [None]:
# Double checking that all PDFs are created, no errors

# Tally the entries in LaTeX/ whose names end in "pdf"
count = sum(1 for entry in os.listdir('LaTeX') if entry[-3:] == "pdf")
count

# Update dataframe, export TSV

In [None]:
# bik_df.to_csv('Bik_v2_updated.tsv', sep="\t")

# Generate fake faces